Skip to content

Instantly share code, notes, and snippets.

@aurthurm
Last active June 27, 2025 16:11
Show Gist options
  • Save aurthurm/b7d13d21a7eac9722076255283dd9651 to your computer and use it in GitHub Desktop.
Save aurthurm/b7d13d21a7eac9722076255283dd9651 to your computer and use it in GitHub Desktop.
Breakpoints from WHONET exploration
import pandas as pd
# Load the three tab-separated reference tables used by the rest of the script.
breakpoints = pd.read_csv('Breakpoints.txt', sep='\t', low_memory=False)
antibiotics = pd.read_csv('Antibiotics.txt', sep='\t', low_memory=False)
organisms = pd.read_csv('Organisms.txt', sep='\t', low_memory=False)

# Limit breakpoints to 2024 CLSI MIC/disk entries for humans only.
# .copy() detaches the filtered rows from the full table: new columns are
# assigned to `breakpoints` further down, and assigning into a filtered slice
# is what triggers pandas' SettingWithCopyWarning.
breakpoints = breakpoints[
    breakpoints['TEST_METHOD'].isin(['MIC', 'DISK']) &
    (breakpoints['GUIDELINES'] == 'CLSI') &
    (breakpoints['YEAR'] == 2024) &
    (breakpoints['HOST'] == 'Human')
].copy()
# Map each breakpoint's (code type, code) pair to a group/category name.

# Code types whose category is a fixed label, independent of the code itself.
_FIXED_CATEGORIES = {
    'WHONET_ORG_CODE': 'organism-direct',
    'ALL': 'organisms-all',
    'ANAEROBE': 'Anaerobe',
}

# Hand-maintained names for group codes, keyed by code type and then by code.
# Species code definitions are not in any file: thanks to DeepSeek AI for the
# research to find the mapping names.
_GROUP_CODE_NAMES = {
    'GENUS_GROUP': {
        'NFR': 'Non-Fermenting Rods',
    },
    'SEROVAR_GROUP': {
        'HIN': 'Haemophilus',
        'ECO': 'Enterobacteriaceae',
    },
    'SPECIES_GROUP': {
        'ABX': 'Acinetobacter baumannii complex',
        'MTX': 'Mycobacterium tuberculosis complex',
        'MAX': 'Mycobacterium avium complex',
        'SGM': 'Slowly Growing Mycobacteria',
        'RGM': 'Rapidly Growing Mycobacteria',
        'BCX': 'Burkholderia cepacia complex',
        'SVI': 'Streptococcus (Viridans)',
        'BS-': 'Streptococcus (Beta Hem)',
        'COF': 'Coryneform (Diphtheroids)',
    },
}


def get_org_category(row, organisms_df):
    """Return the category label for one breakpoint row.

    Args:
        row: Mapping-like row providing 'ORGANISM_CODE_TYPE' and
            'ORGANISM_CODE'.
        organisms_df: Organisms table. When the row's code type is one of its
            columns, the name is read from the column named by the code type's
            first underscore-separated token, upper-cased
            (e.g. 'GENUS_CODE' -> 'GENUS').

    Returns:
        One of:
        * 'organism-direct', 'organisms-all' or 'Anaerobe' for the fixed code
          types;
        * the mapped group name, or 'drop-me' for an unmapped code, when the
          code type is GENUS_GROUP, SEROVAR_GROUP or SPECIES_GROUP;
        * the first non-null value of the name column among organisms whose
          code-type column equals the code;
        * 'value-column-not-found' when the derived name column is absent;
        * 'no-match' in every other case.
    """
    code_type = row['ORGANISM_CODE_TYPE']
    code = row['ORGANISM_CODE']

    # Fixed labels and hand-maintained group names need no table lookup.
    if code_type in _FIXED_CATEGORIES:
        return _FIXED_CATEGORIES[code_type]
    if code_type in _GROUP_CODE_NAMES:
        return _GROUP_CODE_NAMES[code_type].get(code, 'drop-me')

    # Remaining code types are resolved through the organisms table.
    if code_type in organisms_df.columns:
        value_column = code_type.split('_')[0].upper()
        if value_column not in organisms_df.columns:
            return "value-column-not-found"

        # Find matching organisms and take the first usable name.
        subset = organisms_df[organisms_df[code_type] == code]
        if not subset.empty:
            non_null_values = subset[value_column].dropna()
            if not non_null_values.empty:
                return non_null_values.iloc[0]

    return "no-match"
# Label every breakpoint row with its organism category (row-wise apply).
breakpoints['CATEGORY'] = breakpoints.apply(
    lambda bp_row: get_org_category(bp_row, organisms),
    axis=1,
)
# Resolve the organism name for breakpoints that point at a single organism.
def get_org_direct(row, organisms_df):
    """Return the organism name for a directly-coded breakpoint row.

    Args:
        row: Mapping-like row providing 'ORGANISM_CODE_TYPE' and
            'ORGANISM_CODE'.
        organisms_df: Organisms table with 'WHONET_ORG_CODE' and 'ORGANISM'
            columns.

    Returns:
        The 'ORGANISM' value of the first organism whose 'WHONET_ORG_CODE'
        equals the row's code when the code type is 'WHONET_ORG_CODE';
        otherwise (other code type, or no matching organism) "Grouping".
    """
    code_type = row['ORGANISM_CODE_TYPE']
    code = row['ORGANISM_CODE']

    # Only direct organism codes are looked up; everything else is a grouping.
    if code_type != 'WHONET_ORG_CODE':
        return "Grouping"

    matches = organisms_df[organisms_df[code_type] == code]
    if matches.empty:
        return "Grouping"
    return matches.iloc[0]['ORGANISM']
# Attach the organism name (or "Grouping") to every breakpoint row.
breakpoints['ORGANISM'] = breakpoints.apply(
    lambda bp_row: get_org_direct(bp_row, organisms),
    axis=1,
)
# Bare expression: only shows the distinct categories in an interactive session.
breakpoints['CATEGORY'].unique()

# Keep only breakpoints whose category is not one of the placeholder labels.
not_found = ['value-column-not-found', 'no-match', 'organisms-all', 'drop-me']
is_unmapped = breakpoints['CATEGORY'].isin(not_found)
good_bp = breakpoints[~is_unmapped]

# Inner-join the remapped breakpoints with the antibiotics table.
dataset = good_bp.merge(antibiotics, on='WHONET_ABX_CODE', how='inner')
# Bare expression: only shows the merged column names in an interactive session.
dataset.columns
# Select only the required columns (the merge-suffixed names are kept as-is here).
def columner(df):
    """Return `df` restricted to the fixed list of output columns, in that order.

    Args:
        df: DataFrame containing at least the columns listed below, including
            the '_x'-suffixed 'GUIDELINES_x' and 'POTENCY_x'.

    Returns:
        A DataFrame with exactly the fourteen selected columns.
    """
    wanted_columns = [
        'GUIDELINES_x', 'YEAR', 'TEST_METHOD', 'HOST',
        'ORGANISM_CODE', 'ORGANISM_CODE_TYPE', 'BREAKPOINT_TYPE',
        'ANTIBIOTIC', 'POTENCY_x',
        'R', 'I', 'S',
        'CATEGORY', 'ORGANISM',
    ]
    return df[wanted_columns]
# Reduce the merged table to the required columns (order fixed by columner).
dataset = columner(dataset)

# Drop the '_x' suffix from the two suffixed columns. columner has already
# selected exactly these columns in this order, so no second selection is needed.
dataset = dataset.rename(columns={
    'GUIDELINES_x': 'GUIDELINES',
    'POTENCY_x': 'POTENCY',
})

# Remove exact duplicate rows.
dataset = dataset.drop_duplicates()
# Split the table by test method.
test_method = dataset['TEST_METHOD']
mic_breakpoints = dataset[test_method == 'MIC']
disk_breakpoints = dataset[test_method == 'DISK']

# Save the combined table and the per-method tables as CSV.
for frame, path in (
    (dataset, "bp_recategorised_all.csv"),
    (mic_breakpoints, "bp_recategorised_mic.csv"),
    (disk_breakpoints, "bp_recategorised_disc.csv"),
):
    frame.to_csv(path, index=False)

# Understand the mappings: count rows per (CATEGORY, ORGANISM, ANTIBIOTIC)
# combination, ordered by those keys, and save the counts.
combo_counts = dataset[['CATEGORY', 'ORGANISM', 'ANTIBIOTIC']].value_counts()
combo_counts = combo_counts.sort_index().reset_index(name='count')
combo_counts.to_csv('bp_all_counts.csv', index=False)
# Collect the AST category mappings created above as a dict:
#   (ORGANISM_CODE_TYPE, ORGANISM_CODE) -> CATEGORY
# The first category seen for a pair wins; 'organism-direct' rows are skipped.
# NOTE(review): only 'organism-direct' is excluded, so placeholder categories
# such as 'no-match' or 'drop-me' are recorded too - confirm that is intended.
mapping_dict = {}
for _, bp_row in breakpoints.iterrows():
    bp_category = bp_row['CATEGORY']
    if bp_category != 'organism-direct':
        mapping_dict.setdefault(
            (bp_row['ORGANISM_CODE_TYPE'], bp_row['ORGANISM_CODE']),
            bp_category,
        )
# For each organism, determine its AST category.
def find_category_for_organism(row, mapper):
    """Return the category of the first mapping entry that matches `row`.

    Args:
        row: Mapping-like organism row (supports `in` and `[]`).
        mapper: Dict of (column name, code) -> category, scanned in its
            iteration order.

    Returns:
        The category of the first entry whose column is present in `row` and
        whose code equals the row's value in that column; None when no entry
        matches.
    """
    for (col, code), category in mapper.items():
        # Entries whose column is absent from the row are skipped.
        if col in row and row[col] == code:
            return category
    return None
# Give every organism the category of its first matching mapping entry (or None).
organisms['CATEGORY'] = organisms.apply(
    lambda org_row: find_category_for_organism(org_row, mapping_dict),
    axis=1,
)
# Phyla treated as Gram-positive by mo_gramstain.
GRAM_POSITIVE_PHYLA = {
    "Actinobacteria", "Actinomycetota",
    "Firmicutes", "Bacillota",
    "Tenericutes", "Mycoplasmatota",
    "Chloroflexi", "Chloroflexota",
}


def mo_gramstain(row):
    """Classify one organism row as "gram+" or "gram-".

    Args:
        row: Mapping-like row providing 'KINGDOM', 'PHYLUM', 'CLASS' and
            'ORGANISM_TYPE'.

    Returns:
        None when KINGDOM is not "Bacteria". Otherwise "gram+" when PHYLUM is
        in GRAM_POSITIVE_PHYLA and CLASS is not "Negativicutes", else "gram-";
        an ORGANISM_TYPE of '+' or '-' overrides that taxonomy-based result.
    """
    if row["KINGDOM"] != "Bacteria":
        return None

    # Taxonomy-based default.
    is_gram_positive = (
        row["PHYLUM"] in GRAM_POSITIVE_PHYLA
        and row["CLASS"] != "Negativicutes"
    )

    # An explicit +/- organism type takes precedence over the taxonomy.
    organism_type = row['ORGANISM_TYPE']
    if organism_type == '+':
        is_gram_positive = True
    elif organism_type == '-':
        is_gram_positive = False

    return "gram+" if is_gram_positive else "gram-"
# Derive the Gram stain label for every organism row.
organisms["GRAM_STAIN"] = organisms.apply(mo_gramstain, axis=1)
# Species-level shape overrides, keyed by lower-cased full organism name.
_SPECIES_SHAPE_OVERRIDES = {
    "neisseria meningitidis": "coccus.diplococci",
    "neisseria gonorrhoeae": "coccus.diplococci",
    "streptococcus pneumoniae": "coccus.diplococci",
    "staphylococcus aureus": "coccus.staphylococci",
    "streptococcus pyogenes": "coccus.streptococci",
    "lactobacillus acidophilus": "rod.bacillus",
    "vibrio cholerae": "spiral.vibrio",
    "helicobacter pylori": "spiral.spirillum",
    "treponema pallidum": "spiral.spirillum",
}

# Genus-level default shapes, keyed by genus name (case-sensitive).
_GENUS_SHAPES = {
    # Cocci
    "Staphylococcus": "coccus.staphylococci",
    "Streptococcus": "coccus.streptococci",
    "Neisseria": "coccus.diplococci",
    "Sarcina": "coccus.sarcina",
    "Micrococcus": "coccus.tetrad",
    "Enterococcus": "coccus.streptococci",
    "Planococcus": "coccus",
    # Rods
    "Escherichia": "rod.bacillus",
    "Klebsiella": "rod.bacillus",
    "Salmonella": "rod.bacillus",
    "Shigella": "rod.bacillus",
    "Bacillus": "rod.bacillus",
    "Lactobacillus": "rod.bacillus",
    "Listeria": "rod.bacillus",
    "Corynebacterium": "rod.bacillus",
    "Yersinia": "rod.coccobacilli",
    "Brucella": "rod.coccobacilli",
    "Haemophilus": "rod.coccobacilli",
    "Acinetobacter": "rod.coccobacilli",
    "Pseudomonas": "rod.bacillus",
    "Mycobacterium": "rod.bacillus",
    "Clostridium": "rod.bacillus",
    "Enterobacter": "rod.bacillus",
    # Spirals
    "Vibrio": "spiral.vibrio",
    "Campylobacter": "spiral.spirillum",
    "Helicobacter": "spiral.spirillum",
    "Spirillum": "spiral.spirillum",
    "Treponema": "spiral.spirillum",
    "Borrelia": "spiral.spirillum",
    "Leptospira": "spiral.spirillum",
}


def infer_morphology(row):
    """Return a shape label for one organism row, or None when unknown.

    Args:
        row: Mapping-like row; 'GENUS' and 'ORGANISM' are read with `.get`, so
            either may be missing.

    Returns:
        The species-level override when the stripped, lower-cased ORGANISM
        name has one; otherwise the genus-level default for the stripped
        GENUS (matched case-sensitively); otherwise None.
    """
    # str() keeps the lookups safe when a value is not a string (e.g. NaN).
    genus = str(row.get("GENUS", "")).strip()
    full_name = str(row.get("ORGANISM", "")).strip().lower()

    # A species-specific override takes precedence over the genus default.
    if full_name in _SPECIES_SHAPE_OVERRIDES:
        return _SPECIES_SHAPE_OVERRIDES[full_name]

    return _GENUS_SHAPES.get(genus, None)  # None if the genus is unknown
# Infer a shape label for every organism row (None where unknown).
shape_by_row = organisms.apply(infer_morphology, axis=1)
organisms["SHAPE"] = shape_by_row
# GLASS-priority organisms, lower-cased full names
# (per the original author: WHO GLASS list, version 2023 - not verified here).
_GLASS_SPECIES = {
    "escherichia coli",
    "klebsiella pneumoniae",
    "acinetobacter baumannii",
    "pseudomonas aeruginosa",
    "salmonella spp",
    "shigella spp",
    "neisseria gonorrhoeae",
    "streptococcus pneumoniae",
    "staphylococcus aureus",
}

# Genera used as a fallback when the full name is not listed (case-sensitive).
_GLASS_GENERA = {
    "Escherichia", "Klebsiella", "Acinetobacter", "Pseudomonas",
    "Salmonella", "Shigella", "Neisseria", "Streptococcus", "Staphylococcus",
}


def is_glass_priority(row):
    """Return 'Yes' or 'No' for whether the organism is on the GLASS priority list.

    Args:
        row: Mapping-like row; 'GENUS' and 'ORGANISM' are read with `.get`, so
            either may be missing.

    Returns:
        'Yes' when the stripped, lower-cased ORGANISM name is a listed species
        or the stripped GENUS is a listed genus; 'No' otherwise.
    """
    genus = str(row.get("GENUS", "")).strip()
    full_name = str(row.get("ORGANISM", "")).strip().lower()

    # Exact species match preferred.
    if full_name in _GLASS_SPECIES:
        return "Yes"
    # Genus-based fallback.
    # NOTE(review): every listed species belongs to a listed genus, so this
    # fallback flags all members of those genera - confirm that is intended.
    if genus in _GLASS_GENERA:
        return "Yes"
    return "No"
# Flag every organism row as GLASS priority ('Yes') or not ('No').
glass_by_row = organisms.apply(is_glass_priority, axis=1)
organisms["GLASS"] = glass_by_row
# Known MROs and their phenotypes, keyed by lower-cased full organism name.
_MRO_SPECIES_PHENOTYPES = {
    "escherichia coli": "ESBL or CRE",
    "klebsiella pneumoniae": "ESBL or CRE",
    "enterobacter cloacae": "ESBL or CRE",
    "acinetobacter baumannii": "MDR/XDR",
    "pseudomonas aeruginosa": "MDR/XDR",
    "staphylococcus aureus": "MRSA",
    "enterococcus faecalis": "VRE",
    "enterococcus faecium": "VRE",
    "streptococcus pneumoniae": "DRSP",
    "neisseria gonorrhoeae": "FQ resistant",
}

# Genus-level fallbacks, keyed by genus name (case-sensitive).
_MRO_GENUS_PHENOTYPES = {
    "Klebsiella": "ESBL or CRE",
    "Enterobacter": "ESBL or CRE",
    "Escherichia": "ESBL or CRE",
    "Acinetobacter": "MDR/XDR",
    "Pseudomonas": "MDR/XDR",
    "Staphylococcus": "MRSA (if mecA or cefoxitin resistant)",
    "Enterococcus": "VRE (if vancomycin resistant)",
    "Streptococcus": "DRSP (if penicillin or macrolide resistant)",
    "Neisseria": "FQ resistant (if ciprofloxacin resistant)",
}


def infer_mro(row):
    """Return a tuple (is_mro, phenotype) for one organism row.

    Args:
        row: Mapping-like row; 'GENUS' and 'ORGANISM' are read with `.get`, so
            either may be missing.

    Returns:
        ("Yes", phenotype) when the stripped, lower-cased ORGANISM name has a
        species-level entry, or else when the stripped GENUS has a genus-level
        entry; ("No", None) otherwise.
    """
    genus = str(row.get("GENUS", "")).strip()
    full_name = str(row.get("ORGANISM", "")).strip().lower()

    # The species-level entry is checked first, then the genus fallback.
    if full_name in _MRO_SPECIES_PHENOTYPES:
        return "Yes", _MRO_SPECIES_PHENOTYPES[full_name]
    if genus in _MRO_GENUS_PHENOTYPES:
        return "Yes", _MRO_GENUS_PHENOTYPES[genus]
    return "No", None
# Expand each (flag, phenotype) tuple into the two MRO columns.
mro_columns = ["MRO", "MRO_PHENOTYPE"]
organisms[mro_columns] = organisms.apply(
    lambda org_row: pd.Series(infer_mro(org_row)),
    axis=1,
)

# Save the enriched organisms table.
organisms.to_csv("org_recategorised.csv", index=False)
@aurthurm
Copy link
Author

The input files used here were obtained from WHONET: install it, go to its data directories, and copy the files from there.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment