Last active
June 27, 2025 16:11
-
-
Save aurthurm/b7d13d21a7eac9722076255283dd9651 to your computer and use it in GitHub Desktop.
BreakPoints from whonet exploration
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
# Load the WHONET reference tables (tab-separated text files).
breakpoints = pd.read_csv('Breakpoints.txt', sep='\t', low_memory=False)
antibiotics = pd.read_csv('Antibiotics.txt', sep='\t', low_memory=False)
organisms = pd.read_csv('Organisms.txt', sep='\t', low_memory=False)

# Limit breakpoints to 2024 CLSI MIC/disk entries for human hosts only.
# The explicit .copy() detaches the filtered rows from the full table, so the
# columns assigned to `breakpoints` later in this script are written to an
# independent frame rather than to a slice (which makes pandas emit
# SettingWithCopyWarning).
breakpoints = breakpoints[
    breakpoints['TEST_METHOD'].isin(['MIC', 'DISK']) &
    (breakpoints['GUIDELINES'] == 'CLSI') &
    (breakpoints['YEAR'] == 2024) &
    (breakpoints['HOST'] == 'Human')
].copy()
# Map each breakpoint to a group name based on the grouping codes provided
# Categories decided by the code type alone, whatever the code is.
_CATEGORY_BY_CODE_TYPE = {
    'WHONET_ORG_CODE': "organism-direct",
    'ALL': "organisms-all",
    'ANAEROBE': "Anaerobe",
}

# Group codes that are spelled out by hand, keyed by code type and then code.
# Codes of these types that are not listed here map to 'drop-me'.
_GROUP_NAMES_BY_CODE_TYPE = {
    'GENUS_GROUP': {
        'NFR': 'Non-Fermenting Rods',
    },
    'SEROVAR_GROUP': {
        'HIN': 'Haemophilus',
        'ECO': 'Enterobacteriaceae',
    },
    # Per the original author's note these species-group codes are not defined
    # in any of the input files; the names were researched with an AI
    # assistant, so verify them before relying on them.
    'SPECIES_GROUP': {
        'ABX': 'Acinetobacter baumannii complex',
        'MTX': 'Mycobacterium tuberculosis complex',
        'MAX': 'Mycobacterium avium complex',
        'SGM': 'Slowly Growing Mycobacteria',
        'RGM': 'Rapidly Growing Mycobacteria',
        'BCX': 'Burkholderia cepacia complex',
        'SVI': 'Streptococcus (Viridans)',
        'BS-': 'Streptococcus (Beta Hem)',
        'COF': 'Coryneform (Diphtheroids)',
    },
}


def get_org_category(row, organisms_df):
    """Return the category label for one breakpoint row.

    Args:
        row: Mapping-like breakpoint record (dict or pandas Series) providing
            'ORGANISM_CODE_TYPE' and 'ORGANISM_CODE'.
        organisms_df: Organism table. For code types that are columns of this
            table, the category is read from the column named after the part
            of the code type before the first underscore, upper-cased.

    Returns:
        One of:
        * "organism-direct", "organisms-all" or "Anaerobe" for the code types
          WHONET_ORG_CODE, ALL and ANAEROBE;
        * a hand-written group name, or "drop-me" for an unlisted code, for
          the code types GENUS_GROUP, SEROVAR_GROUP and SPECIES_GROUP;
        * the first non-null value of the derived column among the organism
          rows whose code-type column equals the code;
        * "value-column-not-found" when the code type is a column but the
          derived value column is not;
        * "no-match" in every other case.
    """
    code_type = row['ORGANISM_CODE_TYPE']
    code = row['ORGANISM_CODE']

    if code_type in _CATEGORY_BY_CODE_TYPE:
        return _CATEGORY_BY_CODE_TYPE[code_type]

    if code_type in _GROUP_NAMES_BY_CODE_TYPE:
        return _GROUP_NAMES_BY_CODE_TYPE[code_type].get(code, 'drop-me')

    # Everything else is resolved against the organism table itself.
    if code_type not in organisms_df.columns:
        return "no-match"

    value_column = code_type.split('_')[0].upper()
    if value_column not in organisms_df.columns:
        return "value-column-not-found"

    names = organisms_df.loc[organisms_df[code_type] == code, value_column].dropna()
    if names.empty:
        return "no-match"
    return names.iloc[0]
# Tag every breakpoint row with the category returned by get_org_category.
breakpoints['CATEGORY'] = breakpoints.apply(
    lambda bp_row: get_org_category(bp_row, organisms),
    axis=1,
)
# Map breakpoints that target one specific organism to that organism's name
def get_org_direct(row, organisms_df):
    """Return the organism name a breakpoint row points at directly.

    Only rows whose 'ORGANISM_CODE_TYPE' is 'WHONET_ORG_CODE' are looked up:
    the 'ORGANISM' value of the first row of ``organisms_df`` whose
    'WHONET_ORG_CODE' equals the row's 'ORGANISM_CODE' is returned. Every
    other row, and any code without a match, yields "Grouping".
    """
    code_type = row['ORGANISM_CODE_TYPE']
    code = row['ORGANISM_CODE']

    if code_type != 'WHONET_ORG_CODE':
        return "Grouping"

    matches = organisms_df.loc[organisms_df[code_type] == code]
    if matches.empty:
        return "Grouping"
    return matches.iloc[0]['ORGANISM']
# Resolve the organism name for rows that point at a single organism; all
# other rows receive the placeholder returned by get_org_direct.
breakpoints['ORGANISM'] = breakpoints.apply(
    lambda bp_row: get_org_direct(bp_row, organisms),
    axis=1,
)
breakpoints['CATEGORY'].unique()  # inspection only: the result is not stored

# Keep the breakpoints that mapped well, i.e. whose category is not one of
# the placeholder labels below.
not_found = ['value-column-not-found', 'no-match', 'organisms-all', 'drop-me']
good_bp = breakpoints.loc[~breakpoints['CATEGORY'].isin(not_found)]

# Merge the remapped breakpoints with the antibiotics table. The inner join
# keeps only rows whose WHONET_ABX_CODE is present in both tables.
dataset = good_bp.merge(antibiotics, on='WHONET_ABX_CODE', how='inner')
dataset.columns  # inspection only: the result is not stored
# select only required columns and rename | |
def columner(df):
    """Return ``df`` restricted to the export columns, in export order.

    'GUIDELINES_x' and 'POTENCY_x' are selected under their suffixed names;
    this function does not rename them. Selecting a column that ``df`` lacks
    raises ``KeyError``.
    """
    export_columns = [
        'GUIDELINES_x', 'YEAR', 'TEST_METHOD', 'HOST',
        'ORGANISM_CODE', 'ORGANISM_CODE_TYPE', 'BREAKPOINT_TYPE',
        'ANTIBIOTIC', 'POTENCY_x',
        'R', 'I', 'S',
        'CATEGORY', 'ORGANISM',
    ]
    return df[export_columns]
# Keep the export columns, strip the '_x' merge suffixes, and drop duplicate
# rows. columner() already returns the columns in export order and rename()
# preserves that order, so no second column selection is needed.
dataset = (
    columner(dataset)
    .rename(columns={
        'GUIDELINES_x': 'GUIDELINES',
        'POTENCY_x': 'POTENCY',
    })
    .drop_duplicates()
)
# split mic and disk | |
# Split the combined table by test method.
mic_breakpoints = dataset.loc[dataset['TEST_METHOD'] == 'MIC']
disk_breakpoints = dataset.loc[dataset['TEST_METHOD'] == 'DISK']

# Save to CSV: the combined table first, then one file per test method.
for frame, target in (
    (dataset, "bp_recategorised_all.csv"),
    (mic_breakpoints, "bp_recategorised_mic.csv"),
    (disk_breakpoints, "bp_recategorised_disc.csv"),
):
    frame.to_csv(target, index=False)

# Understand the mappings: count the rows per (category, organism, antibiotic)
# combination and write the counts, sorted by that combination, to CSV.
(
    dataset[['CATEGORY', 'ORGANISM', 'ANTIBIOTIC']]
    .value_counts()
    .sort_index()
    .reset_index(name='count')
    .to_csv('bp_all_counts.csv', index=False)
)
# Collect the AST category assigned to each (code type, code) pair into a dict
# Maps (ORGANISM_CODE_TYPE, ORGANISM_CODE) -> CATEGORY. Rows categorised as
# 'organism-direct' are left out, and the first category seen for a pair wins.
# NOTE(review): this iterates `breakpoints`, not `good_bp`, so placeholder
# categories such as 'no-match' or 'drop-me' also end up in the dict; confirm
# that this is intended.
mapping_dict = {}
for _, row in breakpoints.iterrows():
    if row['CATEGORY'] == 'organism-direct':
        continue
    key = (row['ORGANISM_CODE_TYPE'], row['ORGANISM_CODE'])
    mapping_dict.setdefault(key, row['CATEGORY'])
# Function that determines the AST category for a single organism
def find_category_for_organism(row, mapper):
    """Return the category of the first mapping entry that matches ``row``.

    Args:
        row: Mapping-like organism record (dict or pandas Series).
        mapper: Dict keyed by ``(column_name, code)`` tuples whose values are
            category labels. Entries are tried in the dict's iteration order.

    Returns:
        The category of the first entry whose column is present in ``row``
        and whose code equals the row's value in that column, or ``None``
        when no entry matches.
    """
    return next(
        (
            category
            for (col, code), category in mapper.items()
            if col in row and row[col] == code
        ),
        None,
    )
# Look up each organism's AST category in mapping_dict (None when no entry
# matches the organism row).
organisms['CATEGORY'] = organisms.apply(
    lambda org_row: find_category_for_organism(org_row, mapping_dict),
    axis=1,
)
# Phylum names that mo_gramstain treats as Gram-positive. The names come in
# pairs (presumably the older and the current name of the same phylum).
GRAM_POSITIVE_PHYLA = {
    "Actinobacteria",
    "Actinomycetota",
    "Firmicutes",
    "Bacillota",
    "Tenericutes",
    "Mycoplasmatota",
    "Chloroflexi",
    "Chloroflexota",
}


def mo_gramstain(row):
    """Classify one organism row as "gram+" or "gram-".

    Rows whose 'KINGDOM' is not "Bacteria" yield ``None``. Otherwise an
    'ORGANISM_TYPE' of '+' or '-' decides the result outright. Without such a
    marker the row is "gram+" when its 'PHYLUM' is in GRAM_POSITIVE_PHYLA and
    its 'CLASS' is not "Negativicutes", and "gram-" in all remaining cases.
    """
    if row["KINGDOM"] != "Bacteria":
        return None

    positive_by_taxonomy = (
        row["PHYLUM"] in GRAM_POSITIVE_PHYLA
        and row["CLASS"] != "Negativicutes"
    )

    # An explicit marker on the row overrides the taxonomy-based guess.
    organism_type = row['ORGANISM_TYPE']
    if organism_type == '+':
        return "gram+"
    if organism_type == '-':
        return "gram-"
    return "gram+" if positive_by_taxonomy else "gram-"
# Derive the Gram stain of every organism row.
organisms["GRAM_STAIN"] = organisms.apply(mo_gramstain, axis=1)
# Species-level shape overrides, keyed by the lower-cased organism name.
_SHAPE_BY_SPECIES = {
    "neisseria meningitidis": "coccus.diplococci",
    "neisseria gonorrhoeae": "coccus.diplococci",
    "streptococcus pneumoniae": "coccus.diplococci",
    "staphylococcus aureus": "coccus.staphylococci",
    "streptococcus pyogenes": "coccus.streptococci",
    "lactobacillus acidophilus": "rod.bacillus",
    "vibrio cholerae": "spiral.vibrio",
    "helicobacter pylori": "spiral.spirillum",
    "treponema pallidum": "spiral.spirillum",
}

# Genus-level default shapes, keyed by the genus exactly as capitalised here.
_SHAPE_BY_GENUS = {
    # Cocci
    "Staphylococcus": "coccus.staphylococci",
    "Streptococcus": "coccus.streptococci",
    "Neisseria": "coccus.diplococci",
    "Sarcina": "coccus.sarcina",
    "Micrococcus": "coccus.tetrad",
    "Enterococcus": "coccus.streptococci",
    "Planococcus": "coccus",
    # Rods
    "Escherichia": "rod.bacillus",
    "Klebsiella": "rod.bacillus",
    "Salmonella": "rod.bacillus",
    "Shigella": "rod.bacillus",
    "Bacillus": "rod.bacillus",
    "Lactobacillus": "rod.bacillus",
    "Listeria": "rod.bacillus",
    "Corynebacterium": "rod.bacillus",
    "Yersinia": "rod.coccobacilli",
    "Brucella": "rod.coccobacilli",
    "Haemophilus": "rod.coccobacilli",
    "Acinetobacter": "rod.coccobacilli",
    "Pseudomonas": "rod.bacillus",
    "Mycobacterium": "rod.bacillus",
    "Clostridium": "rod.bacillus",
    "Enterobacter": "rod.bacillus",
    # Spirals
    "Vibrio": "spiral.vibrio",
    "Campylobacter": "spiral.spirillum",
    "Helicobacter": "spiral.spirillum",
    "Spirillum": "spiral.spirillum",
    "Treponema": "spiral.spirillum",
    "Borrelia": "spiral.spirillum",
    "Leptospira": "spiral.spirillum",
}


def infer_morphology(row):
    """Return a shape label for one organism row, or ``None`` when unknown.

    Args:
        row: Mapping-like organism record (dict or pandas Series). 'ORGANISM'
            and 'GENUS' are read with ``row.get`` and may be absent.

    Returns:
        The species-level override when the stripped, lower-cased 'ORGANISM'
        has one; otherwise the default for the stripped 'GENUS' (matched
        case-sensitively); otherwise ``None``.
    """
    genus = str(row.get("GENUS", "")).strip()
    full_name = str(row.get("ORGANISM", "")).strip().lower()

    # A species-level entry takes precedence over the genus default.
    if full_name in _SHAPE_BY_SPECIES:
        return _SHAPE_BY_SPECIES[full_name]
    return _SHAPE_BY_GENUS.get(genus)
# Derive a cell-shape label for every organism row (None when unknown).
organisms["SHAPE"] = organisms.apply(infer_morphology, axis="columns")
# GLASS-priority organisms (labelled "WHO GLASS list, version 2023" by the
# original author), keyed by the lower-cased organism name.
_GLASS_SPECIES = frozenset({
    "escherichia coli",
    "klebsiella pneumoniae",
    "acinetobacter baumannii",
    "pseudomonas aeruginosa",
    "salmonella spp",
    "shigella spp",
    "neisseria gonorrhoeae",
    "streptococcus pneumoniae",
    "staphylococcus aureus",
})

# Genera accepted as a fallback; matched exactly as capitalised here.
_GLASS_GENERA = frozenset({
    "Escherichia", "Klebsiella", "Acinetobacter", "Pseudomonas",
    "Salmonella", "Shigella", "Neisseria", "Streptococcus", "Staphylococcus",
})


def is_glass_priority(row):
    """Return 'Yes' or 'No' for whether the organism is on the GLASS priority list.

    Args:
        row: Mapping-like organism record (dict or pandas Series). 'ORGANISM'
            and 'GENUS' are read with ``row.get`` and may be absent.

    Returns:
        "Yes" when the stripped, lower-cased 'ORGANISM' is a listed species
        or the stripped 'GENUS' is a listed genus; "No" otherwise.
    """
    genus = str(row.get("GENUS", "")).strip()
    full_name = str(row.get("ORGANISM", "")).strip().lower()

    # NOTE(review): the genus fallback answers "Yes" for every species of a
    # listed genus, not only for the species named above; confirm that this
    # is intended.
    if full_name in _GLASS_SPECIES or genus in _GLASS_GENERA:
        return "Yes"
    return "No"
# Flag every organism row as GLASS priority ('Yes') or not ('No').
organisms["GLASS"] = organisms.apply(is_glass_priority, axis="columns")
# Known MROs and their phenotypes, keyed by the lower-cased organism name.
_MRO_PHENOTYPE_BY_SPECIES = {
    "escherichia coli": "ESBL or CRE",
    "klebsiella pneumoniae": "ESBL or CRE",
    "enterobacter cloacae": "ESBL or CRE",
    "acinetobacter baumannii": "MDR/XDR",
    "pseudomonas aeruginosa": "MDR/XDR",
    "staphylococcus aureus": "MRSA",
    "enterococcus faecalis": "VRE",
    "enterococcus faecium": "VRE",
    "streptococcus pneumoniae": "DRSP",
    "neisseria gonorrhoeae": "FQ resistant",
}

# Genus-level fallbacks; matched exactly as capitalised here.
_MRO_PHENOTYPE_BY_GENUS = {
    "Klebsiella": "ESBL or CRE",
    "Enterobacter": "ESBL or CRE",
    "Escherichia": "ESBL or CRE",
    "Acinetobacter": "MDR/XDR",
    "Pseudomonas": "MDR/XDR",
    "Staphylococcus": "MRSA (if mecA or cefoxitin resistant)",
    "Enterococcus": "VRE (if vancomycin resistant)",
    "Streptococcus": "DRSP (if penicillin or macrolide resistant)",
    "Neisseria": "FQ resistant (if ciprofloxacin resistant)",
}


def infer_mro(row):
    """Return a tuple ``(is_mro, phenotype)`` for one organism row.

    Args:
        row: Mapping-like organism record (dict or pandas Series). 'ORGANISM'
            and 'GENUS' are read with ``row.get`` and may be absent.

    Returns:
        ``("Yes", phenotype)`` using the species-level phenotype when the
        stripped, lower-cased 'ORGANISM' is listed, else the genus-level
        phenotype when the stripped 'GENUS' is listed; ``("No", None)`` when
        neither matches.
    """
    genus = str(row.get("GENUS", "")).strip()
    full_name = str(row.get("ORGANISM", "")).strip().lower()

    # Check the full name first; the genus table is only a fallback.
    if full_name in _MRO_PHENOTYPE_BY_SPECIES:
        return "Yes", _MRO_PHENOTYPE_BY_SPECIES[full_name]
    if genus in _MRO_PHENOTYPE_BY_GENUS:
        return "Yes", _MRO_PHENOTYPE_BY_GENUS[genus]
    return "No", None
# infer_mro returns a 2-tuple per organism; wrapping it in a Series makes
# apply() produce two columns, which are stored as MRO and MRO_PHENOTYPE.
mro_columns = ["MRO", "MRO_PHENOTYPE"]
organisms[mro_columns] = organisms.apply(
    lambda org_row: pd.Series(infer_mro(org_row)),
    axis="columns",
)

# save
organisms.to_csv("org_recategorised.csv", index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The files used here were obtained from WHONET: install it, go to its data directories, and copy the files from there.