cthoyt · April 9, 2025 15:54
diff --git a/bioregistry-text-mining-regexes.tsv b/bioregistry-text-mining-regexes.tsv
diff --git a/get-bioregistry-text-mining-regexes.py b/get-bioregistry-text-mining-regexes.py
 import bioregistry
 from tabulate import tabulate
 import click
 from textwrap import shorten
 import pandas as pd


 def main():
    """Tabulate Bioregistry resources whose pattern is letters followed by digits.

    Every resource returned by ``bioregistry.resources()`` is inspected.
    After a leading ``^`` and trailing ``$`` are dropped, a pattern is kept
    only when it ends in a digit run and the part in front of that run is
    purely alphabetic and at least two characters long.

    The kept rows (prefix, name shortened to 50 characters, anchor-free
    pattern, example identifier) are printed as a Markdown table and written
    tab-separated to ``bioregistry-text-mining-regexes.tsv``, a path that is
    relative to the current working directory.
    """
    digit_run = "\\d+"
    header = ['Prefix', 'Resource Name', "RegEx", "Example"]

    records = []
    for entry in bioregistry.resources():
        raw = entry.get_pattern()
        if raw is None:
            continue

        regex = raw.removeprefix("^").removesuffix("$")
        # A bare digit run has no literal part to anchor a text-mining match on.
        if regex == digit_run or not regex.endswith(digit_run):
            continue

        literal = regex.removesuffix(digit_run)
        # Keep only multi-letter, purely alphabetic literal prefixes.
        keep = len(literal) != 1 and literal != "\\w" and literal.isalpha()
        if not keep:
            continue

        record = (
            entry.prefix,
            shorten(entry.get_name(), 50),
            regex,
            entry.get_example(),
        )
        records.append(record)

    table = pd.DataFrame(records, columns=header)
    print(table.to_markdown(index=False))

    table.to_csv("bioregistry-text-mining-regexes.tsv", index=False, sep='\t')


 # Run the export only when this file is executed as a script, not on import.
 if __name__ == '__main__':
    main()
Prefix	Resource Name	RegEx	Example
anzctr	Australian New Zealand Clinical Trials Registry	ACTRN\d+	ACTRN12623000498695
beetlebase	Tribolium Genome Database -- Insertion	TC\d+	TC010103
blastrule	BlastRule	NBR\d+	NBR016799
cellopub	Cellosaurus Publication	CLPUB\d+	CLPUB00496
chembl	ChEMBL	CHEMBL\d+	CHEMBL4303805
chembl.cell	ChEMBL database of bioactive drug-like small [...]	CHEMBL\d+	CHEMBL3307800
chembl.compound	ChEMBL	CHEMBL\d+	CHEMBL465070
chembl.target	ChEMBL target	CHEMBL\d+	CHEMBL3467
chempro.competitor	ChemPro Competitor	LDCM\d+	LDCM0160
chempro.probe	ChemPro Probe	LDPC\d+	LDPC0032
chempro.target	ChemPro Target	LDTP\d+	LDTP03903
clingen.allele	ClinGen Allele Registry	CA\d+	CA981206459
codelink	GE Healthcare/Amersham Biosciences CodeLink [...]	GE\d+	GE86325
cog	Cluster of orthologous genes	COG\d+	COG0001
comptox	DSSTox substance	DTXSID\d+	DTXSID2021028
dbsnp	dbSNP Reference SNP number	rs\d+	rs121909098
ddinter.drug	Curated Drug-Drug Interactions Database - Drug	DDInter\d+	DDInter20
dictybase.est	dictyBase Expressed Sequence Tag	DDB\d+	DDB0016567
drks	German Clinical Trials Register	DRKS\d+	DRKS00031815
drsc	Drosophila RNAi Screening Center	DRSC\d+	DRSC05221
drugbank.category	DrugBank Drug Category	DBCAT\d+	DBCAT000600
drugbank.condition	DrugBank Condition	DBCOND\d+	DBCOND0066902
drugbank.metabolite	DrugBank Metabolite	DBMET\d+	DBMET02292
echobase	EchoBASE post-genomic database for [...]	EB\d+	EB0170
ecmdb	E. coli Metabolite Database	ECMDB\d+	ECMDB00005
ecogene	Database of Escherichia coli Sequence and Function	EG\d+	EG10173
foodb.compound	FooDB compound	FDB\d+	FDB002100
foodb.food	FooDB Food	FOOD\d+	FOOD00020
fsnp	F-SNP	rs\d+	rs17852708
genetree	GeneTree	ENSGT\d+	ENSGT00550000074763
genprop	Genome Properties	GenProp\d+	GenProp0699
gold.meta	GOLD metadata	Gm\d+	Gm00047
gpmdb	Global Proteome Machine Database	GPM\d+	GPM32310002988
gwascentral.marker	GWAS Central Marker	HGVM\d+	HGVM15354
gwascentral.phenotype	GWAS Central Phenotype	HGVPM\d+	HGVPM623
gwascentral.study	GWAS Central Study	HGVST\d+	HGVST1828
hmdb	Human Metabolome Database	HMDB\d+	HMDB00001
hovergen	Homologous Vertebrate Genes Database	HBG\d+	HBG004341
ideal	Intrinsically Disordered proteins with [...]	IID\d+	IID00001
ihw	International Histocompatibility Workshop [...]	IHW\d+	IHW09326
integbio	Integbio	nbdc\d+	nbdc01071
isrctn	International Traditional Medicine Clinical [...]	ISRCTN\d+	ISRCTN10175490
itmctr	International Traditional Medicine Clinical [...]	ITMCTR\d+	ITMCTR2023000002
jcrb	JRBC Cell Bank	JCRB\d+	JCRB1355
kcris	Korean Clinical Research Information Service	KCT\d+	KCT0008394
kegg.dgroup	KEGG Drug Group	DG\d+	DG00301
kegg.rclass	KEGG Reaction Class	RC\d+	RC00001
lbctr	Lebanon Clinical Trials Registry	LBCTR\d+	LBCTR2023015204
massive	MassIVE	MSV\d+	MSV000082131
metabolights	MetaboLights Compound	MTBLS\d+	MTBLS1
mgnify.analysis	MGnify Analysis	MGYA\d+	MGYA00002270
mirnest	miRNEST	MNEST\d+	MNEST029358
molbic.cellline	MolBiC Cell Line	CL\d+	CL000025
molbic.compound	MolBiC Compound	CP\d+	CP0041613
molbic.protein	MolBiC Protein	PT\d+	PT00819
ncbibook	NCBI Bookshelf	NBK\d+	NBK331
nif.std	NIF Standard Ontology	BAMSC\d+	BAMSC981
norine	Nonribosomal Peptides Database	NOR\d+	NOR00681
npass	Natural Product Activity and Species Source [...]	NPC\d+	NPC139585
nrfc	National Repository of Fish Cell Lines	NRFC\d+	NRFC051
pactr	Pan African Clinical Trials Registry	PACTR\d+	PACTR202304525632216
pdc.study	Proteomic Data Commons	PDC\d+	PDC000351
pfam.clan	Pfam Clans	CL\d+	CL0192
pharmgkb.disease	PharmGKB Disease	PA\d+	PA447218
pharmgkb.drug	PharmGKB Drug	PA\d+	PA448710
pharmgkb.pathways	PharmGKB	PA\d+	PA146123006
pharmgkb.variant	PharmGKB Variant	PA\d+	PA166156302
piroplasma	PiroplasmaDB	TA\d+	TA14985
ppr	Europe PMC Preprints	PPR\d+	PPR103739
prodom	ProDom	PD\d+	PD10000
radlex	RSNA Informatics RadLex	RID\d+	RID1
rcb	RIKEN Bioresource Center Cell Bank	RCB\d+	RCB0002
rdbsb	Registry and database of bioparts for [...]	OENC\d+	OENC205
rpcec	Cuban Registry of Clinical Trials	RPCEC\d+	RPCEC00000423
rtecs	Registry of Toxic Effects of Chemical Substances	AB\d+	AB1925000
seed.compound	SEED Compound	cpd\d+	cpd15380
seed.reaction	SEED Reactions	rxn\d+	rxn00001
skip	Stemcell Knowledge and Information Portal	SKIP\d+	SKIP001214
smpdb	Small Molecule Pathway Database	SMP\d+	SMP0000219
subtilist	Bacillus subtilis genome sequencing project	BG\d+	BG11523
synbip.bts	Synthetic Binding Proteins Binding Target	st\d+	st00576
synbip.epitope	Synthetic Binding Protein Epitope	mt\d+	mt002305
synbip.pss	Synthetic Binding Protein Protein Scaffold	ps\d+	ps048
synbip.sbp	Synthetic Binding Protein	sbp\d+	sbp000002
tctr	Thai Clinical Trials Registry	TCTR\d+	TCTR20230429001
tigrfam	TIGR protein families	TIGR\d+	TIGR00010
tsc	Tetrahymena Stock Center	SD\d+	SD00043
ttd.drug	TTD Drug	DAP\d+	DAP000773
ttd.target	TTD Target	TTDS\d+	TTDS00056
umbbd.rule	EAWAG Biocatalysis/Biodegradation Database	bt\d+	bt0001
uniprot.arba	Association-Rule-Based Annotator	ARBA\d+	ARBA00000001
virsirna	VIRsiRNA	virsi\d+	virsi1909
wwf.ecoregion	World Wildlife Fund Ecoregion	AT\d+	AT1402
ymdb	Yeast Metabolome Database	YMDB\d+	YMDB00001
	import bioregistry
	from tabulate import tabulate
	import click
	from textwrap import shorten
	import pandas as pd


	def main():
	    """Write a table of Bioregistry patterns shaped like letters plus digits.

	    Iterates over ``bioregistry.resources()``, strips a leading ``^`` and a
	    trailing ``$`` from each resource's pattern, and keeps the pattern only
	    if it ends with a digit run whose preceding part is alphabetic and
	    longer than one character. Resources without a pattern are skipped.

	    Output: the selected rows (prefix, name shortened to 50 characters,
	    stripped pattern, example) are printed as Markdown and saved as
	    tab-separated values in ``bioregistry-text-mining-regexes.tsv`` under
	    the current working directory.
	    """
	    rows = []
	    for resource in bioregistry.resources():
	        pattern = resource.get_pattern()
	        if pattern is None:
	            continue

	        # Drop the anchors so the pattern can match inside running text.
	        stripped = pattern.removeprefix("^").removesuffix("$")
	        if not stripped.endswith("\\d+"):
	            continue
	        # Digits alone carry no distinguishing literal prefix.
	        if stripped == "\\d+":
	            continue

	        # The part before the digit run must be a multi-letter alphabetic string.
	        sstripped = stripped.removesuffix("\\d+")
	        if len(sstripped) == 1 or sstripped == "\\w" or not sstripped.isalpha():
	            continue

	        rows.append((
	            resource.prefix,
	            shorten(resource.get_name(), 50),
	            stripped,
	            resource.get_example(),
	        ))

	    df = pd.DataFrame(rows, columns=['Prefix', 'Resource Name', "RegEx", "Example"])
	    print(df.to_markdown(index=False))

	    df.to_csv("bioregistry-text-mining-regexes.tsv", index=False, sep='\t')


	# Run the export only when this file is executed as a script, not on import.
	if __name__ == '__main__':
	    main()