Last active
December 23, 2018 08:11
-
-
Save sgsfak/ed87e19a71f09c7517d2 to your computer and use it in GitHub Desktop.
Parse DrugBank XML file with xmlstarlet
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- | |
-- PostgreSQL database dump | |
-- | |
-- Dumped from database version 9.5.2 | |
-- Dumped by pg_dump version 9.5.2 | |
SET statement_timeout = 0; | |
SET lock_timeout = 0; | |
SET client_encoding = 'UTF8'; | |
SET standard_conforming_strings = on; | |
SET check_function_bodies = false; | |
SET client_min_messages = warning; | |
SET row_security = off; | |
CREATE TABLE atc_drugs ( | |
drug_id character varying(10), | |
atc_code character varying(10) | |
); | |
CREATE TABLE drug_brands ( | |
drug_id character varying(10), | |
brand text, | |
company text | |
); | |
CREATE TABLE drug_interactions ( | |
drug_id1 character varying(10), | |
drug_id2 character varying(10), | |
name text, | |
description text | |
); | |
CREATE TABLE drugs ( | |
drug_id character varying(10) NOT NULL, | |
name text, | |
description text | |
); | |
CREATE TABLE drug_dosages ( | |
drug_id character varying(10) NOT NULL REFERENCES drugs(drug_id), | |
form text, | |
route text, | |
strength text | |
); | |
CREATE INDEX drug_dosages_id on drug_dosages(drug_id); | |
CREATE VIEW drug_ints_vw AS | |
SELECT di.drug_id1 AS drug_id, | |
array_agg(di.drug_id2 ORDER BY di.drug_id2) AS drug_ids2, | |
array_agg(di.name ORDER BY di.drug_id2) AS nmes, | |
array_agg(di.description ORDER BY di.drug_id2) AS descriptions | |
FROM (drug_interactions di | |
JOIN drugs ON (((di.drug_id2)::text = (drugs.drug_id)::text))) | |
GROUP BY di.drug_id1; | |
CREATE TABLE drug_synonyms ( | |
drug_id character varying(10), | |
synonym text | |
); | |
CREATE MATERIALIZED VIEW drugs_ftsearch AS | |
SELECT drugs.drug_id, | |
to_tsvector('english'::regconfig, concat(drugs.name, ' ', b.brands, ' ', s.synonyms)) AS ftext | |
FROM ((drugs | |
LEFT JOIN ( SELECT drug_brands.drug_id, | |
string_agg(drug_brands.brand, ' '::text) AS brands | |
FROM drug_brands | |
GROUP BY drug_brands.drug_id) b USING (drug_id)) | |
LEFT JOIN ( SELECT drug_synonyms.drug_id, | |
string_agg(drug_synonyms.synonym, ' '::text) AS synonyms | |
FROM drug_synonyms | |
GROUP BY drug_synonyms.drug_id) s USING (drug_id)) | |
WITH NO DATA; | |
CREATE MATERIALIZED VIEW drugs_name_ftsearch AS | |
SELECT drugs.drug_id, | |
to_tsvector('english'::regconfig, drugs.name) AS name_ftext | |
FROM drugs | |
WITH NO DATA; | |
CREATE VIEW drugs_vw AS | |
SELECT drugs.drug_id, | |
drugs.name, | |
a.atc_codes, | |
b.brands, | |
s.synonyms, | |
drugs.description | |
FROM (((drugs | |
LEFT JOIN ( SELECT atc_drugs.drug_id, | |
array_agg(atc_drugs.atc_code) AS atc_codes | |
FROM atc_drugs | |
GROUP BY atc_drugs.drug_id) a USING (drug_id)) | |
LEFT JOIN ( SELECT drug_brands.drug_id, | |
array_agg(drug_brands.brand) AS brands | |
FROM drug_brands | |
GROUP BY drug_brands.drug_id) b USING (drug_id)) | |
LEFT JOIN ( SELECT drug_synonyms.drug_id, | |
array_agg(drug_synonyms.synonym) AS synonyms | |
FROM drug_synonyms | |
GROUP BY drug_synonyms.drug_id) s USING (drug_id)); | |
ALTER TABLE ONLY drugs | |
ADD CONSTRAINT drugs_pkey PRIMARY KEY (drug_id); | |
CREATE INDEX atc_drugs_atc_code_idx ON atc_drugs USING btree (atc_code); | |
CREATE INDEX atc_drugs_drug_id_idx ON atc_drugs USING btree (drug_id); | |
CREATE INDEX drug_brands_drug_id_idx ON drug_brands USING btree (drug_id); | |
CREATE INDEX drug_interactions_drug_id1_idx ON drug_interactions USING btree (drug_id1); | |
CREATE INDEX drug_interactions_drug_id2_idx ON drug_interactions USING hash (drug_id2); | |
CREATE INDEX drug_synonyms_drug_id_idx ON drug_synonyms USING btree (drug_id); | |
CREATE INDEX drugs_ftsearch_ftext_idx ON drugs_ftsearch USING gin (ftext); | |
CREATE INDEX drugs_name_ftsearch_name_ftext_idx ON drugs_name_ftsearch USING gin (name_ftext); | |
ALTER TABLE ONLY atc_drugs | |
ADD CONSTRAINT atc_drugs_drug_id_fkey FOREIGN KEY (drug_id) REFERENCES drugs(drug_id); | |
ALTER TABLE ONLY drug_brands | |
ADD CONSTRAINT drug_brands_drug_id_fkey FOREIGN KEY (drug_id) REFERENCES drugs(drug_id); | |
ALTER TABLE ONLY drug_interactions | |
ADD CONSTRAINT drug_interactions_drug_id1_fkey FOREIGN KEY (drug_id1) REFERENCES drugs(drug_id); | |
ALTER TABLE ONLY drug_synonyms | |
ADD CONSTRAINT drug_synonyms_drug_id_fkey FOREIGN KEY (drug_id) REFERENCES drugs(drug_id); | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
all : drugs.tsv atc_drugs.tsv drug_interactions.tsv drug_synonyms.tsv drug_brands.tsv drug_dosages.tsv | |
drugbank.xml: | |
curl -L -o drugbank.xml.zip -u EMAIL:PASSWORD http://www.drugbank.ca/releases/5-0-1/downloads/all-full-database | |
rm -f drugbank.xml ; unzip -p drugbank.xml.zip > $@ | |
drugs.tsv: drugbank.xml | |
echo 'drug_id name description' > $@ | |
xmlstarlet sel -T -N t=http://www.drugbank.ca -t -m 't:drugbank/t:drug/t:drugbank-id[@primary="true"]' -v 'concat(.," ", ../t:name, " ", normalize-space(../t:description))' -n $< >> $@ | |
atc_drugs.tsv: drugbank.xml | |
echo 'drug_id atc_code' > $@ | |
xmlstarlet sel -T -N t=http://www.drugbank.ca -t -m 't:drugbank/t:drug/t:atc-codes/t:atc-code' -v 'concat(../../t:drugbank-id," ", ./@code)' -n $< >> $@ | |
drug_interactions.tsv: drugbank.xml | |
echo 'drug_id1 drug_id2 name description' > $@ | |
xmlstarlet sel -T -N t=http://www.drugbank.ca -t -m 't:drugbank/t:drug/t:drug-interactions/t:drug-interaction' -v 'concat(../../t:drugbank-id," ", t:drugbank-id, " ", normalize-space(t:name), " ", normalize-space(t:description))' -n $< >> $@ | |
drug_synonyms.tsv: drugbank.xml | |
echo 'drug_id synonym' > $@ | |
xmlstarlet sel -T -N t=http://www.drugbank.ca -t -m 't:drugbank/t:drug/t:synonyms/t:synonym' -v 'concat(../../t:drugbank-id," ", normalize-space(.))' -n $< >> $@ | |
drug_brands.tsv: drugbank.xml | |
echo 'drug_id brand company' > $@ | |
xmlstarlet sel -T -N t=http://www.drugbank.ca -t -m 't:drugbank/t:drug/t:international-brands/t:international-brand' -v 'concat(../../t:drugbank-id," ", normalize-space(.), " ", ./@company)' -n $< >> $@ | |
drug_dosages.tsv: drugbank.xml | |
echo 'drug_id form route strength' > $@ | |
xmlstarlet sel -T -N t=http://www.drugbank.ca -t -m 't:drugbank/t:drug/t:dosages/t:dosage' -v 'concat(../../t:drugbank-id," ", normalize-space(t:form)," ", normalize-space(t:route), " ", normalize-space(t:strength))' -n $< >> $@ | |
db_insert : drugs.tsv atc_drugs.tsv drug_interactions.tsv drug_synonyms.tsv drug_brands.tsv drug_dosages.tsv | |
psql -c "TRUNCATE drugs, atc_drugs, drug_interactions, drug_synonyms, drug_brands, drug_dosages" drugbank | |
psql -c "COPY drugs from '`pwd`/drugs.tsv' with (format csv, delimiter ' ', header true, quote '|')" drugbank | |
psql -c "COPY atc_drugs from '`pwd`/atc_drugs.tsv' with (format csv, delimiter ' ', header true, quote '|')" drugbank | |
psql -c "COPY drug_interactions from '`pwd`/drug_interactions.tsv' with (format csv, delimiter ' ', header true, quote '|')" drugbank | |
psql -c "COPY drug_synonyms from '`pwd`/drug_synonyms.tsv' with (format csv, delimiter ' ', header true, quote '|')" drugbank | |
psql -c "COPY drug_brands from '`pwd`/drug_brands.tsv' with (format csv, delimiter ' ', header true, quote '|')" drugbank | |
psql -c "COPY drug_dosages from '`pwd`/drug_dosages.tsv' with (format csv, delimiter ' ', header true, quote '|')" drugbank | |
psql -c "REFRESH MATERIALIZED VIEW drugs_name_ftsearch" drugbank | |
psql -c "REFRESH MATERIALIZED VIEW drugs_ftsearch" drugbank | |
.PHONY: all db_insert |
@sgsfak, thanks. I'm trying to use this but I'm new and just trying to get a simple crosswalk between TDD ID, DrugBankID and ATC code. I'm trying to run your code in ubuntu but when I run the script in the terminal it doesn't work. I'm unsure on what system requirements may be needed to run this.
Thanks in advance.
@mauricioromero86 The requirements are a recent version of Linux (Ubuntu "Trusty" (14.04) is fine), PostgreSQL >= 9.5, curl
, and xmlstarlet
. For the latter sudo apt-get curl xmlstarlet
should be enough.
PostgreSQL is not really needed if you just want the CSV (*.tsv
) files.
Hi, how to use the 'Makefile' script ?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
As explained at http://www.drugbank.ca/releases/latest currently in order to access the DrugBank downloads you need to signup for a free DrugBank account. Once you have signed up you will be able to download the complete database manually or programatically (for example through an application or script). Please update the Makefile to replace your email and password in the Makefile in order for this to work.