Created
September 19, 2016 21:30
-
-
Save daler/a71d7c952875e47744c904dc65c77bd5 to your computer and use it in GitHub Desktop.
Example for https://www.biostars.org/p/212519/
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Install into new environment and activate it before running this script:
#
# conda create -n biostars-212519 python=3 requests numpy pandas
# source activate biostars-212519
# python identify_encode_controls.py
import pandas | |
import numpy as np | |
import requests | |
def accession_metadata(acc):
    """
    Returns the metadata for ENCODE accession `acc` (e.g., ENCSR000BJN).

    Sends a GET request for the accession's experiment page on
    encodeproject.org with ``?frame=object`` and an ``accept:
    application/json`` header.

    Parameters
    ----------
    acc : str
        ENCODE experiment accession, formatted into the request URL.

    Returns
    -------
    bytes
        The raw body of the response (``response.content``).

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx or 5xx status code.
    requests.exceptions.Timeout
        If the server does not respond within the timeout.
    """
    HEADERS = {'accept': 'application/json'}
    URL = (
        'https://www.encodeproject.org/experiments/{0}/?frame=object'
        .format(acc)
    )
    # requests has no default timeout; without one a stalled connection would
    # hang the whole script indefinitely.
    response = requests.get(URL, headers=HEADERS, timeout=60)
    # Fail loudly on an HTTP error instead of handing an error page back to
    # the caller as if it were metadata.
    response.raise_for_status()
    return response.content
# To obtain a URL like this one interactively: subset your query on
# encodeproject.org, click the "download" button, and take the first line of
# the file you get. This example covers all HepG2 ChIP-seq data.
metadata_url = (
    "https://www.encodeproject.org/metadata/type=Experiment&"
    "biosample_term_name=HepG2&assay_title=ChIP-seq&limit=all/metadata.tsv"
)

# Load the metadata table and keep just the first 100 rows for this example.
df = pandas.read_table(metadata_url).iloc[:100]
def find_controls(acc):
    """
    Return the possible controls listed for ENCODE accession `acc`.

    The metadata for an accession contains a "possible_controls" field. I'm
    taking that to mean there can be multiple controls, so to be safe I'm
    returning a list of them.

    Parameters
    ----------
    acc : str
        ENCODE experiment accession, passed on to `accession_metadata`.

    Returns
    -------
    list of str
        For each entry in "possible_controls", the third '/'-separated
        component. NOTE(review): this assumes entries look like
        '/experiments/ENCSR000XXX/', in which case that component is the
        control's accession -- confirm against the ENCODE response.

    Raises
    ------
    KeyError
        If the metadata has no "possible_controls" field.
    """
    # Local import so this function does not depend on the file's import
    # block being changed.
    import json

    raw = accession_metadata(acc)
    # The payload arrives as raw bytes; decode explicitly so parsing behaves
    # the same on every Python 3 version.
    if isinstance(raw, bytes):
        raw = raw.decode('utf-8')
    # Parse with the standard library rather than handing a literal JSON
    # payload to pandas.read_json, which newer pandas releases deprecate.
    m = json.loads(raw)
    c = m['possible_controls']
    return [i.split('/')[2] for i in c]
# The metadata has multiple rows for each accession. To speed things up
# dramatically, only look for controls for the unique set of accessions, and
# then join them to the dataframe afterwards.
ds = []
for acc in df['Experiment accession'].unique():
    print('getting metadata for accession:', acc)
    ds.append(
        {
            'Experiment accession': acc,
            'controls': find_controls(acc),
        }
    )

# Name the columns explicitly so that an empty `ds` still yields a frame with
# an 'Experiment accession' column; otherwise set_index raises KeyError.
controls = pandas.DataFrame(
    ds, columns=['Experiment accession', 'controls']
).set_index('Experiment accession')

# Join controls to full metadata: every row of `df` gets the list of controls
# found for its accession.
df = df.join(controls, on='Experiment accession')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment