Created
April 3, 2023 02:30
-
-
Save audy/beb2edce512aa24acb55f9cafaecf87b to your computer and use it in GitHub Desktop.
quickly fetch assemblies from NCBI
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import os | |
import downloads | |
from joblib import Parallel, delayed | |
from itertools import islice | |
from tqdm import tqdm | |
def get_gbk_path(assembly) -> str:
    """
    Given an Assembly object, return the https path to its GenBank file

    ``assembly`` is a mapping with an ``"ftp_path"`` entry holding the URL of
    the assembly's directory. The GenBank file is expected to live inside that
    directory and to be named after it:
    ``<ftp_path>/<last path component>_genomic.gbff.gz``.

    A leading ``ftp://`` scheme is rewritten to ``https://``; any other scheme
    is left untouched. A trailing ``/`` on ``ftp_path`` is ignored.

    Raises ``KeyError`` if ``assembly`` has no ``"ftp_path"`` entry.
    """
    # Drop a trailing slash first, otherwise the directory name (and therefore
    # the file name derived from it) would come out empty.
    path = assembly["ftp_path"].rstrip("/")
    # Only rewrite the scheme prefix, never an "ftp://" deeper in the string.
    if path.startswith("ftp://"):
        path = "https://" + path[len("ftp://"):]
    dirname = path.rsplit("/", 1)[-1]
    # URLs are always "/"-separated; os.path.join would use the OS separator.
    return f"{path}/{dirname}_genomic.gbff.gz"
def iter_assembly_summary(handle):
    """
    Iterate over an assembly_summary.txt from NCBI yielding rows as dictionaries

    ``handle`` is any iterable of text lines (typically an open file). The
    first line is discarded, the second line supplies the tab-separated column
    names, and every following data line is yielded as a ``dict`` mapping
    column name to the stripped field value.

    Blank lines and lines starting with ``#`` after the header are skipped, so
    several summary files concatenated together (each bringing its own two
    header lines) can be read as one. An input with fewer than two lines
    yields nothing.

    # requires assembly summary file from ncbi genbank/refseq
    curl --silent --progress-bar https://ftp.ncbi.nih.gov/genomes/{genbank,refseq}/{bacteria,archaea,viruses,fungi}/assembly_summary.txt > assembly_summary.txt
    """
    lines = iter(handle)
    # Use next() with a default: a bare next() raising StopIteration inside a
    # generator is converted to RuntimeError (PEP 479) on short/empty input.
    next(lines, None)  # skip false header
    header_line = next(lines, None)
    if header_line is None:
        return
    header = [x.strip() for x in header_line.split("\t")]
    # The first column name carries the comment marker; accept it with or
    # without a space after the "#".
    header[0] = header[0].lstrip("#").strip()
    for line in lines:
        # Repeated headers (from concatenated files) and blank lines are not
        # data rows.
        if line.startswith("#") or not line.strip():
            continue
        yield dict(zip(header, [x.strip() for x in line.split("\t")]))
def download_assembly(assembly):
    """
    Download the GenBank file for one assembly into ``assemblies/``

    ``assembly`` is a row dictionary providing ``"assembly_accession"`` and
    ``"ftp_path"``. The file is written to
    ``assemblies/<assembly_accession>.gbk.gz``; if that file already exists
    nothing is downloaded.

    Returns ``True`` when the file is present afterwards (downloaded now or
    already on disk) and ``False`` when the row has no usable ``ftp_path``
    and was skipped.
    """
    # Summary rows without downloadable data appear to carry "na" (or nothing)
    # in ftp_path -- confirm against NCBI's README_assembly_summary.txt.
    # Skip them rather than building a nonsense URL that fails the whole run.
    if assembly["ftp_path"] in ("", "na"):
        return False
    out_path = f"assemblies/{assembly['assembly_accession']}.gbk.gz"
    if not os.path.exists(out_path):
        # exist_ok keeps this safe when several parallel workers get here at
        # the same time.
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        downloads.download(get_gbk_path(assembly), out_path=out_path)
    return True
def main():
    """
    Download every assembly listed in ``combined_assembly_summary.txt``

    Rows are read lazily from the summary file in the current working
    directory, wrapped in a tqdm progress bar, and handed to
    ``download_assembly`` through a joblib ``Parallel`` pool of 8 jobs.
    """
    pool = Parallel(n_jobs=8)
    job = delayed(download_assembly)
    with open("combined_assembly_summary.txt") as handle:
        rows = tqdm(iter_assembly_summary(handle))
        # The pool must be driven inside the ``with`` block: rows are read
        # from the open file as the pool consumes the generator.
        pool(job(row) for row in rows)
# Start the downloads only when this file is executed as a script, so that
# importing it as a module has no side effects.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment