Last active
July 19, 2018 15:36
-
-
Save hsiaoyi0504/966affd9d6aa7785338f1d5a751c28b8 to your computer and use it in GitHub Desktop.
Fetch genome assembly files and run the assembly metrics calculation on them
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import print_function | |
from bs4 import BeautifulSoup | |
from requests import get | |
from os import mkdir, remove | |
from os.path import exists | |
import re | |
from six.moves import urllib | |
from subprocess import run, PIPE | |
def listFileUrl(url, checkExt=False):
    """List the URLs linked from a directory-index web page.

    The page at ``url`` is downloaded and the ``href`` of every ``<a>``
    element is appended to ``url + '/'``.  The first five links are dropped
    because they are the listing's own controls, not entries.

    Args:
        url: Address of the directory listing to read.
        checkExt: When true, keep only links whose file name looks like a
            FASTA sequence file; a message is printed for every link that
            is skipped.

    Returns:
        A list of URL strings (possibly empty).

    Raises:
        TypeError: If an ``<a>`` element has no ``href`` (``get('href')``
            then yields ``None``, which cannot be concatenated to a string).
            The calling script catches this to detect an unavailable
            directory, so it is deliberately left to propagate.
    """
    page = get(url).text
    soup = BeautifulSoup(page, 'html.parser')
    urls = [url + '/' + node.get('href') for node in soup.find_all('a')]
    # Exclude the sort links for (1) Name, (2) Last modified, (3) Size,
    # (4) Description, and (5) the Parent directory link.
    urls = urls[5:]
    if not checkExt:
        return urls

    # NOTE(review): these suffixes have no leading dot, so any name that
    # merely ends in e.g. "fa" is accepted -- confirm that this is intended.
    acceptable_ext = ('fa', 'fa.gz', 'fasta', 'fasta.gz',
                      'fna', 'fna.gz', 'fsa', 'fsa.gz')
    # Names such as "x.fa.masked.gz" / "x.fna.masked.gz".  Compiled once
    # here instead of once per file inside the loop.
    compound_ext = re.compile(r'.+[.](?:fa|fna)[.].+[.]gz')

    filtered_urls = []
    for u in urls:
        file_name = u.split('/')[-1]
        # Every check uses the lower-cased name so that upper-case file
        # names are treated the same way by the suffix and regex tests.
        lowered = file_name.lower()
        looks_like_sequence = (lowered.endswith(acceptable_ext)
                               or compound_ext.match(lowered))
        # Quality files ("*.fa.qual.gz") would satisfy the compound pattern
        # but are not sequence data, so they are rejected explicitly.
        if looks_like_sequence and not lowered.endswith('fa.qual.gz'):
            filtered_urls.append(u)
        else:
            print(('The file: "{}" is not '
                   'related to assembly metrics.'
                   ' It will be skipped.').format(file_name))
    return filtered_urls
# Working directories: downloads go to ./data, metric reports to ./json.
for _directory in ('./data', './json'):
    if not exists(_directory):
        mkdir(_directory)


def _list_sequence_urls(directory_url):
    """Return the sequence-file URLs under directory_url, or [] on failure."""
    try:
        return listFileUrl(directory_url, True)
    except TypeError:
        # A TypeError from listFileUrl is treated as "directory not
        # available".  Returning an empty list (instead of leaving the
        # result unbound) avoids a NameError or a stale value carried over
        # from the previous assembly.
        return []


def _download(file_url, label):
    """Download file_url into ./data and return the local file path."""
    local_path = './data/' + file_url.split('/')[-1]
    print('Downloading the {} file ...'.format(label))
    # Percent-encode everything except the scheme/path separators.
    urllib.request.urlretrieve(urllib.parse.quote(file_url, safe=':/'),
                               local_path)
    return local_path


def _run_metrics(output_json, scaffolds_path=None, contigs_path=None):
    """Run assembly_metrics_toolkit.py on the given files and report stderr."""
    command = ['python', './assembly_metrics_toolkit.py']
    if scaffolds_path is not None:
        command += ['-s', scaffolds_path]
    if contigs_path is not None:
        command += ['-c', contigs_path]
    command += ['-o', output_json]
    print('Running the calculation of assembly metrics ...', end='')
    result = run(command, stdout=PIPE, stderr=PIPE)
    # stderr is captured as bytes, so comparing it with '' is always true;
    # test for emptiness and decode before printing instead.
    if result.stderr:
        print(result.stderr.decode('utf-8', errors='replace'))
    print('Finished !')


data_url = 'https://i5k.nal.usda.gov/data/Arthropoda'
organism_urls = listFileUrl(data_url)
for base_url in organism_urls:
    assembly_urls = listFileUrl(base_url
                                + 'Current Genome Assembly/1.Genome Assembly')
    # The organism name is the last '-'-separated part of the directory
    # name, with any surrounding parentheses removed.
    organism = base_url.split('/')[-2].split('-')[-1].lstrip('(').rstrip(')')
    num_assembly = len(assembly_urls)
    if num_assembly == 1:
        print('Total 1 assembly for {}'.format(organism))
    else:
        print('Total {} assemblies for {}'.format(num_assembly, organism))
    print(('Start processing assembly'
           ' (or assemblies) of {} ...').format(organism))

    for num, a_url in enumerate(assembly_urls, 1):
        assembly_name = a_url.split('/')[-2]
        print(('Processing {}th assembly'
               ' with name of {} ...').format(num, assembly_name))

        scaffolds_urls = _list_sequence_urls(a_url + 'Scaffolds')
        if len(scaffolds_urls) == 2:
            # With exactly two candidates, keep the first one if its URL
            # mentions "refseqids"; otherwise keep the second one.
            keep = 0 if 'refseqids' in scaffolds_urls[0].lower() else 1
            scaffolds_urls = [scaffolds_urls[keep]]
        contigs_urls = _list_sequence_urls(a_url + 'Contigs')

        hasScaffolds = bool(scaffolds_urls)
        hasContigs = bool(contigs_urls)
        if hasScaffolds and hasContigs:
            print('This assembly is with contigs and scaffolds.')
        elif hasScaffolds:
            print('This assembly is with scaffolds only.')
        elif hasContigs:
            print('This assembly is with contigs only.')
        else:
            print(('This assembly is without scaffolds and contigs'
                   ', no processing has done.'))
            continue

        # Reports are named after the organism.  Contigs-only assemblies of
        # diaphorina_citri additionally carry the assembly name.  (The
        # generic contigs-only case used to format a two-placeholder
        # template with a single argument, which raised IndexError.)
        output_json = './json/{}.json'.format(organism.lower())
        if (hasContigs and not hasScaffolds
                and organism.lower() == 'diaphorina_citri'):
            output_json = './json/{}_{}.json'.format(organism.lower(),
                                                     assembly_name.lower())

        downloaded = []  # (label, local path) of every file fetched so far
        try:
            scaffolds_path = None
            contigs_path = None
            if hasScaffolds:
                scaffolds_path = _download(scaffolds_urls[0], 'scaffolds')
                downloaded.append(('scaffolds', scaffolds_path))
            if hasContigs:
                contigs_path = _download(contigs_urls[0], 'contigs')
                downloaded.append(('contigs', contigs_path))
            _run_metrics(output_json, scaffolds_path, contigs_path)
        finally:
            # Remove the downloads even if the download or the run failed.
            for label, local_path in downloaded:
                print('Deleting the {} file ...'.format(label))
                remove(local_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Note: this script runs only on Python 3 (it relies on subprocess.run, which Python 2 does not provide).