Skip to content

Instantly share code, notes, and snippets.

@hsiaoyi0504
Last active July 19, 2018 15:36
Show Gist options
  • Save hsiaoyi0504/966affd9d6aa7785338f1d5a751c28b8 to your computer and use it in GitHub Desktop.
Save hsiaoyi0504/966affd9d6aa7785338f1d5a751c28b8 to your computer and use it in GitHub Desktop.
fetch file and run assembly metrics calculation
from __future__ import print_function
from bs4 import BeautifulSoup
from requests import get
from os import mkdir, remove
from os.path import exists
import re
from six.moves import urllib
from subprocess import run, PIPE
def listFileUrl(url, checkExt=False):
    """Return the entry URLs found on the directory-listing page at ``url``.

    Every ``<a>`` element on the page is turned into ``url + '/' + href``
    and the first five results are dropped (the listing's sort links for
    name, last modified, size and description, plus the parent-directory
    link).

    When ``checkExt`` is true, only entries whose file name looks like a
    FASTA sequence file are returned; every other entry is reported on
    stdout and skipped. Quality files (``*fa.qual.gz``) are always skipped.

    NOTE(review): an anchor without an ``href`` makes the string
    concatenation fail with ``TypeError``; callers appear to rely on that
    to detect missing directories, so it is deliberately not caught here.
    """
    markup = get(url).text
    anchors = BeautifulSoup(markup, 'html.parser').find_all('a')

    # Build the complete list before slicing so that a bad anchor anywhere
    # on the page (including the five header links) still raises.
    links = []
    for anchor in anchors:
        links.append(url + '/' + anchor.get('href'))
    links = links[5:]

    if not checkExt:
        return links

    sequence_suffixes = ('fa', 'fa.gz', 'fasta', 'fasta.gz',
                         'fna', 'fna.gz', 'fsa', 'fsa.gz')
    # Names such as "<name>.fa.<something>.gz" / "<name>.fna.<something>.gz".
    infix_patterns = (re.compile(r'.+[.]fa[.].+[.]gz'),
                      re.compile(r'.+[.]fna[.].+[.]gz'))

    kept = []
    for link in links:
        file_name = link.split('/')[-1]
        lowered = file_name.lower()
        is_quality_file = lowered.endswith('fa.qual.gz')
        is_sequence_file = (
            lowered.endswith(sequence_suffixes)
            or any(pattern.match(file_name) for pattern in infix_patterns)
        )
        if is_sequence_file and not is_quality_file:
            kept.append(link)
            continue
        print(('The file: "{}" is not '
               'related to assembly metrics.'
               ' It will be skipped.').format(file_name))
    return kept
# Make sure the working directories used by the rest of the script exist:
# ./data receives the downloaded sequence files, ./json the metric reports.
for work_dir in ('./data', './json'):
    if not exists(work_dir):
        mkdir(work_dir)

# Root directory listing; each entry is expected to be one organism.
data_url = 'https://i5k.nal.usda.gov/data/Arthropoda'
organism_urls = listFileUrl(data_url)
def _listSequenceFiles(dir_url):
    """Return the sequence-file URLs listed under ``dir_url``.

    ``listFileUrl`` raises ``TypeError`` when the page cannot be turned
    into a list of links; that case is treated as "no files" and an empty
    list is returned, so the caller never sees an unbound or stale value.
    """
    try:
        return listFileUrl(dir_url, True)
    except TypeError:
        return []


def _processAssembly(inputs, output_path):
    """Download the input files, run the metrics toolkit, then clean up.

    ``inputs`` is a list of ``(label, flag, file_url)`` tuples where
    ``label`` is used in progress messages, ``flag`` is the toolkit option
    (``-s`` or ``-c``) and ``file_url`` is the remote file to fetch into
    ``./data``. ``output_path`` is handed to the toolkit through ``-o``.
    Downloaded files are removed even when a download or the run fails.
    """
    command = ['python', './assembly_metrics_toolkit.py']
    local_files = []
    try:
        for label, flag, file_url in inputs:
            local_path = './data/' + file_url.split('/')[-1]
            # Registered before the download so a partial file is removed too.
            local_files.append((label, local_path))
            print('Downloading the {} file ...'.format(label))
            urllib.request.urlretrieve(
                urllib.parse.quote(file_url, safe=':/'), local_path)
            command += [flag, local_path]
        command += ['-o', output_path]

        print('Running the calculation of assembly metrics ...', end='')
        r = run(command, stdout=PIPE, stderr=PIPE)
        # stderr is captured as bytes; only report it when non-empty.
        if r.stderr:
            print(r.stderr.decode(errors='replace'))
        print('Finished !')
    finally:
        for label, local_path in local_files:
            if exists(local_path):
                print('Deleting the {} file ...'.format(label))
                remove(local_path)


# Walk every organism, then every assembly of that organism, and compute
# assembly metrics from whichever of scaffolds / contigs is available.
for base_url in organism_urls:
    assembly_urls = listFileUrl(base_url
                                + 'Current Genome Assembly/1.Genome Assembly')
    # Organism name: second-to-last path segment, the text after its last
    # '-', with surrounding parentheses stripped.
    # NOTE(review): this assumes base_url ends with '/' - confirm.
    organism = base_url.split('/')[-2].split('-')[-1].lstrip('(').rstrip(')')
    num_assembly = len(assembly_urls)
    if num_assembly == 1:
        print('Total 1 assembly for {}'.format(organism))
    else:
        print('Total {} assemblies for {}'.format(num_assembly, organism))
    print(('Start processing assembly'
           ' (or assemblies) of {} ...').format(organism))

    for num, a_url in enumerate(assembly_urls, 1):
        assembly_name = a_url.split('/')[-2]
        print(('Processing {}th assembly'
               ' with name of {} ...').format(num, assembly_name))

        scaffolds_urls = _listSequenceFiles(a_url + 'Scaffolds')
        if len(scaffolds_urls) == 2:
            # With exactly two candidates keep the first one if its URL
            # mentions "refseqids", otherwise keep the second one.
            if 'refseqids' in scaffolds_urls[0].lower():
                scaffolds_urls = [scaffolds_urls[0]]
            else:
                scaffolds_urls = [scaffolds_urls[1]]
        contigs_urls = _listSequenceFiles(a_url + 'Contigs')

        # NOTE(review): the report name depends on the organism only, so
        # several assemblies of one organism write to the same file.
        output_path = './json/{}.json'.format(organism.lower())

        if scaffolds_urls and contigs_urls:
            print('This assembly is with contigs and scaffolds.')
            inputs = [('scaffolds', '-s', scaffolds_urls[0]),
                      ('contigs', '-c', contigs_urls[0])]
        elif scaffolds_urls:
            print('This assembly is with scaffolds only.')
            inputs = [('scaffolds', '-s', scaffolds_urls[0])]
        elif contigs_urls:
            print('This assembly is with contigs only.')
            inputs = [('contigs', '-c', contigs_urls[0])]
            if organism.lower() == 'diaphorina_citri':
                # This organism gets one report per assembly.
                output_path = './json/{}_{}.json'.format(
                    organism.lower(), assembly_name.lower())
        else:
            print(('This assembly is without scaffolds and contigs'
                   ', no processing has done.'))
            continue

        _processAssembly(inputs, output_path)
@hsiaoyi0504
Copy link
Author

Note: this script runs only on Python 3 (3.5 or later, because it uses `subprocess.run`).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment