|
from os import system |
|
import os |
|
import io |
|
|
|
# Absolute path of the RepeatMasker executable launched by repeat_master_.
RM_PATH = '/csbiohome02/mcmillan/Notebooks/Charles/yet-summer-again/repeat-masker/RepeatMasker/RepeatMasker'

# Field names for one parsed row, in positional order (15 entries).
# NOTE(review): presumably these mirror the columns of RepeatMasker's ``.out``
# table, with the first begin/end/left describing the query and the second
# describing the repeat -- confirm against the RepeatMasker documentation.
# 'begin', 'end' and 'left' each occur twice, so the names are not unique
# and cannot be used directly as dictionary keys.
COLUMNS = [
    'score', 'div', 'del', 'ins',
    'query', 'begin', 'end', 'left',
    'sgn',
    'repeat', 'family', 'begin', 'end', 'left',
    'id',
]
|
def parse_repeatmasker(fp):
    """Yield the whitespace-separated fields of each data line in ``fp``.

    The first three lines of the input are treated as a header and are
    always skipped. Blank (or whitespace-only) lines after the header are
    skipped as well, so every yielded list contains at least one token.

    Args:
        fp: Any iterable of text lines, e.g. an open file object.

    Yields:
        A list of string tokens for each non-empty line after the header.
    """
    for lineno, line in enumerate(fp):
        # Lines 0, 1 and 2 are the header.
        if lineno <= 2:
            continue
        toks = line.split()
        # A blank line would otherwise yield [], which callers that index
        # into the row (for example row[4]) cannot handle.
        if not toks:
            continue
        yield toks
|
|
|
def identity(x):
    """Return ``x`` unchanged."""
    return x
|
|
|
def apply_elements(f, coll):
    """Apply the i-th callable of ``f`` to the i-th item of ``coll``.

    The two inputs are paired with ``zip``, so the result stops at the
    shorter of the two.

    Args:
        f: Iterable of one-argument callables.
        coll: Iterable of values.

    Returns:
        A generator producing ``f[i](coll[i])`` for each pair.
    """
    paired = zip(f, coll)
    return (func(value) for func, value in paired)
|
|
|
def repeat_master_(seqs):
    """Run RepeatMasker on one batch of sequences and keep one row per sequence.

    The sequences are written to ``in.fasta`` in the current working
    directory, each under a zero-padded numeric header (``>000000``,
    ``>000001``, ...). The query field of every parsed row is converted
    back to ``int`` and used as the index into the result list.

    Args:
        seqs: A sized sequence (``len()`` is called on it) of sequence
            strings.

    Returns:
        A list as long as ``seqs``. Entry ``i`` is the converted row whose
        query field equals ``i``, or ``None`` when ``in.fasta.out`` has no
        such row. If several rows name the same query, the last one in the
        file is kept. An empty ``seqs`` returns ``[]`` without running
        RepeatMasker.

    Raises:
        IOError/OSError: propagated from ``open`` when ``in.fasta.out`` does
            not exist after the RepeatMasker command has run.
        ValueError: propagated from the field converters when a token is
            not in the expected numeric form.
    """
    if len(seqs) == 0:
        # Nothing to annotate; do not launch RepeatMasker on an empty file.
        return []

    # Remove the table left by an earlier run. If the command below does not
    # write a fresh one, the open() further down fails instead of silently
    # returning rows that belong to a previous batch.
    if os.path.exists('in.fasta.out'):
        os.remove('in.fasta.out')
    system('rm -rf *.RMoutput')

    # One converter per field, in positional order; built once rather than
    # once per row. zip() in apply_elements drops any tokens beyond these 15.
    converters = (int, float, float, float, int, int, int,
                  str, str, str, str, str, str, str, int)

    with open('in.fasta', 'w') as fp:
        fp.write(''.join('>%06d\n%s\n' % (i, s) for i, s in enumerate(seqs)))

    system('{} -q -species \'mouse\' in.fasta > /dev/null'.format(RM_PATH))

    res = [None] * len(seqs)
    with open('in.fasta.out') as fp:
        for toks in parse_repeatmasker(fp):
            if not toks:
                # Blank line in the table: nothing to convert or index.
                continue
            row = list(apply_elements(converters, toks))
            # row[4] is the query field, i.e. the numeric FASTA header.
            res[row[4]] = row
    return res
|
|
|
from itertools import chain

try:
    from itertools import imap  # Python 2: lazy counterpart of map()
except ImportError:
    imap = map  # Python 3: the built-in map() is already lazy


def flatmap(f, items):
    """Map ``f`` over ``items`` and concatenate the resulting iterables.

    Evaluation is lazy: ``f`` is called on an item only when the elements
    it produces are about to be consumed.

    Args:
        f: Callable taking one item and returning an iterable.
        items: Iterable of inputs for ``f``.

    Returns:
        An iterator over the elements of ``f(items[0])``, then
        ``f(items[1])``, and so on.
    """
    return chain.from_iterable(imap(f, items))
|
|
|
def groups_in(it, n):
    """Split an iterable into consecutive lists of ``n`` items.

    Args:
        it: Iterable to split.
        n: Number of items per group.

    Yields:
        Lists holding ``n`` items each, in input order. The final list
        holds whatever is left over and may be shorter; nothing is yielded
        for an empty input.
    """
    batch = []
    for item in it:
        batch.append(item)
        if len(batch) != n:
            continue
        # The batch is full: hand it out and start a fresh list.
        yield batch
        batch = []
    # Emit the trailing partial batch, if there is one.
    if len(batch) > 0:
        yield batch
|
|
|
def batch_map(fn, seq, n):
    """Apply ``fn`` to ``seq`` in groups of ``n`` and flatten the results.

    Args:
        fn: Callable that takes a list of up to ``n`` items and returns an
            iterable of results.
        seq: Iterable of input items.
        n: Group size handed to ``groups_in``.

    Returns:
        The iterator produced by ``flatmap`` over the per-group results.
    """
    batches = groups_in(seq, n)
    return flatmap(fn, batches)
|
|
|
def repeat_master(seqs, batch_size=5000):
    """Run ``repeat_master_`` over ``seqs`` in batches and chain the results.

    Args:
        seqs: Iterable of sequence strings.
        batch_size: Number of sequences handed to each ``repeat_master_``
            call. Defaults to 5000, the value that used to be hard-coded.

    Returns:
        The iterator produced by ``batch_map``: the per-batch result lists
        of ``repeat_master_`` joined end to end.
    """
    return batch_map(repeat_master_, seqs, batch_size)
|
|
|
if __name__ == "__main__":
    # Smoke test: run the same single sequence through the one-batch
    # function and through the batched wrapper, printing both results.
    demo_seqs = ['AAAATACCTTGGCATGACTCTAACTAAGGAAGTGAAAGATCTGTA']

    r = repeat_master_(demo_seqs)
    print(r)

    # repeat_master returns an iterator, so materialise it before printing.
    r = repeat_master(demo_seqs)
    print(list(r))