Last active
July 18, 2019 11:56
-
-
Save mkweskin/b7cad87d579892a07c081ddcb2cd2c4e to your computer and use it in GitHub Desktop.
Takes a tab-delimited BIOM feature table that lists sequences for each sample and a fasta file with all sequences for all samples. Outputs a separate fasta file for each sample containing only the sequences found in that sample. Optionally, you can specify a minimum count for each sequence for each sample and a minimum proportion (calculated within each sample) required for a sequence to be included.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import pandas as pd | |
from Bio import SeqIO | |
import argparse | |
import os | |
import logging | |
import sys | |
import datetime | |
def get_args():
    """Parse command-line options for the feature-table splitter.

    Returns:
        argparse.Namespace with attributes: feature_table, fasta_input,
        outputdir, header_row, min_count, min_prop, verbose, log.
    """
    parser = argparse.ArgumentParser(description="""Takes a feature table and a fasta file and returns the subset of sequences found in each sample of the feature table. """)
    parser.add_argument(
        '--feature-table',
        type=str,
        required=True,
        help="""Path to feature table"""
    )
    parser.add_argument(
        '--fasta-input',
        type=str,
        required=True,
        help="""Path to fasta file"""
    )
    parser.add_argument(
        '--outputdir', '-o',
        type=str,
        default=".",
        help="""Directory where output should go. Default: current directory"""
    )
    parser.add_argument(
        '--header-row',
        type=int,
        # Use a real int default; a string default would be re-parsed through
        # type=int by argparse, which works but is fragile and unidiomatic.
        default=2,
        help="""Row of the features table file with the column headers, lines above the header are ignored. (Default 2)"""
    )
    parser.add_argument(
        '--min-count',
        type=int,
        default=0,
        help="""Minimum number of reads required for the sequence to be included in the output. Both --min-count and --min-prop can be specified. (Default 0, no filtering by count)"""
    )
    parser.add_argument(
        '--min-prop',
        type=float,
        default=0.0,
        help="""Minimum proportion of reads of a sequence that are required for the sequence to be included in the output. Both --min-count and --min-prop can be specified. (Default 0, no filtering by proportion count)"""
    )
    parser.add_argument(
        '--verbose', '-v',
        action="store_true",
        help="""Report the sequences included for each sample. (Default disabled)"""
    )
    parser.add_argument(
        '--log', '-l',
        type=str,
        default="",
        help="""Name of output log. (Default fasta file name .log extension)"""
    )
    return parser.parse_args()
def saveSeqs(seqs, col, idList, countList, outputpath):
    """
    Outputs the sequences from the input fasta for the ids given (idList).
    Output file is col.fasta (appended to if it already exists).

    Args:
        seqs: dict mapping sequence id -> record with a ``.seq`` attribute
              (as produced by SeqIO.to_dict).
        col: sample name; used in the output filename and in each header.
        idList: sequence ids to write for this sample.
        countList: per-sequence read counts, parallel to idList.
        outputpath: output directory prefix (expected to end with "/").

    Ids missing from ``seqs`` are logged as warnings and skipped.
    """
    records = []
    for seq_id, count in zip(idList, countList):  # avoid shadowing builtin `id`
        if seq_id in seqs:
            records.append(">" + str(col) + "_" + str(seq_id) + "_" + str(count) + "\n" + str(seqs[seq_id].seq) + "\n")
        else:
            logging.warning("WARNING: sequence \"" + seq_id + "\" listed in \"" + str(col) + "\" was not found in fasta file!")
    # Open the file once instead of once per sequence; only create it when
    # at least one sequence was actually found (matches original behavior).
    if records:
        with open(outputpath + col + ".fasta", "a") as output_handle:
            output_handle.writelines(records)
def main():
    """Split a fasta file into per-sample fasta files driven by a feature table.

    Reads the tab-delimited feature table (samples in columns, sequence ids
    in the first column), filters each sample's sequences by --min-count and
    --min-prop, and writes one fasta per sample via saveSeqs(). Progress and
    a summary are written to a log file and echoed to the console.
    """
    args = get_args()
    # header_row is 1-based on the command line; pandas wants 0-based.
    features = pd.read_csv(args.feature_table, sep='\t', header=(args.header_row - 1))
    seqs = SeqIO.to_dict(SeqIO.parse(args.fasta_input, "fasta"))
    outputpath = args.outputdir + "/"
    # makedirs(exist_ok=True) avoids the check-then-create race and, unlike
    # os.mkdir, also creates intermediate directories for nested paths.
    os.makedirs(outputpath, exist_ok=True)

    # Setup logging: everything goes to the log file; the console handler
    # below echoes INFO (or DEBUG with --verbose) to the screen.
    if args.log == "":
        args.log = os.path.splitext(os.path.basename(args.fasta_input))[0] + ".log"
    logging.basicConfig(filename=outputpath + args.log, format='%(message)s', level=logging.DEBUG)
    logging.info(datetime.datetime.now().replace(microsecond=0).isoformat() + " " + " ".join(sys.argv))
    # Have some levels of logging go to the screen (https://stackoverflow.com/a/9321890)
    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG if args.verbose else logging.INFO)
    console.setFormatter(logging.Formatter('%(message)s'))
    logging.getLogger('').addHandler(console)

    firstCol = True
    # Tallies for end-of-run report
    totalCount = 0      # running total of nonzero sequences (only when filtering)
    includedCount = 0   # sequences that passed the filters
    sampleCount = 0     # samples processed
    fastaCount = 0      # fasta files written

    filtering = args.min_prop > 0 or args.min_count > 0
    if filtering:
        logging.debug("ID\tTotal seqs\tIncluded seqs")
    else:
        logging.debug("ID\tSequences")

    for col in features.columns:
        if firstCol:
            # Don't process the first column, it has sample names
            firstCol = False
            firstColName = col
            continue
        # saveSeqs appends, so remove any stale output from a previous run.
        if os.path.exists(outputpath + col + ".fasta"):
            os.remove(outputpath + col + ".fasta")
        sampleCount += 1
        # Include these sequences if the min_prop and min_count values are met
        feature_df = features[(features[col] >= (args.min_prop * features[col].sum())) & (features[col] >= args.min_count) & (features[col] != 0)]
        # Count of sequences with nonzero reads in this sample (computed once).
        nonzeroCount = features[features[col] != 0].shape[0]
        if filtering:
            totalCount += nonzeroCount
            logging.debug(str(col) + "\t" + str(nonzeroCount) + "\t" + str(feature_df.shape[0]))
        else:
            logging.debug(str(col) + "\t" + str(nonzeroCount))
        # Output the sequences for this sample to a fasta file
        if feature_df.shape[0] > 0:
            fastaCount += 1
            includedCount += feature_df.shape[0]
            saveSeqs(seqs, col, list(feature_df[firstColName]), list(feature_df[col]), outputpath)

    # Print end-of-run summary
    logging.info("Samples: " + str(sampleCount))
    logging.info("FASTA files output: " + str(fastaCount))
    logging.info("Sum of all sequences output across FASTA files: " + str(includedCount))
    if filtering:
        logging.info("Sum of all filtered out sequences across FASTA files: " + str(totalCount - includedCount))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment