Created
February 5, 2016 19:21
-
-
Save bsmith89/4c3866bcdb77614b315e to your computer and use it in GitHub Desktop.
Minorly redacted Makefile for a project using mothur to analyze 16S data.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ====================
# Project Makefile
# ====================
# The {{{1 / {{{2 / {{{3 markers are vim foldmarkers for section navigation.
# User Configuration {{{1
# ====================
# Major targets {{{2
.PHONY: figs res
figs:
# [Partially redacted]
# `res` gathers the fully-processed OTU table, consensus taxonomy, and
# representative sequences (built by the rename rules near the bottom).
res: res/rrs.procd.otus.shared
res: res/rrs.procd.otus.tax
res: seq/rrs.procd.otus.reps.afn
# [Partially redacted]
# What files are generated on `make all`?
all: docs figs res
# Compute environment {{{2
# Name, and directory, of the python virtual environment:
VENV = .venv
# Directory for recipe-internal files and terminal documents:
BUILD_DIR := build
# Documentation settings {{{2
.PHONY: docs
docs: ${BUILD_DIR}/2016-02-02_pg-meeting.pdf
BIB_FILE=doc/main.bib
# Exclusion patterns for the makefile dependency-graph tool (presumably
# consumed by base.mk -- TODO confirm).  NOTE(review): 'raw/hplc/*' is
# glob-style while the other -d patterns are regexes ('.*'); verify which
# form the graphing script expects.
MAKEFILE_GRAPH_FLAGS=-d 'raw/seq/.*' -d 'seq/split/.*' -d 'meta/split/.*' -d 'res/split.*' -d 'raw/hplc/*'
# Preqs for major documentation targets {{{3
# (Prerequisite-only pattern rule; the recipe presumably lives in base.mk.)
${BUILD_DIR}/README.%:
# Cleanup settings {{{2
# Use the following line to add files and directories to be deleted on `make clean`:
CLEANUP += *.docx *.html *.log *.logfile *.pbs.o* *.pbs.e* ${BUILD_DIR}/*
# Test artifacts removed by `make testclean`.  Globs here are expanded by
# the shell inside the recipe below, not by make.
TESTCLEANUP = res/test* res/split/test* ${BUILD_DIR}/test* meta/test* seq/test* seq/split/test*
.PHONY: testclean
# Remove all test-run artifacts listed in TESTCLEANUP.
testclean:
	rm -rf ${TESTCLEANUP}
# Initialization settings {{{2
# What directories to generate on `make data-dirs`.
# By default, already includes build/ fig/
# (Backslash continuations collapse to single spaces in a variable
# definition, so this is one flat space-separated list.)
DATA_DIRS += raw/ raw/ref raw/seq \
	raw/hplc raw/hplc/proto \
	meta/ ref/ \
	res/ res/split \
	seq/ seq/split
# MOTHUR settings {{{2
# Link pre-requisites to the $BUILD dir so mothur reads inputs from, and
# writes its outputs into, ${BUILD_DIR} (recipes then mv results into place).
define MOTHUR_SETUP
@ln -fsrt ${BUILD_DIR}/ $^
# START: > $@
endef
define MOTHUR_TEARDOWN
# FINISH: > $@
endef
# Ordered list of pre-requisite link locations (in the $BUILD dir)
LN_PREQS = $(addprefix ${BUILD_DIR}/,$(notdir $^))
MOTHUR ?= mothur -q
# If you wanna swap in mothur-mpi use:
# MOTHUR ?= mpirun -np ${MAX_PROCS} mothur-mpi
# Requires that you have built `mothur-mpi` as a MPI capable version
# Uncomment the redirect to silence mothur's stdout in every recipe:
# MOTHUR_STDOUT := >/dev/null
MOTHUR_STDOUT :=
MAX_PROCS ?= $(shell nproc)
# Random seed for reproducibility.
SEED ?= 1
# When to throw out samples from community matrix.
# BUGFIX: this comment used to trail the assignment ("= 6000 # ..."),
# which makes the value "6000 " (with trailing whitespace) -- a classic
# GNU make pitfall that corrupts any string built from the variable.
MIN_READS_PER_SAMPLE = 6000
NUM_PRECLUST_DIFFS = 2
OTU_CUTOFF ?= 0.03
DIST_CUTOFF ?= 0.15
CLUST_METHOD ?= average
SPLIT_TAXLEVEL ?= 4
# Map CLUST_METHOD to mothur's output-file abbreviation (used to locate
# the cluster.split output file).  Unrecognized methods fall back to "an".
# BUGFIX: the "nearest" branch used to carry a trailing comment
# ("nn # TODO..."), which made the value "nn " (with a trailing space)
# and broke the `mv ${BUILD_DIR}/$*.${CLUST_METHOD_ABBRV}.unique_list.list`
# step in the OTU rule.  Comments now sit on their own lines.
# Flattened with `else ifeq` (GNU make >= 3.81) instead of nesting.
ifeq (${CLUST_METHOD}, furthest)
CLUST_METHOD_ABBRV := fn
else ifeq (${CLUST_METHOD}, average)
CLUST_METHOD_ABBRV := an
else ifeq (${CLUST_METHOD}, nearest)
# TODO: Is "nn" the correct abbreviation for the nearest-neighbor method?
CLUST_METHOD_ABBRV := nn
else
CLUST_METHOD_ABBRV := an
endif
# FINALLY: Include base makefile {{{2
include base.mk
# ==============
# Data {{{1
# ==============
# User defined recipes for cleaning up and initially parsing data.
# e.g. Slicing out columns, combining data sources, alignment, generating
# phylogenies, etc.
# 16S Library {{{2
# Reference sets {{{3
# $(call extract_from_tarball,<tarball>,<member>): stream a single member
# of a gzipped tarball to stdout (-O) and capture it as the current
# target ($@).  Intended for use inside recipes only.
extract_from_tarball = tar -Oxzf ${1} ${2} > $@
# Download the SILVA v119 non-redundant reference set from the mothur wiki.
raw/ref/Silva.nr_v119.tgz:
	wget --directory-prefix=${@D} http://www.mothur.org/w/images/2/27/Silva.nr_v119.tgz
# Pull the alignment and taxonomy members out of the tarball.
ref/silva.nr.afn: raw/ref/Silva.nr_v119.tgz
	$(call extract_from_tarball,$^,silva.nr_v119.align)
ref/silva.nr.tax: raw/ref/Silva.nr_v119.tgz
	$(call extract_from_tarball,$^,silva.nr_v119.tax)
# TODO: Is this the region I really want?
# Trim the SILVA reference alignment down to one amplicon region with
# mothur's pcr.seqs.  start/end are alignment-column coordinates
# (presumably bracketing the V4 region, per the target name -- confirm).
ref/silva.nr.pcr-v4.afn: ref/silva.nr.afn
	${MOTHUR_SETUP}
	${MOTHUR} "#pcr.seqs(fasta=${LN_PREQS}, start=11894, end=25319, \
		keepdots=F, processors=${MAX_PROCS})" ${MOTHUR_STDOUT}
	@mv ${BUILD_DIR}/silva.nr.pcr.afn $@
	${MOTHUR_TEARDOWN}
# Download the SILVA "gold" chimera-reference alignment.
raw/ref/Silva.gold.bacteria.zip:
	wget --directory-prefix=${@D} http://www.mothur.org/w/images/f/f1/Silva.gold.bacteria.zip
# NOTE(review): `unzip` extracts into the current working directory, so
# this assumes silva.gold.align lands in $PWD before the mv; the trailing
# touch ensures $@ is newer than the zip even if mv preserves mtime.
ref/silva.gold.afn: raw/ref/Silva.gold.bacteria.zip
	unzip $^
	mv silva.gold.align $@
	@touch $@
# Download the SILVA v119 seed reference set.
# BUGFIX: the recipe previously fetched Silva.seed_v123.tgz, so the v119
# target file was never created (wget would deposit a v123-named file and
# the rule would "succeed" without producing $@), and the extraction
# rules below -- which pull v119-named members -- could never work.
# TODO(review): verify the /w/images/ hash path for the v119 tarball on
# the mothur wiki; the old path (1/15) belonged to the v123 release.
raw/ref/Silva.seed_v119.tgz:
	wget --directory-prefix=${@D} http://www.mothur.org/w/images/5/56/Silva.seed_v119.tgz
ref/silva.seed.afn: raw/ref/Silva.seed_v119.tgz
	$(call extract_from_tarball,$^,silva.seed_v119.align)
ref/silva.seed.tax: raw/ref/Silva.seed_v119.tgz
	$(call extract_from_tarball,$^,silva.seed_v119.tax)
# Download the RDP training set (trainset14, pds variant) used for
# classify.seqs below.
raw/ref/Trainset14_032015.pds.tgz:
	wget --directory-prefix=${@D} http://www.mothur.org/w/images/8/88/Trainset14_032015.pds.tgz
ref/rdp.nr.afn: raw/ref/Trainset14_032015.pds.tgz
	$(call extract_from_tarball,$^,trainset14_032015.pds/trainset14_032015.pds.fasta)
ref/rdp.nr.tax: raw/ref/Trainset14_032015.pds.tgz
	$(call extract_from_tarball,$^,trainset14_032015.pds/trainset14_032015.pds.tax)
# Metadata {{{3
# http://www.mothur.org/wiki/MiSeq_SOP#Getting_started
# Generate a mothur ".files" sample-to-fastq mapping, either for the whole
# run (meta/rrs.files) or for one analysis group (meta/<group>.rrs.files);
# the stem $* is passed to the script as its final argument.
# NOTE(review): this rule mixes a normal target with a pattern target,
# which newer GNU make rejects ("mixed implicit and normal rules") --
# confirm the make version in use tolerates it.
meta/rrs.files meta/%.rrs.files: scripts/generate_files_file.py \
		etc/rrs/library.tsv etc/rrs/analysis_group.tsv
	$^ $* > $@
# Dynamic `include`ing and generation of *.contigs.mk {{{3
# For every unique library.series, generate a makefile in build/ which
# declares the per-group contig prerequisites.
# Analysis groups = unique values of column 2 (header row dropped).
ALL_LIBRARY_ANALYSIS_GROUPS := \
	$(shell sed '1,1d' etc/rrs/analysis_group.tsv | cut -f2 | sort | uniq)
ALL_CONTIGS_MAKEFILES := \
	$(patsubst %,${BUILD_DIR}/%.rrs.contigs.mk,${ALL_LIBRARY_ANALYSIS_GROUPS}) \
	${BUILD_DIR}/rrs.contigs.mk
# NOTE(review): ALL_LIBRARY_SERIES is never referenced in this file --
# presumably consumed by base.mk or the generated makefiles; verify.
ALL_LIBRARY_SERIES += all
# Only include the generated makefiles after project initialization
# (INITIALIZED is presumably set by base.mk -- TODO confirm).
ifeq (${INITIALIZED},True)
$(foreach makefile,${ALL_CONTIGS_MAKEFILES},$(eval include ${makefile}))
endif
# Generate one contigs makefile per group from the matching .files file.
${BUILD_DIR}/%.contigs.mk: scripts/generate_contigs_makefile.py meta/%.files
	$^ $* > $@
# Guard rule: raw fastq.gz files come from the data tarball, never from
# a recipe -- this exists only to give make something to match.
raw/seq/%.fastq.gz: raw/seq/fake_root.tgz
	echo "This does not make $@. If you actually run this you are doing something wrong."
# Make contigs from paired-end reads
# Preqs defined in each build/%.contigs.mk
# (Pattern rule with multiple targets: GNU make runs the recipe once to
# produce all three -- safe under -j.)  The paired fastq.gz prerequisites
# are decompressed into ${BUILD_DIR} so mothur's outputs land there too.
seq/split/%.rrs.contigs.fn seq/split/%.rrs.contigs.qual res/split/%.rrs.contigs.report:
	gzip -dc $(word 1,$^) > ${BUILD_DIR}/$*.r1.fastq
	gzip -dc $(word 2,$^) > ${BUILD_DIR}/$*.r2.fastq
	${MOTHUR} "#make.contigs(ffastq=${BUILD_DIR}/$*.r1.fastq, \
		rfastq=${BUILD_DIR}/$*.r2.fastq, \
		trimoverlap=T)" ${MOTHUR_STDOUT}
	@mv ${BUILD_DIR}/$*.r1.trim.contigs.fasta seq/split/$*.rrs.contigs.fn
	@mv ${BUILD_DIR}/$*.r1.trim.contigs.qual seq/split/$*.rrs.contigs.qual
	@mv ${BUILD_DIR}/$*.r1.contigs.report res/split/$*.rrs.contigs.report
	${MOTHUR_TEARDOWN}
# Concatenate individual contig files
# Preqs defined in each build/%.contigs.mk
seq/%.contigs.fn:
	cat $^ > $@
# Define groups based on names of individual contig files
# Preqs defined in each build/%.contigs.mk
# (The first prerequisite is presumably a script that emits the groups
# table; $^ runs it with the remaining preqs as arguments -- confirm
# against the generated makefiles.)
meta/%.contigs.groups:
	$^ > $@
# Align, classify, screen and cluster seqs {{{3
# Screen 1: remove sequences which are the wrong length or have too many
# ambiguous nucleotides
# TODO: Screen params?
# TODO: bug report the problem with dashes in file names for some commands.
# screen.seqs drops reads with any ambiguous base (maxambig=0) or longer
# than 295 bp, and writes matching *.good.* files into ${BUILD_DIR}.
seq/%.screen1.fn meta/%.screen1.groups: seq/%.fn meta/%.groups
	${MOTHUR_SETUP}
	${MOTHUR} "#screen.seqs(fasta=$(word 1,${LN_PREQS}), group=$(word 2,${LN_PREQS}), \
		maxambig=0, maxlength=295, processors=${MAX_PROCS})" ${MOTHUR_STDOUT}
	@mv ${BUILD_DIR}/$*.good.fn seq/$*.screen1.fn
	@mv ${BUILD_DIR}/$*.good.groups meta/$*.screen1.groups
	${MOTHUR_TEARDOWN}
# Uniq-ify seqs {{{3
# Two parallel rules: one for unaligned (.fn) and one for aligned (.afn)
# fastas.  Both emit a .names file mapping each unique sequence to the
# redundant reads it represents.  (Both rules target meta/%.uniq.names;
# make picks whichever pattern's fasta prerequisite exists.)
seq/%.uniq.fn meta/%.uniq.names: seq/%.fn
	${MOTHUR_SETUP}
	${MOTHUR} "#unique.seqs(fasta=${LN_PREQS})" ${MOTHUR_STDOUT}
	@mv ${BUILD_DIR}/$*.unique.fn seq/$*.uniq.fn
	@mv ${BUILD_DIR}/$*.names meta/$*.uniq.names
	${MOTHUR_TEARDOWN}
seq/%.uniq.afn meta/%.uniq.names: seq/%.afn
	${MOTHUR_SETUP}
	${MOTHUR} "#unique.seqs(fasta=${LN_PREQS})" ${MOTHUR_STDOUT}
	@mv ${BUILD_DIR}/$*.unique.afn seq/$*.uniq.afn
	@mv ${BUILD_DIR}/$*.names meta/$*.uniq.names
	${MOTHUR_TEARDOWN}
# Collapse the .names + .groups pair into a single count_table
# (per-group counts for each unique sequence), used by all later steps.
meta/%.contigs.screen1.uniq.count_table: meta/%.contigs.screen1.uniq.names meta/%.contigs.screen1.groups
	${MOTHUR_SETUP}
	${MOTHUR} "#count.seqs(name=$(word 1,${LN_PREQS}), \
		group=$(word 2,${LN_PREQS}), processors=${MAX_PROCS})" ${MOTHUR_STDOUT}
	@mv ${BUILD_DIR}/$*.contigs.screen1.uniq.count_table meta/$*.contigs.screen1.uniq.count_table
	${MOTHUR_TEARDOWN}
# Alignment {{{3
# Align query sequences against the V4-trimmed SILVA reference built above.
seq/%.align.afn res/%.align.report: seq/%.fn ref/silva.nr.pcr-v4.afn
	${MOTHUR_SETUP}
	${MOTHUR} "#align.seqs(fasta=$(word 1,${LN_PREQS}), \
		reference=$(word 2,${LN_PREQS}), processors=${MAX_PROCS})" ${MOTHUR_STDOUT}
	@mv ${BUILD_DIR}/$*.align seq/$*.align.afn
	@mv ${BUILD_DIR}/$*.align.report res/$*.align.report
	${MOTHUR_TEARDOWN}
# Sequence cleanup and screening {{{3
# Screen 2: keep only sequences spanning alignment columns 3100-10600
# with homopolymer runs of at most 8.
seq/%.align.screen2.afn meta/%.align.screen2.count_table: seq/%.align.afn meta/%.count_table
	${MOTHUR_SETUP}
	${MOTHUR} "#screen.seqs(fasta=$(word 1,${LN_PREQS}), start=3100, \
		count=$(word 2,${LN_PREQS}), end=10600, maxhomop=8, \
		processors=${MAX_PROCS})" ${MOTHUR_STDOUT}
	@mv ${BUILD_DIR}/$*.align.good.afn seq/$*.align.screen2.afn
	@mv ${BUILD_DIR}/$*.good.count_table meta/$*.align.screen2.count_table
	${MOTHUR_TEARDOWN}
# Drop all-gap columns (vertical=T) and columns containing a '.'
# (trump=.) from the alignment.
seq/%.trim.afn: seq/%.afn
	${MOTHUR_SETUP}
	${MOTHUR} "#filter.seqs(fasta=$(word 1,${LN_PREQS}), vertical=T, trump=., \
		processors=${MAX_PROCS})" ${MOTHUR_STDOUT}
	@mv ${BUILD_DIR}/$*.filter.fasta seq/$*.trim.afn
	${MOTHUR_TEARDOWN}
# Re-deduplicate after trimming (trimming can make previously distinct
# sequences identical), updating the count_table in the same pass.
seq/%.trim.uniq.afn meta/%.trim.uniq.count_table: seq/%.trim.afn meta/%.count_table
	${MOTHUR_SETUP}
	${MOTHUR} "#unique.seqs(fasta=$(word 1,${LN_PREQS}), \
		count=$(word 2,${LN_PREQS}))" ${MOTHUR_STDOUT}
	@mv ${BUILD_DIR}/$*.trim.unique.afn seq/$*.trim.uniq.afn
	@mv ${BUILD_DIR}/$*.trim.count_table meta/$*.trim.uniq.count_table
	${MOTHUR_TEARDOWN}
# TODO: Why doesn't the .name and .groups file combination work?
# I get told that there are names in the .names file which are not in the
# fasta, but those names are sub-names of uniq seqs, so that's what I expect...?
# pre.cluster merges sequences within NUM_PRECLUST_DIFFS mismatches of a
# more abundant sequence (denoising step).
seq/%.preclust.afn meta/%.preclust.count_table: \
		seq/%.afn \
		meta/%.count_table
	${MOTHUR_SETUP}
	${MOTHUR} "#pre.cluster(fasta=$(word 1,${LN_PREQS}), \
		count=$(word 2,${LN_PREQS}), diffs=${NUM_PRECLUST_DIFFS}, \
		processors=${MAX_PROCS})" ${MOTHUR_STDOUT}
	@mv ${BUILD_DIR}/$*.precluster.afn seq/$*.preclust.afn
	@mv ${BUILD_DIR}/$*.precluster.count_table meta/$*.preclust.count_table
	${MOTHUR_TEARDOWN}
# Chimera screening {{{3
# Screen 3: flag chimeras de novo with uchime, then remove the flagged
# sequences (listed in the .accnos file) from the fasta.  The count_table
# is already chimera-pruned by uchime itself (dereplicate=T).
seq/%.screen3.afn res/%.screen3.chimeras meta/%.screen3.count_table: seq/%.afn meta/%.count_table
	${MOTHUR_SETUP}
	${MOTHUR} "#chimera.uchime(fasta=$(word 1,${LN_PREQS}), \
		count=$(word 2,${LN_PREQS}), \
		dereplicate=T, processors=${MAX_PROCS})" ${MOTHUR_STDOUT}
	@mv ${BUILD_DIR}/$*.denovo.uchime.chimeras res/$*.screen3.chimeras
	${MOTHUR} "#remove.seqs(fasta=${BUILD_DIR}/$*.afn, \
		accnos=${BUILD_DIR}/$*.denovo.uchime.accnos)" ${MOTHUR_STDOUT}
	@mv ${BUILD_DIR}/$*.pick.afn seq/$*.screen3.afn
	@mv ${BUILD_DIR}/$*.denovo.uchime.pick.count_table meta/$*.screen3.count_table
	${MOTHUR_TEARDOWN}
# Taxonomic classification {{{3
# Classify sequences against the RDP training set (wang/naive-Bayes is
# mothur's default method; cutoff=80 is the bootstrap confidence floor).
res/%.tax res/%.tax-summary: seq/%.afn ref/rdp.nr.afn ref/rdp.nr.tax meta/%.count_table
	${MOTHUR_SETUP}
	${MOTHUR} "#classify.seqs(fasta=$(word 1,${LN_PREQS}), \
		count=$(word 4, ${LN_PREQS}), \
		reference=$(word 2,${LN_PREQS}), \
		taxonomy=$(word 3,${LN_PREQS}), \
		cutoff=80, processors=${MAX_PROCS})" ${MOTHUR_STDOUT}
	@mv ${BUILD_DIR}/$*.nr.wang.taxonomy res/$*.tax
	@mv ${BUILD_DIR}/$*.nr.wang.tax.summary res/$*.tax-summary
	${MOTHUR_TEARDOWN}
# Taxonomic screening {{{3
# Screen out Chloroplasts, Mitochondria, Archaea, Eukaryota
# Screen 4: remove.lineage drops those lineages (plus "Unknown") from the
# fasta, taxonomy, and count_table together.
seq/%.screen4.afn res/%.screen4.tax meta/%.screen4.count_table: seq/%.afn res/%.tax meta/%.count_table
	${MOTHUR_SETUP}
	${MOTHUR} "#remove.lineage(fasta=$(word 1,${LN_PREQS}), \
		count=$(word 3,${LN_PREQS}), \
		taxonomy=$(word 2,${LN_PREQS}), \
		taxon=Chloroplast-Mitochondria-Archaea-Eukaryota-Unknown)" ${MOTHUR_STDOUT}
	@mv ${BUILD_DIR}/$*.pick.afn seq/$*.screen4.afn
	@mv ${BUILD_DIR}/$*.pick.tax res/$*.screen4.tax
	@mv ${BUILD_DIR}/$*.pick.count_table meta/$*.screen4.count_table
	${MOTHUR_TEARDOWN}
# OTU tallying {{{3
# TODO: .names dependency instead of .count_table?
# This'll increase the size of the .otus file, but it will also mean it includes
# all of the names and not just the names associated with the counts.
# Maybe useful for representative sequence pulling?
# cluster.split partitions sequences by taxonomy at SPLIT_TAXLEVEL and
# clusters each partition independently (distances capped at DIST_CUTOFF).
# CLUST_METHOD_ABBRV must match CLUST_METHOD for the mv to find the output.
res/%.otus: seq/%.afn meta/%.count_table res/%.tax
	${MOTHUR_SETUP}
	${MOTHUR} "#cluster.split(fasta=$(word 1,${LN_PREQS}), \
		count=$(word 2,${LN_PREQS}), taxonomy=$(word 3,${LN_PREQS}), \
		splitmethod=classify, taxlevel=${SPLIT_TAXLEVEL}, cutoff=${DIST_CUTOFF}, \
		processors=${MAX_PROCS}, method=${CLUST_METHOD})" ${MOTHUR_STDOUT}
	@mv ${BUILD_DIR}/$*.${CLUST_METHOD_ABBRV}.unique_list.list res/$*.otus
	${MOTHUR_TEARDOWN}
# Build the OTU-by-sample abundance table at the chosen OTU cutoff.
# CONSISTENCY FIX: appended ${MOTHUR_STDOUT} -- every other mothur recipe
# in this file redirects through it, so enabling the ">/dev/null" option
# previously still left this rule noisy.
res/%.otus.shared: res/%.otus meta/%.count_table
	${MOTHUR_SETUP}
	${MOTHUR} "#make.shared(list=$(word 1,${LN_PREQS}), count=$(word 2,${LN_PREQS}), label=${OTU_CUTOFF})" ${MOTHUR_STDOUT}
	@mv ${BUILD_DIR}/$*.shared $@
	${MOTHUR_TEARDOWN}
# Assign a consensus taxonomy to each OTU at the chosen cutoff.
# CONSISTENCY FIX: appended ${MOTHUR_STDOUT} to match the redirect used
# by every other mothur invocation in this file.
res/%.otus.tax: res/%.otus meta/%.count_table res/%.tax
	${MOTHUR_SETUP}
	${MOTHUR} "#classify.otu(list=$(word 1,${LN_PREQS}), count=$(word 2,${LN_PREQS}), \
		taxonomy=$(word 3,${LN_PREQS}), label=${OTU_CUTOFF})" ${MOTHUR_STDOUT}
	@mv ${BUILD_DIR}/$*.${OTU_CUTOFF}.cons.taxonomy res/$*.otus.tax
	${MOTHUR_TEARDOWN}
# Pull one representative sequence per OTU via a project script
# (args: otus-file, cutoff, count_table, fasta).
seq/%.screen3.screen4.otus.reps.afn: scripts/get_otu_reps.py res/%.screen3.screen4.otus \
		meta/%.screen3.screen4.count_table seq/%.afn
	$(word 1,$^) $(word 2,$^) ${OTU_CUTOFF} $(word 3,$^) $(word 4,$^) > $@
# Rename fully processed files. {{{3
# Short "procd" aliases for the final outputs of the full pipeline chain.
res/%.procd.otus.shared: res/%.contigs.screen1.uniq.align.screen2.trim.uniq.preclust.screen3.screen4.otus.shared
	cp $< $@
res/%.procd.otus.tax: res/%.contigs.screen1.uniq.align.screen2.trim.uniq.preclust.screen3.screen4.otus.tax
	cp $< $@
seq/%.procd.otus.reps.afn: seq/%.contigs.screen1.uniq.align.screen2.trim.uniq.preclust.screen3.screen4.otus.reps.afn
	cp $< $@
# Unstack the .shared matrix into a long-format community table.
# (P1/P2 are not defined in this file -- presumably word 1 / word 2 of $^
# set in base.mk; confirm.)
res/%.community.tsv: scripts/unstack_shared.py res/%.shared
	${P1} < ${P2} > $@
# [Partially redacted]
# =======================
# Analysis {{{1
# =======================
# User defined recipes for analyzing the data.
# e.g. Calculating means, distributions, correlations, fitting models, etc.
# Basically anything that *could* go into the paper as a table.
# [Partially redacted]
# QC {{{3
# TODO
# LEFSE {{{2
# TODO: Fix this up for the new metadata format
# Build the lefse design file (sample -> class/subclass) from the
# project metadata tables via a project script.
meta/rrs.design: scripts/generate_lefse_design.py etc/rrs/library.tsv etc/extraction.tsv etc/sample.tsv etc/mouse.tsv
	$^ > $@
# Run mothur's lefse (differential-abundance) on the processed OTU table,
# comparing by treatment with site as subclass.
# CONSISTENCY FIX: appended ${MOTHUR_STDOUT} to match the redirect used
# by every other mothur invocation in this file.
res/%.procd.otus.lefse: res/%.procd.otus.shared meta/%.design
	${MOTHUR_SETUP}
	${MOTHUR} "#lefse(shared=$(word 1,${LN_PREQS}), \
		design=$(word 2,${LN_PREQS}), \
		class=treatment, subclass=site)" ${MOTHUR_STDOUT}
	@mv ${BUILD_DIR}/$*.procd.otus.${OTU_CUTOFF}.lefse_summary $@
	${MOTHUR_TEARDOWN}
# Jupyter Notebooks {{{2 | |
# Add pre-requisites for particular jupyter notebooks so that | |
# `make build/<notebook>.ipynb` will first make those pre-reqs. | |
# e.g. | |
# build/notebook.ipynb: res/results.tsv | |
# ================== | |
# Graphing {{{1 | |
# ================== | |
# User defined recipes for plotting figures. These should use | |
# the targets of analysis recipes above as their prerequisites. | |
# [Partially redacted] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment