jnhutchinson · December 15, 2015 12:49
diff --git a/STEM.r b/STEM.r
 ## Rory's kickass annotated dataframe function
 annotate_df = function(d) {
  require(biomaRt)
  ensembl = useMart('ensembl', dataset = ensembl_gene)
 	a = getBM(attributes=c(filter_type, gene_symbol, "description"),
 		filters=c(filter_type), values=d[, 'id'],
 		mart=ensembl)
 	m = merge(d, a, by.x='id', by.y=filter_type)
 	return(m)
 }

 # the deseq counts 
 norm.counts = counts(cds, normalized=TRUE)
 filt.norm.counts=norm.counts[use,] ## "use" are the genes with rowcounts above cutoff, i.e. expressed genes

 counts.for.stem <- as.data.frame(filt.norm.counts)
 counts.for.stem <- counts.for.stem[,c(3:1,18:16,12:4,21:19,15:13)] # rearrange time series columns to be by time instead of alphabetical 
 counts.for.stem.t <- t(counts.for.stem) # transform for the aggregation step
 agg.vec <- sub("rep.", "", row.names(counts.for.stem.t)) # generate vector with time series replicates grouped by time 
 med.counts.for.stem.t <- aggregate(filt.norm.counts.t, by=list(agg.vec), function(n) median(n)) # median aggregate replicates
 row.names(med.counts.for.stem.t) <- med.counts.for.stem.t$Group.1 # put row.names back as times
 med.counts.for.stem.t <- med.counts.for.stem.t[,-(grep("Group.1", names(med.counts.for.stem.t)))] # dump the grouping column
 med.counts.for.stem <- as.data.frame(t(med.counts.for.stem.t)) # transform back to normal form now that you're done aggregating
 names(med.counts.for.stem) <- sub("Normal", 0, sub("FA", "", sub("day", "", names(med.counts.for.stem)))) # reduce the column names to just the numbers
 med.counts.for.stem <- med.counts.for.stem[,order(as.numeric(names(med.counts.for.stem)))] # reorder columns by number (not sure I needed to do this) 
 med.counts.for.stem$id <- row.names(med.counts.for.stem) # make Ensembl id column before annotating dataframe 
 med.counts.for.stem <- annotate_df(med.counts.for.stem) # annotate dataframe with Rory's function
 med.counts.for.stem <- med.counts.for.stem[,c(grep("mgi", names(med.counts.for.stem)), grep("[0-9]", names(med.counts.for.stem)))] # pull out the column you need, mgi_symbol and numerical time point columns
 write.table(med.counts.for.stem, file.path(resultsDir, "normalized.median.counts.genes.forstem.tab"), quote=F, sep="\t", col.names=T, row.names=F) # write out table
	## Rory's kickass annotated dataframe function
	annotate_df = function(d) {
	require(biomaRt)
	ensembl = useMart('ensembl', dataset = ensembl_gene)
	a = getBM(attributes=c(filter_type, gene_symbol, "description"),
	filters=c(filter_type), values=d[, 'id'],
	mart=ensembl)
	m = merge(d, a, by.x='id', by.y=filter_type)
	return(m)
	}

	# the deseq counts
	norm.counts = counts(cds, normalized=TRUE)
	filt.norm.counts=norm.counts[use,] ## "use" are the genes with rowcounts above cutoff, i.e. expressed genes

	counts.for.stem <- as.data.frame(filt.norm.counts)
	counts.for.stem <- counts.for.stem[,c(3:1,18:16,12:4,21:19,15:13)] # rearrange time series columns to be by time instead of alphabetical
	counts.for.stem.t <- t(counts.for.stem) # transform for the aggregation step
	agg.vec <- sub("rep.", "", row.names(counts.for.stem.t)) # generate vector with time series replicates grouped by time
	med.counts.for.stem.t <- aggregate(filt.norm.counts.t, by=list(agg.vec), function(n) median(n)) # median aggregate replicates
	row.names(med.counts.for.stem.t) <- med.counts.for.stem.t$Group.1 # put row.names back as times
	med.counts.for.stem.t <- med.counts.for.stem.t[,-(grep("Group.1", names(med.counts.for.stem.t)))] # dump the grouping column
	med.counts.for.stem <- as.data.frame(t(med.counts.for.stem.t)) # transform back to normal form now that you're done aggregating
	names(med.counts.for.stem) <- sub("Normal", 0, sub("FA", "", sub("day", "", names(med.counts.for.stem)))) # reduce the column names to just the numbers
	med.counts.for.stem <- med.counts.for.stem[,order(as.numeric(names(med.counts.for.stem)))] # reorder columns by number (not sure I needed to do this)
	med.counts.for.stem$id <- row.names(med.counts.for.stem) # make Ensembl id column before annotating dataframe
	med.counts.for.stem <- annotate_df(med.counts.for.stem) # annotate dataframe with Rory's function
	med.counts.for.stem <- med.counts.for.stem[,c(grep("mgi", names(med.counts.for.stem)), grep("[0-9]", names(med.counts.for.stem)))] # pull out the column you need, mgi_symbol and numerical time point columns
	write.table(med.counts.for.stem, file.path(resultsDir, "normalized.median.counts.genes.forstem.tab"), quote=F, sep="\t", col.names=T, row.names=F) # write out table