Last active
May 25, 2021 20:03
-
-
Save nievergeltlab/5cbaa65b550b0b44f48b7a42126d1e26 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
for geno in $(ls genotypes | grep gz ) | |
do | |
echo running $geno | |
date | |
Rscript 00_plinkR_v2.txt dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr15_093_103.out.dosage.fam \ | |
1_MRSC_C_V2_2.5_PCL4.txt \ | |
pts_mrsc_mix_am-qc-eur_pca.menv.mds_cov \ | |
$geno \ | |
"genotypes" \ | |
"outputs" \ | |
"pcl4_c_12_future" | |
done | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
args <- commandArgs(trailingOnly = TRUE) | |
famfile <- args[1] | |
phenofile <- args[2] | |
covfile <- args[3] | |
genofile <- args[4] | |
genofile_path <- args[5] | |
outfile <- args[6] | |
phenoname <- args[7] | |
#Right now: assuming covariates C1-C5 are used, pheno is coded such that minimum score is 0 | |
#Model will give false positives for monomorphic markers, some filtering must be applied! | |
#Load libraries | |
library(data.table) | |
library(MASS) | |
library(fastglm) | |
famfile='dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr15_093_103.out.dosage.fam' | |
phenofile='1_MRSC_C_V2_2.5_PCL4.txt' | |
covfile='pts_mrsc_mix_am-qc-eur_pca.menv.mds_cov' | |
genofile='dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr22_016_039.out.dosage.doscnt.gz' | |
genofile='dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr22_016_039.out.dosage.doscnt.gz' | |
genofile_path='genotypes' | |
phenoname='pcl4_c_12_future' | |
outfile='dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr22_016_039.out.dosage.doscnt.gz.results.txt' | |
outfile='dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr22_039_052.out.dosage.doscnt.gz.results.txt' | |
# dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr22_016_039.out.dosage.doscnt.gz | |
# dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr22_039_052.out.dosage.doscnt.gz | |
#Load famfile | |
fam <- read.table(famfile,header=F,na.strings=c("NA","-9")) #PLINK phenotype | |
names(fam) <- c("FID","IID","F","M","G","P") | |
#Make a variable to index subjects, because this data will be merged with phenotype data and cause data to be resorted | |
fam$order <- c(1:nrow(fam)) | |
#Load phenotypes. | |
pheno <- read.table(phenofile,header=T,na.strings=c("NA","-9")) #PLINK phenotype | |
#If the phenotype ranges from 0-4, you MUST make sure the mininum value is 0! | |
#Load covariates | |
covar <- read.table(covfile,header=T,na.strings=c("NA","-9")) #PLINK phenotype | |
covar$intercept <- 1 # It is necessary to add an intercept | |
#Notice that all.x=T, because the family file length must be preserved! | |
d1 <- merge(fam,pheno,by=c("FID","IID"),all.x=T,suffixes=c("_fam","")) | |
dm0 <- merge(d1,covar,by=c("FID","IID"),all.x=T,suffixes=c("_cov","")) | |
dm <- dm0[order(dm0$order),] #This will order the data according to the geotypes | |
dm$pheno <- dm[,phenoname] -1 #Make generic column 'pheno' that takes the data of the phenotype | |
#have -1 for now because of my crummy coding | |
#Load (stream?) genotypes. Currently a CSV file where columns 1,2,3 are rsID, A1, and A2. Map file is therefore optional. Possibly just stack into this file intiially!! | |
genotypes <- fread(paste('zcat ', genofile_path,'/', genofile,sep=''),data.table=F,sep=",")[-1,] #remove row 1: that is just going to be SNP A1 A2... columns | |
#Remove genotypes and data with missing phenotype or PCs | |
remove <- is.na(dm$pcl4_c_12_future) | is.na(dm$C1) | |
dm2 <- dm[!remove,] | |
genotypes_rs <- genotypes[,c(1:3)] #SNP A1 A2 columns | |
genotypes <- as.matrix(genotypes[,-c(1:3)][,!remove]) #note, because the indexing for 'remove' is based on the famfile, which has no extra columns, must remove columns 1-3 in the genotype matrix to align indexing | |
row.names(genotypes) <- genotypes_rs[,1] | |
#Make covariate matrix | |
covmat <- model.matrix(~C1+C2+C3+C4+C5,data=dm2) | |
#Get estimate for theta assuming null genetic effect | |
theta_baseline <- summary(glm.nb(pheno ~ C1 + C2 + C3 +C4+C5,data=dm2))$theta | |
#apply function right now, could also do some splitting based approach | |
nbglm <- function(genovector,datamatrix,theta) | |
{ | |
#it's faster to save the variable and concatenate the relevant stuff than to use SUMMARY | |
m <- fastglm(cbind(genovector,datamatrix),dm2$pheno, ,family=negative.binomial(theta),method=3) | |
#Contrast with a gaussian... | |
#m <- fastglm(cbind(genovector,datamatrix),dm2$pheno, ,data=dm,family=gaussian()) | |
# print(summary(m)) | |
c(m$coefficients,m$se) | |
#if you just save c(m$coefficients[1],m$se[1]) you ignore the covariates but makes indexing easy. if you do this , set zvalues <- results[,1]/results[,2] | |
# summary((fastglm(cbind(genovector,datamatrix),dm2$pheno, ,data=dm,family=negative.binomial(theta),method=3)))$coefficients | |
} | |
#everycolumn is a SNP. Results are reported as estimate, SE, tvalue, and pr>t.Currently genovector, | |
results <- t(apply(genotypes,1,nbglm,datamatrix=covmat,theta=theta_baseline)) | |
zvalues <- results[,1]/results[,8] #this indexing is precariuous and needs fixing | |
pvalues <- 2*pnorm(abs(zvalues),lower.tail=F) | |
total_results <- cbind(genotypes_rs,results[,c(1,8)],zvalues,pvalues) | |
names(total_results) <- c("SNP","A1","A2","Beta","SE","Z","P") | |
write.table(total_results,file=paste(outfile,'/',phenoname,'.',genofile,'.txt',sep=''),quote=F,row.names=F) | |
#cat outputs/*.txt | awk '{if (NR==1 || $1 != "SNP") print}' > adam.results.total.txt | |
# library(data.table) | |
# total_results <- fread('adam.results.total.txt',data.table=F) | |
# liz_snps <- fread('zcat liz_mrsc.pcl4_c_12_future.assoc.linear_fixed.gz',data.table=F) | |
# data <- total_results[which(total_results$SNP %in% liz_snps$ID) ,] | |
# data <- liz_snps[which( liz_snps$ID %in% total_results$SNP ) ,] | |
# outfilename=paste(outfile,'.qq.png',sep='') | |
# outfilename="chr22_liz.png" | |
# unadj_filtered <- sort(data[,"P"]) | |
# UNADJ <- -log(unadj_filtered,10) | |
# QQ <- -log(ppoints(length(UNADJ)),10) | |
# GCfactor= round(median(qchisq(data[,"P"],1,lower.tail=F),na.rm=T)/.455,3) | |
# png(paste(outfilename,'.png', sep='')) | |
# par(bty='l') | |
# plot(c(0,max(QQ)), c(0,max(UNADJ)+ 1), xlab='Expected -log10(p)', ylab='Observed -log10(p)', col='white', cex=1.3, cex.axis=1.2, cex.lab=1.5,pch=20) | |
# if(errorbars == 1) | |
# { | |
# #code for error bars | |
# ranks <- c(1:length(QQ)) | |
# CIlower <- qbeta(.025, ranks, length(QQ)-ranks +1) | |
# CIupper <- qbeta(.975, ranks, length(QQ)-ranks +1) | |
# plotCIlower <- -log(CIlower,10) | |
# plotCIupper <- -log(CIupper,10) | |
# segments(x0=QQ,x1=QQ, y0=plotCIlower,y1=plotCIupper,lwd=2,col='grey') | |
# } | |
# abline(0,1,col='red', lwd=2) | |
# points(QQ, UNADJ, ,pch=20,col='blue') | |
# legend('topleft', paste('GC Lambda =', GCfactor), bty='n', cex=1.5, xjust=1) | |
# dev.off() | |
# # Error checking | |
# which(pvalues==0) | |
# #odd things going on which.min(pvalues) | |
# summary(glm.nb(pheno ~ genotypes[503,] + C1 + C2 + C3 +C4+C5,data=dm2)) #won't even run under this model | |
# summary(fastglm(cbind(genotypes[503,],covmat),dm2$pheno, family=negative.binomial(theta_baseline),method=3)) | |
# summary(glm.nb(pheno ~ genotypes[2,] + C1 + C2 + C3 +C4+C5,data=dm2)) | |
# which(total_results$P < 0.00001) | |
# fastglm(cbind(genovector,datamatrix),dm2$pheno, ,data=dm,family=negative.binomial(theta),method=3) | |
# total_results[which(total_results$P < 5e-8 & total_results$P > 8.312274e-83),] | |
# data[which(data$P < 5e-7),] | |
# which(genotypes_rs[,1]== "rs190121825") | |
# which(genotypes_rs[,1]== "rs16991916") | |
# summary(fastglm(cbind(genotypes[84565,],covmat),dm2$pheno, family=negative.binomial(theta_baseline),method=3)) | |
# summary(glm.nb(pheno ~ genotypes[84565,] + C1 + C2 + C3 +C4+C5,data=dm2)) | |
# #Convergence isn't sufficient, fastglm tends to converge where glm does not! | |
# 84565 | |
# liz | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment