nievergeltlab · May 25, 2021 20:03
diff --git a/00_launch_plink.sh b/00_launch_plink.sh
   for geno in $(ls genotypes | grep gz )
    do 
    echo running $geno
    date
    Rscript 00_plinkR_v2.txt  dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr15_093_103.out.dosage.fam \
                             1_MRSC_C_V2_2.5_PCL4.txt \
                             pts_mrsc_mix_am-qc-eur_pca.menv.mds_cov \
                             $geno \
                             "genotypes" \
                             "outputs" \
                             "pcl4_c_12_future"
                             done
                             
diff --git a/00_plinkR_v2.txt b/00_plinkR_v2.txt
 args <- commandArgs(trailingOnly = TRUE)
 famfile <- args[1]
 phenofile <- args[2]
 covfile <- args[3]
 genofile <- args[4]
 genofile_path <- args[5]
 outfile <- args[6]
 phenoname <- args[7]

 #Right now: assuming covariates C1-C5 are used, pheno is coded such that minimum score is 0
 #Model will give false positives for monomorphic markers, some filtering must be applied!

 #Load libraries
  library(data.table)
  library(MASS)
  library(fastglm)
  
  

  
  famfile='dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr15_093_103.out.dosage.fam'
  phenofile='1_MRSC_C_V2_2.5_PCL4.txt'  
  covfile='pts_mrsc_mix_am-qc-eur_pca.menv.mds_cov' 
  genofile='dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr22_016_039.out.dosage.doscnt.gz'
  genofile='dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr22_016_039.out.dosage.doscnt.gz'
  genofile_path='genotypes'
  phenoname='pcl4_c_12_future'
  outfile='dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr22_016_039.out.dosage.doscnt.gz.results.txt'
  outfile='dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr22_039_052.out.dosage.doscnt.gz.results.txt'
    
 # dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr22_016_039.out.dosage.doscnt.gz
 # dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr22_039_052.out.dosage.doscnt.gz
  
 #Load famfile
  fam <- read.table(famfile,header=F,na.strings=c("NA","-9")) #PLINK phenotype

  names(fam) <- c("FID","IID","F","M","G","P")
 #Make a variable to index subjects, because this data will be merged with phenotype data and cause data to be resorted
  fam$order <- c(1:nrow(fam))

 #Load phenotypes.
  pheno <- read.table(phenofile,header=T,na.strings=c("NA","-9")) #PLINK phenotype
  #If the phenotype ranges from 0-4, you MUST make sure the mininum value is 0!
  
 #Load covariates
  covar <- read.table(covfile,header=T,na.strings=c("NA","-9")) #PLINK phenotype
   covar$intercept <- 1 # It is necessary to add an intercept
   
 #Notice that all.x=T, because the family file length must be preserved!
  d1 <- merge(fam,pheno,by=c("FID","IID"),all.x=T,suffixes=c("_fam",""))
  dm0 <- merge(d1,covar,by=c("FID","IID"),all.x=T,suffixes=c("_cov",""))
  dm <- dm0[order(dm0$order),] #This will order the data according to the geotypes
  dm$pheno <- dm[,phenoname] -1  #Make generic column 'pheno' that takes the data of the phenotype
    #have -1 for now because of my crummy coding
    
 #Load (stream?) genotypes. Currently a CSV file where columns 1,2,3 are rsID, A1, and A2. Map file is therefore optional. Possibly just stack into this file intiially!!
  genotypes <- fread(paste('zcat ', genofile_path,'/', genofile,sep=''),data.table=F,sep=",")[-1,] #remove row 1: that is just going to be SNP A1 A2... columns
  
 #Remove genotypes and data with missing phenotype or PCs
  remove <- is.na(dm$pcl4_c_12_future) | is.na(dm$C1)
  dm2 <- dm[!remove,]
  
  genotypes_rs <- genotypes[,c(1:3)] #SNP A1 A2 columns
  genotypes <-  as.matrix(genotypes[,-c(1:3)][,!remove]) #note, because the indexing for 'remove' is based on the famfile, which has no extra columns, must remove columns 1-3 in the genotype matrix to align indexing
  row.names(genotypes) <- genotypes_rs[,1]
 
 
 #Make covariate matrix
  covmat <- model.matrix(~C1+C2+C3+C4+C5,data=dm2)

 #Get estimate for theta assuming null genetic effect
  theta_baseline <- summary(glm.nb(pheno ~  C1 + C2 + C3 +C4+C5,data=dm2))$theta

 #apply function right now, could also do some splitting based approach

  nbglm <- function(genovector,datamatrix,theta)
  {
  #it's faster to save the variable and concatenate the relevant stuff than to use SUMMARY
   m <- fastglm(cbind(genovector,datamatrix),dm2$pheno, ,family=negative.binomial(theta),method=3)
  
  #Contrast with a gaussian...
   #m <- fastglm(cbind(genovector,datamatrix),dm2$pheno, ,data=dm,family=gaussian())
  
  # print(summary(m))
   c(m$coefficients,m$se)
   #if you just save c(m$coefficients[1],m$se[1]) you ignore the covariates but makes indexing easy. if you do this , set zvalues <- results[,1]/results[,2]
  # summary((fastglm(cbind(genovector,datamatrix),dm2$pheno, ,data=dm,family=negative.binomial(theta),method=3)))$coefficients
   
  }
  
  #everycolumn is a SNP. Results are reported as estimate, SE, tvalue, and pr>t.Currently genovector,
  results <- t(apply(genotypes,1,nbglm,datamatrix=covmat,theta=theta_baseline))
  

  zvalues <- results[,1]/results[,8] #this indexing is precariuous and needs fixing
  pvalues <- 2*pnorm(abs(zvalues),lower.tail=F)
 
  total_results <- cbind(genotypes_rs,results[,c(1,8)],zvalues,pvalues)
  names(total_results) <- c("SNP","A1","A2","Beta","SE","Z","P")
  
  write.table(total_results,file=paste(outfile,'/',phenoname,'.',genofile,'.txt',sep=''),quote=F,row.names=F)
  
  #cat outputs/*.txt | awk '{if (NR==1 || $1 != "SNP") print}' > adam.results.total.txt
  # library(data.table)
  # total_results <- fread('adam.results.total.txt',data.table=F)
  # liz_snps <- fread('zcat liz_mrsc.pcl4_c_12_future.assoc.linear_fixed.gz',data.table=F)
  
  
 # data <- total_results[which(total_results$SNP %in% liz_snps$ID) ,]

 # data <- liz_snps[which( liz_snps$ID %in% total_results$SNP ) ,]

 # outfilename=paste(outfile,'.qq.png',sep='')

 # outfilename="chr22_liz.png"

 # unadj_filtered <- sort(data[,"P"])
 # UNADJ <- -log(unadj_filtered,10)
 # QQ <- -log(ppoints(length(UNADJ)),10)

 # GCfactor= round(median(qchisq(data[,"P"],1,lower.tail=F),na.rm=T)/.455,3)

 # png(paste(outfilename,'.png', sep=''))
 # par(bty='l')

 # plot(c(0,max(QQ)), c(0,max(UNADJ)+ 1), xlab='Expected -log10(p)', ylab='Observed -log10(p)', col='white', cex=1.3, cex.axis=1.2, cex.lab=1.5,pch=20)

 # if(errorbars == 1)
 # {
    # #code for error bars
    # ranks <- c(1:length(QQ))
    # CIlower <- qbeta(.025, ranks, length(QQ)-ranks +1)
    # CIupper <- qbeta(.975, ranks, length(QQ)-ranks +1)
    # plotCIlower <- -log(CIlower,10)
    # plotCIupper <- -log(CIupper,10)
    # segments(x0=QQ,x1=QQ, y0=plotCIlower,y1=plotCIupper,lwd=2,col='grey')
 # }

 # abline(0,1,col='red', lwd=2)
 # points(QQ, UNADJ, ,pch=20,col='blue')

 # legend('topleft', paste('GC Lambda =', GCfactor),  bty='n', cex=1.5, xjust=1)

 # dev.off()

  
  
 # # Error checking
 # which(pvalues==0)
  
  # #odd things going on which.min(pvalues)
   # summary(glm.nb(pheno ~  genotypes[503,] + C1 + C2 + C3 +C4+C5,data=dm2)) #won't even run under this model
   # summary(fastglm(cbind(genotypes[503,],covmat),dm2$pheno, family=negative.binomial(theta_baseline),method=3))
   # summary(glm.nb(pheno ~  genotypes[2,] + C1 + C2 + C3 +C4+C5,data=dm2))
   
   # which(total_results$P < 0.00001)
   
   # fastglm(cbind(genovector,datamatrix),dm2$pheno, ,data=dm,family=negative.binomial(theta),method=3)
 

 # total_results[which(total_results$P < 5e-8 & total_results$P > 8.312274e-83),]
 
  # data[which(data$P < 5e-7),]
  # which(genotypes_rs[,1]== "rs190121825")
 
 # which(genotypes_rs[,1]== "rs16991916")
 # summary(fastglm(cbind(genotypes[84565,],covmat),dm2$pheno, family=negative.binomial(theta_baseline),method=3))
     # summary(glm.nb(pheno ~  genotypes[84565,] + C1 + C2 + C3 +C4+C5,data=dm2))
   # #Convergence isn't sufficient, fastglm tends to converge where glm does not!
 # 84565

 # liz
	for geno in $(ls genotypes \| grep gz )
	do
	echo running $geno
	date
	Rscript 00_plinkR_v2.txt dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr15_093_103.out.dosage.fam \
	1_MRSC_C_V2_2.5_PCL4.txt \
	pts_mrsc_mix_am-qc-eur_pca.menv.mds_cov \
	$geno \
	"genotypes" \
	"outputs" \
	"pcl4_c_12_future"
	done
	args <- commandArgs(trailingOnly = TRUE)
	famfile <- args[1]
	phenofile <- args[2]
	covfile <- args[3]
	genofile <- args[4]
	genofile_path <- args[5]
	outfile <- args[6]
	phenoname <- args[7]

	#Right now: assuming covariates C1-C5 are used, pheno is coded such that minimum score is 0
	#Model will give false positives for monomorphic markers, some filtering must be applied!

	#Load libraries
	library(data.table)
	library(MASS)
	library(fastglm)




	famfile='dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr15_093_103.out.dosage.fam'
	phenofile='1_MRSC_C_V2_2.5_PCL4.txt'
	covfile='pts_mrsc_mix_am-qc-eur_pca.menv.mds_cov'
	genofile='dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr22_016_039.out.dosage.doscnt.gz'
	genofile='dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr22_016_039.out.dosage.doscnt.gz'
	genofile_path='genotypes'
	phenoname='pcl4_c_12_future'
	outfile='dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr22_016_039.out.dosage.doscnt.gz.results.txt'
	outfile='dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr22_039_052.out.dosage.doscnt.gz.results.txt'

	# dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr22_016_039.out.dosage.doscnt.gz
	# dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr22_039_052.out.dosage.doscnt.gz

	#Load famfile
	fam <- read.table(famfile,header=F,na.strings=c("NA","-9")) #PLINK phenotype

	names(fam) <- c("FID","IID","F","M","G","P")
	#Make a variable to index subjects, because this data will be merged with phenotype data and cause data to be resorted
	fam$order <- c(1:nrow(fam))

	#Load phenotypes.
	pheno <- read.table(phenofile,header=T,na.strings=c("NA","-9")) #PLINK phenotype
	#If the phenotype ranges from 0-4, you MUST make sure the mininum value is 0!

	#Load covariates
	covar <- read.table(covfile,header=T,na.strings=c("NA","-9")) #PLINK phenotype
	covar$intercept <- 1 # It is necessary to add an intercept

	#Notice that all.x=T, because the family file length must be preserved!
	d1 <- merge(fam,pheno,by=c("FID","IID"),all.x=T,suffixes=c("_fam",""))
	dm0 <- merge(d1,covar,by=c("FID","IID"),all.x=T,suffixes=c("_cov",""))
	dm <- dm0[order(dm0$order),] #This will order the data according to the geotypes
	dm$pheno <- dm[,phenoname] -1 #Make generic column 'pheno' that takes the data of the phenotype
	#have -1 for now because of my crummy coding

	#Load (stream?) genotypes. Currently a CSV file where columns 1,2,3 are rsID, A1, and A2. Map file is therefore optional. Possibly just stack into this file intiially!!
	genotypes <- fread(paste('zcat ', genofile_path,'/', genofile,sep=''),data.table=F,sep=",")[-1,] #remove row 1: that is just going to be SNP A1 A2... columns

	#Remove genotypes and data with missing phenotype or PCs
	remove <- is.na(dm$pcl4_c_12_future) \| is.na(dm$C1)
	dm2 <- dm[!remove,]

	genotypes_rs <- genotypes[,c(1:3)] #SNP A1 A2 columns
	genotypes <- as.matrix(genotypes[,-c(1:3)][,!remove]) #note, because the indexing for 'remove' is based on the famfile, which has no extra columns, must remove columns 1-3 in the genotype matrix to align indexing
	row.names(genotypes) <- genotypes_rs[,1]


	#Make covariate matrix
	covmat <- model.matrix(~C1+C2+C3+C4+C5,data=dm2)

	#Get estimate for theta assuming null genetic effect
	theta_baseline <- summary(glm.nb(pheno ~ C1 + C2 + C3 +C4+C5,data=dm2))$theta

	#apply function right now, could also do some splitting based approach

	nbglm <- function(genovector,datamatrix,theta)
	{
	#it's faster to save the variable and concatenate the relevant stuff than to use SUMMARY
	m <- fastglm(cbind(genovector,datamatrix),dm2$pheno, ,family=negative.binomial(theta),method=3)

	#Contrast with a gaussian...
	#m <- fastglm(cbind(genovector,datamatrix),dm2$pheno, ,data=dm,family=gaussian())

	# print(summary(m))
	c(m$coefficients,m$se)
	#if you just save c(m$coefficients[1],m$se[1]) you ignore the covariates but makes indexing easy. if you do this , set zvalues <- results[,1]/results[,2]
	# summary((fastglm(cbind(genovector,datamatrix),dm2$pheno, ,data=dm,family=negative.binomial(theta),method=3)))$coefficients

	}

	#everycolumn is a SNP. Results are reported as estimate, SE, tvalue, and pr>t.Currently genovector,
	results <- t(apply(genotypes,1,nbglm,datamatrix=covmat,theta=theta_baseline))


	zvalues <- results[,1]/results[,8] #this indexing is precariuous and needs fixing
	pvalues <- 2*pnorm(abs(zvalues),lower.tail=F)

	total_results <- cbind(genotypes_rs,results[,c(1,8)],zvalues,pvalues)
	names(total_results) <- c("SNP","A1","A2","Beta","SE","Z","P")

	write.table(total_results,file=paste(outfile,'/',phenoname,'.',genofile,'.txt',sep=''),quote=F,row.names=F)

	#cat outputs/*.txt \| awk '{if (NR==1 \|\| $1 != "SNP") print}' > adam.results.total.txt
	# library(data.table)
	# total_results <- fread('adam.results.total.txt',data.table=F)
	# liz_snps <- fread('zcat liz_mrsc.pcl4_c_12_future.assoc.linear_fixed.gz',data.table=F)


	# data <- total_results[which(total_results$SNP %in% liz_snps$ID) ,]

	# data <- liz_snps[which( liz_snps$ID %in% total_results$SNP ) ,]

	# outfilename=paste(outfile,'.qq.png',sep='')

	# outfilename="chr22_liz.png"

	# unadj_filtered <- sort(data[,"P"])
	# UNADJ <- -log(unadj_filtered,10)
	# QQ <- -log(ppoints(length(UNADJ)),10)

	# GCfactor= round(median(qchisq(data[,"P"],1,lower.tail=F),na.rm=T)/.455,3)

	# png(paste(outfilename,'.png', sep=''))
	# par(bty='l')

	# plot(c(0,max(QQ)), c(0,max(UNADJ)+ 1), xlab='Expected -log10(p)', ylab='Observed -log10(p)', col='white', cex=1.3, cex.axis=1.2, cex.lab=1.5,pch=20)

	# if(errorbars == 1)
	# {
	# #code for error bars
	# ranks <- c(1:length(QQ))
	# CIlower <- qbeta(.025, ranks, length(QQ)-ranks +1)
	# CIupper <- qbeta(.975, ranks, length(QQ)-ranks +1)
	# plotCIlower <- -log(CIlower,10)
	# plotCIupper <- -log(CIupper,10)
	# segments(x0=QQ,x1=QQ, y0=plotCIlower,y1=plotCIupper,lwd=2,col='grey')
	# }

	# abline(0,1,col='red', lwd=2)
	# points(QQ, UNADJ, ,pch=20,col='blue')

	# legend('topleft', paste('GC Lambda =', GCfactor), bty='n', cex=1.5, xjust=1)

	# dev.off()



	# # Error checking
	# which(pvalues==0)

	# #odd things going on which.min(pvalues)
	# summary(glm.nb(pheno ~ genotypes[503,] + C1 + C2 + C3 +C4+C5,data=dm2)) #won't even run under this model
	# summary(fastglm(cbind(genotypes[503,],covmat),dm2$pheno, family=negative.binomial(theta_baseline),method=3))
	# summary(glm.nb(pheno ~ genotypes[2,] + C1 + C2 + C3 +C4+C5,data=dm2))

	# which(total_results$P < 0.00001)

	# fastglm(cbind(genovector,datamatrix),dm2$pheno, ,data=dm,family=negative.binomial(theta),method=3)


	# total_results[which(total_results$P < 5e-8 & total_results$P > 8.312274e-83),]

	# data[which(data$P < 5e-7),]
	# which(genotypes_rs[,1]== "rs190121825")

	# which(genotypes_rs[,1]== "rs16991916")
	# summary(fastglm(cbind(genotypes[84565,],covmat),dm2$pheno, family=negative.binomial(theta_baseline),method=3))
	# summary(glm.nb(pheno ~ genotypes[84565,] + C1 + C2 + C3 +C4+C5,data=dm2))
	# #Convergence isn't sufficient, fastglm tends to converge where glm does not!
	# 84565

	# liz