Created
August 3, 2022 14:24
-
-
Save liutiming/8723973ecd0ef2d8fc9245e10fee5324 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## 20220612_samples.qc.rename.autosome_ancestryplot.R for KING Ancestry plot, by Zhennan Zhu and Wei-Min Chen | |
rm(list=ls(all=TRUE)) | |
library(dplyr) | |
library(e1071) | |
options(scipen = 999) | |
prefix <- "20220612_samples.qc.rename.autosome" | |
pc <- read.table(paste0(prefix, "pc.txt"), header = TRUE) | |
phe <- read.table(paste0(prefix, "_popref.txt"), header = TRUE, stringsAsFactors = FALSE) | |
Population <- c(as.character(phe$Population), rep(NA, (nrow(pc) - nrow(phe)))) | |
dat <- cbind(pc, Population) | |
dat$fold <- 0 | |
set.seed(123) | |
dat$fold[!is.na(dat$Population)] <- sample(1:5, sum(!is.na(dat$Population)), replace = T) | |
numpc <- length(grep("PC", colnames(pc))) | |
numpc <- ifelse(numpc >= 10, 10, numpc) | |
svm.mod <- as.formula(paste0("Population~", paste0("PC", 1:numpc, collapse = "+"))) | |
best.set <- c(-2, 0) | |
step.mat <- cbind(c(3,3), c(2,2), c(1,1)) | |
para.one <- function(x) { | |
fold.index <- para[x, "fold"] | |
data_pop = mutate(dat[dat$fold != fold.index, ], Population = factor(Population, levels = c("EUR", "EAS", "AMR", "SAS", "AFR"))) | |
mod.svm <- svm(svm.mod, data = data_pop, kernel = "radial", gamma = 10^para[x, "gamma"], cost = 10^para[x, "cost"], fitted = F) | |
pred.svm <- predict(mod.svm, newdata = dat[dat$fold == fold.index, ]) | |
return(sum(pred.svm == dat[dat$fold == fold.index, "Population"]))} | |
for (i in 1:2) { | |
index <- dat$fold != 0 | |
best.count <- 0 | |
para.list <- NULL | |
for (j in 2:ncol(step.mat)) { | |
val1 <- best.set - step.mat[, j - 1] | |
val2 <- best.set + step.mat[, j - 1] | |
gamma.init <- seq(val1[1], val2[1], by = step.mat[1, j]) | |
cost.init <- seq(val1[2], val2[2], by = step.mat[2, j]) | |
para.set <- cbind(gamma = gamma.init, cost = rep(cost.init, each = length(gamma.init))) | |
para.all <- paste0(para.set[, 1], "_", para.set[, 2]) | |
para.set <- para.set[!para.all %in% para.list, ] | |
para.list <- para.all | |
para = cbind(fold = rep(1:5, nrow(para.set)), gamma = rep(para.set[, "gamma"], 5), cost = rep(para.set[, "cost"], 5), count = NA) | |
para <- as.data.frame(para) | |
if (require("doParallel", quietly = TRUE)) { | |
registerDoParallel(cores = 48) | |
results <- foreach(para.index = 1:nrow(para), .combine = c) %dopar% { | |
para.one(para.index)} | |
} else { | |
results <- sapply(1:nrow(para), para.one)} | |
para[, "count"] <- unlist(results) | |
results <- aggregate(count ~ cost + gamma, para, sum) | |
results <- results[order(results$cost, results$gamma), ] | |
best.para <- results[which.max(results$count), ] | |
if (max(results$count) > best.count) { | |
best.set <- unlist(best.para[, c("gamma", "cost")]) | |
best.count <- max(results$count)} | |
} | |
mod_svm <- svm(formula = svm.mod, data = mutate(dat[index, ], Population = factor(Population, levels = c("EUR", "EAS", "AMR", "SAS", "AFR"))), kernel = "radial", gamma = 10^best.set[1], | |
cost = 10^best.set[2], probability = T, fitted = T) | |
pred_svm <- predict(mod_svm, newdata = dat[, paste0("PC", 1:numpc)], probability = T) | |
suspi_pop <- index & pred_svm != dat$Population | |
if (sum(suspi_pop) == 0) { | |
break | |
} else { | |
dat$fold[suspi_pop] <- 0 | |
set.seed(123) | |
dat$fold[dat$fold != 0] <- sample(1:5, sum(dat$fold != 0), replace = T) | |
step.mat <- step.mat[, -1] | |
} | |
} | |
class.prob <- attr(pred_svm, "probabilities") | |
print(paste("Prepare the summary file, starts at", date())) | |
Summary <- function(v) { | |
v.whichmax <- which.max(v) | |
v2.whichmax <- which.max(v[-v.whichmax]) | |
v.2ndwhichmax <- v2.whichmax + (v2.whichmax>=v.whichmax) | |
c(v.whichmax, v[v.whichmax], v.2ndwhichmax, v[v.2ndwhichmax]) | |
} | |
valid <- dat[,"AFF"]==2 | |
class.prob.valid <- class.prob[valid,] | |
maxed <- apply(class.prob.valid, 1, Summary) | |
popcount <- ncol(class.prob) | |
popnames <- colnames(class.prob) | |
Ancestry.1 <- popnames[maxed[1,]] | |
pred_class <- Ancestry.1 | |
Pr.1 <- maxed[2,] | |
for(i in 1:length(Pr.1)) if(Pr.1[i]<=0.65){ | |
x <- sort(class.prob.valid[i,], decreasing=TRUE) | |
temp <- (1:popcount)[cumsum(x)>0.65][1] | |
pred_class[i] <- paste(names(x)[1:temp], collapse=";")} | |
pred.out <- cbind(dat[valid, c("FID", "IID", "PC1", "PC2")], Ancestry.1, round(Pr.1,4),popnames[maxed[3,]], round(maxed[4,],4), pred_class) | |
colnames(pred.out) <- c("FID", "IID", "PC1", "PC2", "Anc_1st", "Pr_1st", "Anc_2nd", "Pr_2nd", "Ancestry") | |
print(paste("summary file is ready ", date())) | |
write.table(pred.out, paste0(prefix, "_InferredAncestry.txt"), sep = " ", quote = FALSE, row.names = FALSE) | |
print(paste("Results are saved to", paste0(prefix, "_InferredAncestry.txt"), date())) | |
print(paste("Generate plots", date())) | |
pred.out$Ancestry <- as.character(pred.out$Ancestry) | |
pred.out$Ancestry[grep(";", pred.out$Ancestry)] <- "Missing" | |
Palette <- c("#1F78B4", "#33A02C", "#E31A1C", "#FF7F00", "#6A3D9A", "#B15928", "#A6CEE3", | |
"#B2DF8A", "#FB9A99", "#FDBF6F", "#CAB2D6", "#FFFF99", "#999999") | |
train.phe <- dat[dat$AFF == 1, ] | |
train.groups <- unique(train.phe$Population) | |
pred.colors <- rep(Palette[13], nrow(pred.out)) | |
for (i in 1:length(train.groups)) { | |
pred.colors[pred.out$Ancestry == train.groups[i]] <- Palette[i] | |
} | |
train.colors <- rep(0, nrow(train.phe)) | |
for (i in 1:length(train.groups)) { | |
train.colors[train.phe$Population == train.groups[i]] <- Palette[i] | |
} | |
x.adjust <- (max(train.phe$PC1, pred.out$PC1) - min(train.phe$PC1, pred.out$PC1))/10 | |
x.low <- min(train.phe$PC1, pred.out$PC1) - x.adjust | |
x.high <- max(train.phe$PC1, pred.out$PC1) + x.adjust | |
y.adjust <- (max(train.phe$PC2, pred.out$PC2) - min(train.phe$PC2, pred.out$PC2))/10 | |
y.low <- min(train.phe$PC2, pred.out$PC2) - y.adjust | |
y.high <- max(train.phe$PC2, pred.out$PC2) + y.adjust | |
postscript(paste0(prefix, "_ancestryplot.ps"), paper = "letter", horizontal = T) | |
ncols <- min(3, ceiling(length(unique(pred.out$Ancestry))/2)) | |
if ("Missing" %in% unique(pred.out$Ancestry)) { | |
legend.group <- c(sort(unique(pred.out$Ancestry)[unique(pred.out$Ancestry) != "Missing"]), "Missing") | |
all.color <- unique(pred.colors)[order(unique(pred.out$Ancestry))] | |
legend.color <- c(all.color[!all.color %in% c("#999999")], "#999999") | |
} else { | |
legend.group <- sort(unique(pred.out$Ancestry)) | |
legend.color <- unique(pred.colors)[order(unique(pred.out$Ancestry))] | |
} | |
if (!require(ggplot2, quietly = TRUE)) { | |
plot(pred.out$PC1, pred.out$PC2, col = pred.colors, xlab = "PC1", ylab = "PC2", xlim = c(x.low, x.high), | |
ylim = c(y.low, y.high), main = paste("Inferred Populations as Ancestry in", prefix), pch = 16) | |
legend("topright", legend = legend.group, col = legend.color, pch = 16, cex = 1) | |
par(mfrow = c(2, ncols)) | |
for (i in legend.group) { | |
subdata <- subset(pred.out, Ancestry == i) | |
plot(subdata$PC1, subdata$PC2, col = unique(pred.colors)[unique(pred.out$Ancestry) == i], | |
xlim = c(x.low, x.high), ylim = c(y.low, y.high), xlab = "PC1", ylab = "PC2", | |
main = paste0(i, " (N=", nrow(subdata), ")")) | |
} | |
par(mfrow = c(1, 1)) | |
plot(train.phe$PC1, train.phe$PC2, col = train.colors, xlim = c(x.low, x.high), | |
ylim = c(y.low, y.high), xlab = "PC1", ylab = "PC2", main = "Populations in Reference", pch = 16) | |
legend("topright", legend = sort(unique(train.phe$Population)), col = unique(train.colors)[order(unique(train.phe$Population))], | |
pch = 16, cex = 1) | |
} else { | |
p <- ggplot(pred.out, aes(x = PC1, y = PC2)) | |
p <- p + geom_point(aes(colour = factor(Ancestry, levels = legend.group))) + | |
xlim(x.low, x.high) + ylim(y.low, y.high) + labs(color = "") + scale_colour_manual(values = legend.color) + | |
ggtitle(paste("Inferred Populations as Ancestry in", prefix)) | |
print(p) | |
labels <- sapply(legend.group, function(x) paste0(x, " (N=", sum(pred.out$Ancestry == x), ")")) | |
p <- ggplot(pred.out, aes(x = PC1, y = PC2, colour = factor(Ancestry, levels = legend.group))) + | |
scale_color_manual(values = legend.color) + theme(legend.position = "none") | |
p <- p + geom_point() + xlim(x.low, x.high) + ylim(y.low, y.high) + | |
facet_wrap(~factor(Ancestry, levels = legend.group, labels = labels), ncol = min(3, ncols)) | |
print(p) | |
p <- ggplot(train.phe, aes(x = PC1, y = PC2)) | |
p <- p + geom_point(aes(colour = factor(Population, levels = sort(unique(Population))))) + | |
xlim(x.low, x.high) + ylim(y.low, y.high) | |
p <- p + labs(color = "") + scale_colour_manual(values = unique(train.colors)[order(unique(train.phe$Population))]) + | |
ggtitle("Populations in Reference") | |
print(p) | |
} | |
dev.off() | |
rm(list=ls(all=TRUE)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment