Created
September 1, 2017 21:40
-
-
Save cdiener/069a24964d0b426de067020e20bca0ec to your computer and use it in GitHub Desktop.
Source code for my blog post
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(data.table) | |
library(ggplot2) | |
library(magrittr) | |
library(pbapply) | |
large <- fread("ERR260132_genes.csv") | |
#' Sample a rarefied version of a count vector. | |
#' | |
#' @param x A named vector of counts. | |
#' @param n Total number of counts in the new vector. | |
#' @return A named vector with counts for each observed event in the sample. | |
#' @examples | |
#' counts <- 1:100 | |
#' subsample(counts, 10) | |
#' @export | |
subsample <- function(x, n) { | |
sa <- tabulate(sample(1:length(x), n, replace=TRUE, prob=x)) | |
names(sa) <- names(x)[1:length(sa)] | |
return(sa[sa > 0]) | |
} | |
#' Orlitsky's diminishing attenuation estimator (q1/3). | |
#' | |
#' @param data A named vector of counts for which to approximate discrete | |
#' probablities. | |
#' @return A named vector `p` assigning a probability to each event in data. | |
#' Those do not sum up to one since there is also a remaining probability to | |
#' observe a new event given as p(new) = 1 - sum(p). | |
#' @examples | |
#' x <- sample(1:10, 100, replace=T) | |
#' p <- orlitsky(x) | |
#' @export | |
orlitsky <- function(data) { | |
n <- length(data) | |
phi <- c(tabulate(data), 0) | |
cn <- ceiling((n+1)^1/3) | |
new <- max(cn, phi[1] + 1) | |
probs <- (data + 1) * pmax(cn, phi[data + 1] + 1) / pmax(cn, phi[data]) | |
names(probs) <- names(data) | |
return(probs/sum(c(probs, new))) | |
} | |
real <- large[, expected_count] | |
names(real) <- large[, name] | |
#real <- rep(1, 1000) | |
#names(real) <- as.character(1:1000) | |
real <- real/sum(real) | |
error <- function(x, y) { | |
return(median(x / y[names(x)])) | |
} | |
#' Benchmark different probability estimators. | |
#' | |
#' @param real The real discrete distribution from which to generate samples. | |
#' @param n A series of sample sizes. | |
#' @return Nothing. | |
#' @examples | |
#' p <- (1:10)/10 | |
#' benchmark(p) | |
#' @export | |
benchmark <- function(real, n = 10^seq(1, 8, length.out=100)) { | |
samples <- pblapply(10^seq(1, 8, length.out=100), function(k) { | |
x <- subsample(real, k) | |
po <- orlitsky(x) # diminishing attenuation | |
pp <- x/sum(x) # proportions | |
pc <- (x + 1)/(sum(x + 1) + 1) # pseudo counts | |
data.table(n=k, | |
orlitsky=error(po, real), | |
orlitsky_new=1 - sum(po), | |
proportion=error(pp, real), | |
pseudo=error(pc, real)) | |
}) | |
return(rbindlist(samples)) | |
} | |
bench <- benchmark(real) | |
bench_long <- melt(bench, id.vars="n") | |
att_plot <- ggplot(rare_long[!grepl("new", variable)], | |
aes(x=n, y=value, col=variable)) + | |
geom_point() + geom_smooth() + | |
scale_x_log10() + scale_y_log10() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment