cdiener · September 1, 2017 21:40
diff --git a/orlitsky.R b/orlitsky.R
 library(data.table)
 library(ggplot2)
 library(magrittr)
 library(pbapply)

 large <- fread("ERR260132_genes.csv")

 #' Sample a rarefied version of a count vector.
 #'
 #' @param x A named vector of counts.
 #' @param n Total number of counts in the new vector.
 #' @return A named vector with counts for each observed event in the sample.
 #' @examples
 #'  counts <- 1:100
 #'  subsample(counts, 10)
 #' @export
 subsample <- function(x, n) {
    sa <- tabulate(sample(1:length(x), n, replace=TRUE, prob=x))
    names(sa) <- names(x)[1:length(sa)]
    return(sa[sa > 0])
 }

 #' Orlitsky's diminishing attenuation estimator (q1/3).
 #'
 #' @param data A named vector of counts for which to approximate discrete
 #'  probablities.
 #' @return A named vector `p` assigning a probability to each event in data.
 #'  Those do not sum up to one since there is also a remaining probability to
 #'  observe a new event given as p(new) = 1 - sum(p).
 #' @examples
 #'  x <- sample(1:10, 100, replace=T)
 #'  p <- orlitsky(x)
 #' @export
 orlitsky <- function(data) {
    n <- length(data)
    phi <- c(tabulate(data), 0)
    cn <- ceiling((n+1)^1/3)
    new <- max(cn, phi[1] + 1)
    probs <- (data + 1) * pmax(cn, phi[data + 1] + 1) / pmax(cn, phi[data])
    names(probs) <- names(data)
    return(probs/sum(c(probs, new)))
 }

 real <- large[, expected_count]
 names(real) <- large[, name]
 #real <- rep(1, 1000)
 #names(real) <- as.character(1:1000)
 real <- real/sum(real)


 error <- function(x, y) {
    return(median(x / y[names(x)]))
 }

 #' Benchmark different probability estimators.
 #'
 #' @param real The real discrete distribution from which to generate samples.
 #' @param n A series of sample sizes.
 #' @return Nothing.
 #' @examples
 #'  p <- (1:10)/10
 #'  benchmark(p)
 #' @export
 benchmark <- function(real, n = 10^seq(1, 8, length.out=100)) {
    samples <- pblapply(10^seq(1, 8, length.out=100), function(k) {
        x <- subsample(real, k)
        po <- orlitsky(x)              # diminishing attenuation
        pp <- x/sum(x)                 # proportions
        pc <- (x + 1)/(sum(x + 1) + 1) # pseudo counts
        data.table(n=k,
                orlitsky=error(po, real),
                orlitsky_new=1 - sum(po),
                proportion=error(pp, real),
                pseudo=error(pc, real))
    })
    return(rbindlist(samples))
 }

 bench <- benchmark(real)
 bench_long <- melt(bench, id.vars="n")

 att_plot <- ggplot(rare_long[!grepl("new", variable)],
                   aes(x=n, y=value, col=variable)) +
            geom_point() + geom_smooth() +
            scale_x_log10() + scale_y_log10()
	library(data.table)
	library(ggplot2)
	library(magrittr)
	library(pbapply)

	large <- fread("ERR260132_genes.csv")

	#' Sample a rarefied version of a count vector.
	#'
	#' @param x A named vector of counts.
	#' @param n Total number of counts in the new vector.
	#' @return A named vector with counts for each observed event in the sample.
	#' @examples
	#' counts <- 1:100
	#' subsample(counts, 10)
	#' @export
	subsample <- function(x, n) {
	sa <- tabulate(sample(1:length(x), n, replace=TRUE, prob=x))
	names(sa) <- names(x)[1:length(sa)]
	return(sa[sa > 0])
	}

	#' Orlitsky's diminishing attenuation estimator (q1/3).
	#'
	#' @param data A named vector of counts for which to approximate discrete
	#' probablities.
	#' @return A named vector `p` assigning a probability to each event in data.
	#' Those do not sum up to one since there is also a remaining probability to
	#' observe a new event given as p(new) = 1 - sum(p).
	#' @examples
	#' x <- sample(1:10, 100, replace=T)
	#' p <- orlitsky(x)
	#' @export
	orlitsky <- function(data) {
	n <- length(data)
	phi <- c(tabulate(data), 0)
	cn <- ceiling((n+1)^1/3)
	new <- max(cn, phi[1] + 1)
	probs <- (data + 1) * pmax(cn, phi[data + 1] + 1) / pmax(cn, phi[data])
	names(probs) <- names(data)
	return(probs/sum(c(probs, new)))
	}

	real <- large[, expected_count]
	names(real) <- large[, name]
	#real <- rep(1, 1000)
	#names(real) <- as.character(1:1000)
	real <- real/sum(real)


	error <- function(x, y) {
	return(median(x / y[names(x)]))
	}

	#' Benchmark different probability estimators.
	#'
	#' @param real The real discrete distribution from which to generate samples.
	#' @param n A series of sample sizes.
	#' @return Nothing.
	#' @examples
	#' p <- (1:10)/10
	#' benchmark(p)
	#' @export
	benchmark <- function(real, n = 10^seq(1, 8, length.out=100)) {
	samples <- pblapply(10^seq(1, 8, length.out=100), function(k) {
	x <- subsample(real, k)
	po <- orlitsky(x) # diminishing attenuation
	pp <- x/sum(x) # proportions
	pc <- (x + 1)/(sum(x + 1) + 1) # pseudo counts
	data.table(n=k,
	orlitsky=error(po, real),
	orlitsky_new=1 - sum(po),
	proportion=error(pp, real),
	pseudo=error(pc, real))
	})
	return(rbindlist(samples))
	}

	bench <- benchmark(real)
	bench_long <- melt(bench, id.vars="n")

	att_plot <- ggplot(rare_long[!grepl("new", variable)],
	aes(x=n, y=value, col=variable)) +
	geom_point() + geom_smooth() +
	scale_x_log10() + scale_y_log10()