pavelk2 · June 23, 2015 12:18 · pavelk2 · Jun 23, 2015
diff --git a/ros_ml.R b/ros_ml.R
 #
 # Negative log likelihood for parameters `theta`, based on a subset `y` from
 # a sample of `n` iid values, associated with ranks in `m`.
 #
 Lambda <- function(theta, y, m, n) {
  mu <- theta[1]; sigma <- theta[2]
  powers <- diff(c(0, m, n+1)) - 1
  gaps <- diff(c(0, pnorm(y, mu, sigma), 1))
  z <- dnorm(y, mu, sigma)
  #a <- lgamma(n+1) - sum(lgamma(powers+1)) # Log multinomial coefficient
  b <- sum(log(z)) + sum(powers*log(gaps))
  return (-b)
 }
 #
 # Return the Weibull probability plotting points for `n` values.
 #
 pp <- function(n) {
  mid <- c()
  if (n >= 3) mid <- (2:(n-1) - 0.3175)/(n + 0.365)
  u <- (1/2)^(1/n)
  c(1 - u, mid, u)
 }
 #
 # ROS fit.
 #
 ROS <- function(y, m, n) {
  sigma <- qnorm(pp(n)[m])
  return (coef(lm(y ~ sigma)))
 }
 #
 # MLE fit (estimates only).
 #
 MLE <- function(y, m, n) {
  theta <- ROS(y, m, n)      # Use ROS for an initial estimate
  fit <- optim(theta, Lambda, y=y, m=m, n=n)
  z <- c(fit$par, theta)
  names(z) <- c("mu.MLE", "sigma.MLE", "mu.ROS", "sigma.ROS")
  return (z)                 # Return MLE *and* ROS estimates
 }
 #
 # Generate a dataset from a known Normal distribution.
 #
 n <- 100
 m <- sort(c(98, 59, 27)) # The ranks to select
 set.seed(17)             # Allow reproducible results
 x <- sort(rnorm(n))      # Sort the data once and for all
 y <- x[m]                # All fitting is based only on `y`, `m`, and `n`
 #
 # Fit using both methods.
 #
 fit <- MLE(y, m, n)
 mu.hat <- fit[1]; sigma.hat <- fit[2]
 #
 # Plot the MLE fit.
 #
 pp.x <- qnorm(pp(n), mu.hat, sigma.hat)
 pp.y <- pp.x[m]
 plot(pp.x, x, col="Gray", xlab="Fit", ylab="Values",
      main="Simulated Data, n=100")
 abline(c(0,1), col="Black", lty=3)
 points(pp.y, y, pch=16, col="#d03020", cex=1.25)
 #
 # Peform a Monte-Carlo assessment of the bias and variance of both.
 #
 system.time({            # Takes about 4 seconds per 1000 iterations
  sim <- replicate(1e3, {
    x <- sort(rnorm(n))
    y <- x[m]
    MLE(y, m, n)
  })
 })

 rowMeans(sim)            # Average estimates of the parameters
 apply(sim, 1, sd)        # Standard deviations of those estimates
	#
	# Negative log likelihood for parameters `theta`, based on a subset `y` from
	# a sample of `n` iid values, associated with ranks in `m`.
	#
	Lambda <- function(theta, y, m, n) {
	mu <- theta[1]; sigma <- theta[2]
	powers <- diff(c(0, m, n+1)) - 1
	gaps <- diff(c(0, pnorm(y, mu, sigma), 1))
	z <- dnorm(y, mu, sigma)
	#a <- lgamma(n+1) - sum(lgamma(powers+1)) # Log multinomial coefficient
	b <- sum(log(z)) + sum(powers*log(gaps))
	return (-b)
	}
	#
	# Return the Weibull probability plotting points for `n` values.
	#
	pp <- function(n) {
	mid <- c()
	if (n >= 3) mid <- (2:(n-1) - 0.3175)/(n + 0.365)
	u <- (1/2)^(1/n)
	c(1 - u, mid, u)
	}
	#
	# ROS fit.
	#
	ROS <- function(y, m, n) {
	sigma <- qnorm(pp(n)[m])
	return (coef(lm(y ~ sigma)))
	}
	#
	# MLE fit (estimates only).
	#
	MLE <- function(y, m, n) {
	theta <- ROS(y, m, n) # Use ROS for an initial estimate
	fit <- optim(theta, Lambda, y=y, m=m, n=n)
	z <- c(fit$par, theta)
	names(z) <- c("mu.MLE", "sigma.MLE", "mu.ROS", "sigma.ROS")
	return (z) # Return MLE and ROS estimates
	}
	#
	# Generate a dataset from a known Normal distribution.
	#
	n <- 100
	m <- sort(c(98, 59, 27)) # The ranks to select
	set.seed(17) # Allow reproducible results
	x <- sort(rnorm(n)) # Sort the data once and for all
	y <- x[m] # All fitting is based only on `y`, `m`, and `n`
	#
	# Fit using both methods.
	#
	fit <- MLE(y, m, n)
	mu.hat <- fit[1]; sigma.hat <- fit[2]
	#
	# Plot the MLE fit.
	#
	pp.x <- qnorm(pp(n), mu.hat, sigma.hat)
	pp.y <- pp.x[m]
	plot(pp.x, x, col="Gray", xlab="Fit", ylab="Values",
	main="Simulated Data, n=100")
	abline(c(0,1), col="Black", lty=3)
	points(pp.y, y, pch=16, col="#d03020", cex=1.25)
	#
	# Peform a Monte-Carlo assessment of the bias and variance of both.
	#
	system.time({ # Takes about 4 seconds per 1000 iterations
	sim <- replicate(1e3, {
	x <- sort(rnorm(n))
	y <- x[m]
	MLE(y, m, n)
	})
	})

	rowMeans(sim) # Average estimates of the parameters
	apply(sim, 1, sd) # Standard deviations of those estimates