hadley · March 16, 2025 18:52 · hadley · Oct 24, 2022
diff --git a/dplyr-summarise.R b/dplyr-summarise.R
 # What's the most natural way to express this code in base R?
 library(dplyr, warn.conflicts = FALSE)
 # Reference result: mean of disp and number of rows for each cyl group.
 summarise(
   group_by(mtcars, cyl),
   mean = mean(disp),
   n = n()
 )
 #> # A tibble: 3 x 3
 #>     cyl  mean     n
 #>   <dbl> <dbl> <int>
 #> 1     4  105.    11
 #> 2     6  183.     7
 #> 3     8  353.    14

 # tapply() ----------------------------------------------------------------
 # Evaluate inside mtcars so columns can be referenced by bare name. The two
 # summary columns come from separate tapply() calls; the cyl column is built
 # independently from the sorted distinct values.
 with(mtcars, data.frame(
   cyl = sort(unique(cyl)),
   mean = tapply(disp, cyl, mean),
   n = tapply(disp, cyl, length)
 ))
 #>   cyl     mean  n
 #> 4   4 105.1364 11
 #> 6   6 183.3143  7
 #> 8   8 353.1000 14
 # - hard to generalise to more than one group because tapply() will
 #   return an array
 # - is `sort(unique(mtcars$cyl))` guaranteed to be in the same order as
 #   the tapply() output?

 # aggregate() -------------------------------------------------------------
 # Aggregate disp by cyl twice (mean, then group size), label each summary
 # column, and join the two results on the grouping column.
 df_mean <- setNames(
   aggregate(mtcars["disp"], by = mtcars["cyl"], FUN = mean),
   c("cyl", "mean")
 )
 df_length <- setNames(
   aggregate(mtcars["disp"], by = mtcars["cyl"], FUN = length),
   c("cyl", "n")
 )
 merge(x = df_mean, y = df_length, by = "cyl")
 #>   cyl     mean  n
 #> 1   4 105.1364 11
 #> 2   6 183.3143  7
 #> 3   8 353.1000 14

 # + generalises in a straightforward way to multiple grouping variables and
 #   multiple summary variables
 # - need to manually rename summary variables

 # Could also use formula interface
 # https://twitter.com/tjmahr/status/1231255000766005248
 # Formula method: summarise disp within each value of cyl. The summary
 # column keeps the name "disp" in both results.
 df_mean <- aggregate(disp ~ cyl, data = mtcars, FUN = mean)
 df_length <- aggregate(disp ~ cyl, data = mtcars, FUN = length)

 # by() --------------------------------------------------------------------
 # Apply a per-group summary to each cyl subset of mtcars, then stack the
 # resulting one-row data frames into a single data frame.
 mtcars_by <- by(mtcars, INDICES = mtcars$cyl, FUN = function(grp) {
   with(grp, data.frame(cyl = cyl[[1]], mean = mean(disp), n = nrow(grp)))
 })
 do.call("rbind", mtcars_by)
 #>   cyl     mean  n
 #> 4   4 105.1364 11
 #> 6   6 183.3143  7
 #> 8   8 353.1000 14

 # + generalises easily to more/different summaries
 # - need to know about anonymous functions + do.call + rbind

 # by() = split() + lapply()
 # Same idea spelled out: split mtcars into one data frame per cyl value,
 # summarise each piece, and row-bind the summaries.
 mtcars_by <- lapply(
   split(mtcars, f = mtcars$cyl),
   function(piece) {
     data.frame(
       cyl = piece$cyl[[1]],
       mean = mean(piece$disp),
       n = nrow(piece)
     )
   }
 )
 do.call("rbind", mtcars_by)
 #>   cyl     mean  n
 #> 4   4 105.1364 11
 #> 6   6 183.3143  7
 #> 8   8 353.1000 14

 # Manual indexing approaches ------------------------------------------------
 # from https://twitter.com/fartmiasma/status/1231258479865647105
 # Summarise each distinct cyl value by masking the mtcars columns directly.
 # NOTE: despite its name, cyl_counts holds the distinct cyl values (sorted),
 # not counts.
 cyl_counts <- sort(unique(mtcars$cyl))
 # vapply() (rather than sapply()) pins the shape of the result: every call
 # must return three doubles, so `tabl` is always a numeric matrix with rows
 # cyl/mean/n, even if there are no groups to iterate over.
 tabl <- vapply(cyl_counts, function(ct) {
   with(mtcars, c(cyl = ct, mean = mean(disp[cyl == ct]), n = sum(cyl == ct)))
 }, FUN.VALUE = c(cyl = 0, mean = 0, n = 0))
 # `tabl` has one column per group; transpose to get one row per group.
 as.data.frame(t(tabl))
 #>   cyl     mean  n
 #> 1   4 105.1364 11
 #> 2   6 183.3143  7
 #> 3   8 353.1000 14

 # - coerces all results (and grouping var) to common type

 # Similar approach from
 # https://gist.github.com/hadley/c430501804349d382ce90754936ab8ec#gistcomment-3185680
 # For each distinct cyl value, flag the matching rows of mtcars and build a
 # one-row data frame from them; then stack the rows.
 s <- lapply(cyl_counts, function(value) {
   in_group <- mtcars$cyl == value
   data.frame(
     cyl = value,
     mean = mean(mtcars[in_group, "disp"]),
     n = sum(in_group)
   )
 })
 do.call("rbind", s)
 #>   cyl     mean  n
 #> 1   4 105.1364 11
 #> 2   6 183.3143  7
 #> 3   8 353.1000 14

 # - harder to generalise to multiple grouping vars (need to use Map())
	# What's the most natural way to express this code in base R?
	library(dplyr, warn.conflicts = FALSE)
	# Reference result: mean of disp and number of rows for each cyl group.
	summarise(
	  group_by(mtcars, cyl),
	  mean = mean(disp),
	  n = n()
	)
	#> # A tibble: 3 x 3
	#>     cyl  mean     n
	#>   <dbl> <dbl> <int>
	#> 1     4  105.    11
	#> 2     6  183.     7
	#> 3     8  353.    14

	# tapply() ----------------------------------------------------------------
	# Evaluate inside mtcars so columns can be referenced by bare name. The two
	# summary columns come from separate tapply() calls; the cyl column is built
	# independently from the sorted distinct values.
	with(mtcars, data.frame(
	  cyl = sort(unique(cyl)),
	  mean = tapply(disp, cyl, mean),
	  n = tapply(disp, cyl, length)
	))
	#>   cyl     mean  n
	#> 4   4 105.1364 11
	#> 6   6 183.3143  7
	#> 8   8 353.1000 14
	# - hard to generalise to more than one group because tapply() will
	#   return an array
	# - is `sort(unique(mtcars$cyl))` guaranteed to be in the same order as
	#   the tapply() output?

	# aggregate() -------------------------------------------------------------
	# Aggregate disp by cyl twice (mean, then group size), label each summary
	# column, and join the two results on the grouping column.
	df_mean <- setNames(
	  aggregate(mtcars["disp"], by = mtcars["cyl"], FUN = mean),
	  c("cyl", "mean")
	)
	df_length <- setNames(
	  aggregate(mtcars["disp"], by = mtcars["cyl"], FUN = length),
	  c("cyl", "n")
	)
	merge(x = df_mean, y = df_length, by = "cyl")
	#>   cyl     mean  n
	#> 1   4 105.1364 11
	#> 2   6 183.3143  7
	#> 3   8 353.1000 14

	# + generalises in a straightforward way to multiple grouping variables and
	#   multiple summary variables
	# - need to manually rename summary variables

	# Could also use formula interface
	# https://twitter.com/tjmahr/status/1231255000766005248
	# Formula method: summarise disp within each value of cyl. The summary
	# column keeps the name "disp" in both results.
	df_mean <- aggregate(disp ~ cyl, data = mtcars, FUN = mean)
	df_length <- aggregate(disp ~ cyl, data = mtcars, FUN = length)

	# by() --------------------------------------------------------------------
	# Apply a per-group summary to each cyl subset of mtcars, then stack the
	# resulting one-row data frames into a single data frame.
	mtcars_by <- by(mtcars, INDICES = mtcars$cyl, FUN = function(grp) {
	  with(grp, data.frame(cyl = cyl[[1]], mean = mean(disp), n = nrow(grp)))
	})
	do.call("rbind", mtcars_by)
	#>   cyl     mean  n
	#> 4   4 105.1364 11
	#> 6   6 183.3143  7
	#> 8   8 353.1000 14

	# + generalises easily to more/different summaries
	# - need to know about anonymous functions + do.call + rbind

	# by() = split() + lapply()
	# Same idea spelled out: split mtcars into one data frame per cyl value,
	# summarise each piece, and row-bind the summaries.
	mtcars_by <- lapply(
	  split(mtcars, f = mtcars$cyl),
	  function(piece) {
	    data.frame(
	      cyl = piece$cyl[[1]],
	      mean = mean(piece$disp),
	      n = nrow(piece)
	    )
	  }
	)
	do.call("rbind", mtcars_by)
	#>   cyl     mean  n
	#> 4   4 105.1364 11
	#> 6   6 183.3143  7
	#> 8   8 353.1000 14

	# Manual indexing approaches ------------------------------------------------
	# from https://twitter.com/fartmiasma/status/1231258479865647105
	# Summarise each distinct cyl value by masking the mtcars columns directly.
	# NOTE: despite its name, cyl_counts holds the distinct cyl values (sorted),
	# not counts.
	cyl_counts <- sort(unique(mtcars$cyl))
	# vapply() (rather than sapply()) pins the shape of the result: every call
	# must return three doubles, so `tabl` is always a numeric matrix with rows
	# cyl/mean/n, even if there are no groups to iterate over.
	tabl <- vapply(cyl_counts, function(ct) {
	  with(mtcars, c(cyl = ct, mean = mean(disp[cyl == ct]), n = sum(cyl == ct)))
	}, FUN.VALUE = c(cyl = 0, mean = 0, n = 0))
	# `tabl` has one column per group; transpose to get one row per group.
	as.data.frame(t(tabl))
	#>   cyl     mean  n
	#> 1   4 105.1364 11
	#> 2   6 183.3143  7
	#> 3   8 353.1000 14

	# - coerces all results (and grouping var) to common type

	# Similar approach from
	# https://gist.github.com/hadley/c430501804349d382ce90754936ab8ec#gistcomment-3185680
	# For each distinct cyl value, flag the matching rows of mtcars and build a
	# one-row data frame from them; then stack the rows.
	s <- lapply(cyl_counts, function(value) {
	  in_group <- mtcars$cyl == value
	  data.frame(
	    cyl = value,
	    mean = mean(mtcars[in_group, "disp"]),
	    n = sum(in_group)
	  )
	})
	do.call("rbind", s)
	#>   cyl     mean  n
	#> 1   4 105.1364 11
	#> 2   6 183.3143  7
	#> 3   8 353.1000 14

	# - harder to generalise to multiple grouping vars (need to use Map())