Last active
March 16, 2025 18:52
-
-
Save hadley/c430501804349d382ce90754936ab8ec to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# What's the most natural way to express this code in base R?
library(dplyr, warn.conflicts = FALSE)

# Reference result: mean displacement and number of rows for each cylinder
# count. Written as nested calls; group_by() runs first, then summarise()
# collapses each group to a single row.
summarise(
  group_by(mtcars, cyl),
  mean = mean(disp),
  n = n()
)
#> # A tibble: 3 x 3
#>     cyl  mean     n
#>   <dbl> <dbl> <int>
#> 1     4  105.    11
#> 2     6  183.     7
#> 3     8  353.    14
# tapply() ----------------------------------------------------------------
# One tapply() call per summary column. with() evaluates the whole
# data.frame() call inside mtcars so columns can be named directly.
with(mtcars, data.frame(
  cyl = sort(unique(cyl)),
  mean = tapply(disp, cyl, mean),
  n = tapply(disp, cyl, length)
))
#>   cyl     mean  n
#> 4   4 105.1364 11
#> 6   6 183.3143  7
#> 8   8 353.1000 14
# - hard to generalise to more than one group because tapply() will
#   return an array
# - is `sort(unique(mtcars$cyl))` guaranteed to be in the same order as
#   the tapply() output?
#   (NOTE: it appears so -- tapply() groups by as.factor(INDEX), and the
#   default factor levels are the sorted unique values; see ?factor)
# aggregate() -------------------------------------------------------------
# Data-frame interface: one aggregate() call per summary statistic. Each
# result has the grouping column first and the summarised column second, so
# both columns are named in one step with setNames().
df_mean <- setNames(
  aggregate(mtcars["disp"], by = mtcars["cyl"], FUN = mean),
  c("cyl", "mean")
)
df_length <- setNames(
  aggregate(mtcars["disp"], by = mtcars["cyl"], FUN = length),
  c("cyl", "n")
)

# Join the two per-group summaries on the grouping column.
merge(df_mean, df_length, by = "cyl")
#>   cyl     mean  n
#> 1   4 105.1364 11
#> 2   6 183.3143  7
#> 3   8 353.1000 14
# + generalises in a straightforward way to multiple grouping variables and
#   multiple summary variables
# - need to manually rename summary variables
# Could also use formula interface
# https://twitter.com/tjmahr/status/1231255000766005248
# The formula method defaults to na.action = na.omit, which silently drops
# any row with a missing value before FUN is called. na.pass hands missing
# disp values to FUN instead, so the result agrees with
# aggregate(mtcars["disp"], mtcars["cyl"], ...) and with the dplyr version
# when the data contain NAs. The summarised column keeps the name "disp".
df_mean <- aggregate(disp ~ cyl, mtcars, mean, na.action = na.pass)
df_length <- aggregate(disp ~ cyl, mtcars, length, na.action = na.pass)
# by() --------------------------------------------------------------------
# by() hands each per-cylinder subset of mtcars to the summary function,
# which returns a one-row data frame for that group.
mtcars_by <- by(
  mtcars,
  INDICES = mtcars$cyl,
  FUN = function(group) {
    with(group, data.frame(
      cyl = cyl[1],
      mean = mean(disp),
      n = length(disp)
    ))
  }
)

# Stack the one-row data frames into a single result.
do.call(rbind, mtcars_by)
#>   cyl     mean  n
#> 4   4 105.1364 11
#> 6   6 183.3143  7
#> 8   8 353.1000 14
# + generalises easily to more/different summaries
# - need to know about anonymous functions + do.call + rbind
# by() = split() + lapply()
# split() cuts mtcars into one data frame per cylinder count; lapply() then
# reduces each piece to a one-row summary.
mtcars_by <- lapply(split(mtcars, mtcars$cyl), function(df) {
  data.frame(
    cyl = df[["cyl"]][1],
    mean = mean(df[["disp"]]),
    n = nrow(df)
  )
})

# Stack the per-group rows; row names come from the names of the list.
do.call(rbind, mtcars_by)
#>   cyl     mean  n
#> 4   4 105.1364 11
#> 6   6 183.3143  7
#> 8   8 353.1000 14
# Manual indexing approaches -------------------------------------------------
# from https://twitter.com/fartmiasma/status/1231258479865647105
# For each distinct cylinder count, build a named numeric vector
# (cyl, mean, n) by logical indexing into mtcars.
cyl_counts <- sort(unique(mtcars$cyl))

# vapply() rather than sapply(): the template pins every result to three
# numbers, so the output is always a 3-row numeric matrix (one column per
# group) instead of a shape that depends on the input.
tabl <- vapply(
  cyl_counts,
  function(ct) {
    with(mtcars, c(cyl = ct, mean = mean(disp[cyl == ct]), n = sum(cyl == ct)))
  },
  FUN.VALUE = c(cyl = 0, mean = 0, n = 0)
)

# Groups are columns of tabl; transpose so each group becomes a row.
as.data.frame(t(tabl))
#>   cyl     mean  n
#> 1   4 105.1364 11
#> 2   6 183.3143  7
#> 3   8 353.1000 14
# - coerces all results (and grouping var) to common type
# Similar approach from
# https://gist.github.com/hadley/c430501804349d382ce90754936ab8ec#gistcomment-3185680
# Build a one-row data frame per cylinder count so each column keeps its
# own type, then stack the rows.
s <- lapply(cyl_counts, function(value) {
  in_group <- mtcars[["cyl"]] == value
  group_disp <- mtcars[["disp"]][in_group]
  data.frame(cyl = value, mean = mean(group_disp), n = sum(in_group))
})
do.call(rbind, s)
#>   cyl     mean  n
#> 1   4 105.1364 11
#> 2   6 183.3143  7
#> 3   8 353.1000 14
# - harder to generalise to multiple grouping vars (need to use Map())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Note a downside of the formula-based aggregate() syntax — by default (`na.action = na.omit`) it drops rows with missing values, like a modelling function:
Created on 2022-10-24 with reprex v2.0.2