Forked from ha0ye/gist:197de5797df1d4e2b14a8bff4fb99c95
Created
April 2, 2020 02:01
-
-
Save jdblischak/dd86758593cc0b667ed12428c853d03e to your computer and use it in GitHub Desktop.
pairwise overlap calculations
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
calc3 <- function(sets) | |
{ | |
sets <- check_sets(sets) | |
set_lengths <- vapply(sets, length, 0) | |
set_order <- order(set_lengths) | |
sets <- sets[set_order] | |
set_lengths <- set_lengths[set_order] | |
n_sets <- length(sets) | |
set_names <- names(sets) | |
n_overlaps <- choose(n = n_sets, k = 2) | |
symbols <- unique(do.call(c, sets)) | |
occ_mat <- vector("list", n_sets) | |
for (j in seq_len(n_sets)) | |
{ | |
occ_mat[[j]] <- symbols %in% sets[[j]] | |
} | |
vec_num_shared <- integer(length = n_overlaps) | |
vec_overlap <- numeric(length = n_overlaps) | |
vec_jaccard <- numeric(length = n_overlaps) | |
overlaps_index <- 1 | |
for (i in seq_len(n_sets - 1)) | |
{ | |
for (j in seq(i + 1, n_sets)) | |
{ | |
num_union <- sum(occ_mat[[i]] | occ_mat[[j]]) | |
num_shared <- sum(occ_mat[[i]] & occ_mat[[j]]) | |
overlap <- num_shared / set_lengths[i] | |
jaccard <- num_shared / num_union | |
vec_num_shared[overlaps_index] <- num_shared | |
vec_overlap[overlaps_index] <- overlap | |
vec_jaccard[overlaps_index] <- jaccard | |
overlaps_index <- overlaps_index + 1 | |
} | |
} | |
idx_df <- expand.grid(idx2 = seq(n_sets), idx1 = seq(n_sets)) | |
idx_df <- idx_df[idx_df$idx2 > idx_df$idx1, ] | |
result <- data.frame(name1 = set_names[idx_df[,2]], | |
name2 = set_names[idx_df[,1]], | |
num_shared = vec_num_shared, | |
overlap = vec_overlap, | |
jaccard = vec_jaccard, | |
stringsAsFactors = FALSE) | |
return(result) | |
} | |
check_sets <- function(sets) | |
{ | |
# Ensure that all sets are unique character vectors | |
sets_are_vectors <- vapply(sets, is.vector, logical(1)) | |
if (any(!sets_are_vectors)) { | |
stop("Sets must be vectors") | |
} | |
sets_are_atomic <- vapply(sets, is.atomic, logical(1)) | |
if (any(!sets_are_atomic)) { | |
stop("Sets must be atomic vectors, i.e. not lists") | |
} | |
sets <- lapply(sets, as.character) | |
is_unique <- function(x) length(unique(x)) == length(x) | |
sets_are_unique <- vapply(sets, is_unique, logical(1)) | |
if (any(!sets_are_unique)) { | |
stop("Sets must be unique, i.e. no duplicated elements") | |
} | |
invisible(sets) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment