Last active
September 22, 2019 23:53
-
-
Save joaopcnogueira/124601a937cb2edd0dd84f5538e04f2b to your computer and use it in GitHub Desktop.
Target mean encoder implementation in R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr)

# Toy dataset: one categorical feature (`vehicle`) and a numeric `target`.
data <- tibble(
  vehicle = c("car", "bus", "bike", "bus", "car", "bike"),
  target = c(23, 34, 56, 78, 33, 65)
)

# Print the data frame.
data
# OUTPUT
# A tibble: 6 x 2
#   vehicle target
#   <chr>    <dbl>
# 1 car         23
# 2 bus         34
# 3 bike        56
# 4 bus         78
# 5 car         33
# 6 bike        65
# Build the lookup vector for the variable to be encoded.
# Categories are sorted by their mean target (ascending) and each one is mapped
# to its position in that order, so the encoding is the rank of the category
# mean (1 = lowest mean), not the mean value itself.
# seq_along() is used instead of 1:length(): for an empty input 1:0 would
# expand to c(1, 0) and produce a bogus two-element lookup.
target_mean_enc_lookup <- data %>%
  group_by(vehicle) %>%
  summarise(mean_target = mean(target)) %>%
  ungroup() %>%
  arrange(mean_target) %>%
  pull(vehicle) %>%
  setNames(seq_along(.), .)

# Print the lookup vector.
target_mean_enc_lookup
# OUTPUT
#  car  bus bike
#    1    2    3
# Encode the vehicle variable.
# The result is assigned back to `data`; without the assignment the pipeline
# only prints the encoded tibble and `data` itself is left unchanged.
# `vehicle` is referenced through mutate()'s data mask (not `data$vehicle`) so
# the lookup always uses the rows flowing through the pipe.
# as.character() makes the lookup go by category name even if the column is a
# factor (a factor index would otherwise select by integer code).
# Categories that are not present in the lookup are encoded as NA.
data <- data %>%
  mutate(
    vehicle_target_mean_enc = unname(
      target_mean_enc_lookup[as.character(vehicle)]
    )
  )

# Print the data frame.
data
# OUTPUT
# A tibble: 6 x 3
#   vehicle target vehicle_target_mean_enc
#   <chr>    <dbl>                   <int>
# 1 car         23                       1
# 2 bus         34                       2
# 3 bike        56                       3
# 4 bus         78                       2
# 5 car         33                       1
# 6 bike        65                       3
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment