Solution to the regularization exercise with the penguins dataset

1 Solution with output

## load packages
library(dplyr)
library(readr)
library(tidyr)
library(ggplot2)
library(ggfortify)
library(glmnetUtils)

## theme for ggplot
theme_set(theme_classic())
theme_update(text = element_text(size = 14))

## load and prepare data
df_penguins <- read_csv("data/05_penguins.csv") %>%
  drop_na() %>%
  unite(species_sex, c(species, sex)) %>%
  mutate(year_c = year - mean(year),
         species_sex = factor(species_sex))

## fit ordinary least squares (linear) model
lm_OLS <- lm(body_mass_g ~ species_sex + year_c,
   data = df_penguins)
lm_OLS

Call:
lm(formula = body_mass_g ~ species_sex + year_c, data = df_penguins)

Coefficients:
                (Intercept)       species_sexAdelie_male  
                   3368.789                      674.658  
species_sexChinstrap_female    species_sexChinstrap_male  
                    158.681                      570.446  
   species_sexGentoo_female       species_sexGentoo_male  
                   1310.853                     2115.961  
                     year_c  
                      3.693  
autoplot(lm_OLS, which = 1:2)

## fit elastic net model with cross validation
alpha_vals <- c(0, 0.1, 0.5, 0.7, 0.8, 0.9, 0.95, 0.99, 1)
elasticnet_cv <- cva.glmnet(
  body_mass_g ~ species_sex + year_c, 
  alpha = alpha_vals,
  data = df_penguins, 
  nfolds = 15,
  use.model.frame = F)

plot(elasticnet_cv)

minlossplot(elasticnet_cv)

plot(elasticnet_cv$modlist[[which(0.5 == alpha_vals)]])

selected_alpha <- 0.8

## visualize the effect of lambda on the coefficients of the predictor variables
elasticnet_single <- glmnet(body_mass_g ~ species_sex + year_c, 
              data = df_penguins, 
              alpha = selected_alpha) 
plot(elasticnet_single, xvar = "lambda", label = T)

## make predictions
df_pred <- expand_grid(
  species_sex = unique(df_penguins$species_sex),
  year_c = unique(df_penguins$year_c)) %>%
  mutate(year = year_c + mean(df_penguins$year), 
         OLS = predict(lm_OLS, newdata = .),
         ElasticNet = predict(elasticnet_cv, newdata = ., 
                              s = "lambda.1se", alpha = selected_alpha,
                              use.model.frame = T)[,1]) %>%
  select(-year_c) %>%
  pivot_longer(c(OLS, ElasticNet), 
               names_to = "method", values_to = "body_mass_g") 

df_hline <- tibble(y = c(
  coef(elasticnet_cv, s = "lambda.1se", alpha = 0.9)[1,1], 
  coef(lm_OLS)[1]),
       method = c("ElasticNet", "OLS"))

ggplot(df_pred, aes(species_sex, body_mass_g)) +
  geom_point(aes(color = method),
             shape = "-", size = 20) +
  geom_dotplot(data = df_penguins,
               binaxis = "y", dotsize = 0.5, 
               stackdir = "center", alpha = 0.7,
               fill = NaN) +
  geom_hline(aes(yintercept = y, color = method), 
             data = df_hline) +
  scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
  labs(x = "", y = "Body mass (g)", 
       color = "Method")

## compare coefficients, we need to change the contrast coding to sum contrasts!
contr_species_sex <- contr.sum(6)
colnames(contr_species_sex) <- levels(df_penguins$species_sex)[1:5] 
contrasts(df_penguins$species_sex) <- contr_species_sex

lm_OLS_sum_contrast <- lm(body_mass_g ~ species_sex + year_c,
   data = df_penguins)

elasticnet_cv_sum_contr <- cv.glmnet(
  body_mass_g ~ species_sex + year_c, 
  alpha = selected_alpha,
  data = df_penguins, 
  nfolds = 15,
  use.model.frame = T)

tibble(names = names(coef(lm_OLS_sum_contrast)), 
       OLS = coef(lm_OLS_sum_contrast),
       ElasticNet = as.numeric(coef(elasticnet_cv_sum_contr, s = "lambda.1se", 
                              alpha = selected_alpha)))
# A tibble: 7 × 3
  names                           OLS ElasticNet
  <chr>                         <dbl>      <dbl>
1 (Intercept)                 4174.       4179. 
2 species_sexAdelie_female    -805.       -762. 
3 species_sexAdelie_male      -130.        -96.5
4 species_sexChinstrap_female -646.       -585. 
5 species_sexChinstrap_male   -235.       -182. 
6 species_sexGentoo_female     506.        362. 
7 year_c                         3.69        0  

2 Solution for copy-pasting

## load packages
library(dplyr)
library(readr)
library(tidyr)
library(ggplot2)
library(ggfortify)
library(glmnetUtils)

## theme for ggplot
theme_set(theme_classic())
theme_update(text = element_text(size = 14))

## load and prepare data
df_penguins <- read_csv("data/05_penguins.csv") %>%
  drop_na() %>%
  unite(species_sex, c(species, sex)) %>%
  mutate(year_c = year - mean(year),
         species_sex = factor(species_sex))

## fit ordinary least squares (linear) model
lm_OLS <- lm(body_mass_g ~ species_sex + year_c,
   data = df_penguins)
lm_OLS
autoplot(lm_OLS, which = 1:2)

## fit elastic net model with cross validation
alpha_vals <- c(0, 0.1, 0.5, 0.7, 0.8, 0.9, 0.95, 0.99, 1)
elasticnet_cv <- cva.glmnet(
  body_mass_g ~ species_sex + year_c, 
  alpha = alpha_vals,
  data = df_penguins, 
  nfolds = 15,
  use.model.frame = F)

plot(elasticnet_cv)
minlossplot(elasticnet_cv)
plot(elasticnet_cv$modlist[[which(0.5 == alpha_vals)]])
selected_alpha <- 0.8

## visualize the effect of lambda on the coefficients of the predictor variables
elasticnet_single <- glmnet(body_mass_g ~ species_sex + year_c, 
              data = df_penguins, 
              alpha = selected_alpha) 
plot(elasticnet_single, xvar = "lambda", label = T)

## make predictions
df_pred <- expand_grid(
  species_sex = unique(df_penguins$species_sex),
  year_c = unique(df_penguins$year_c)) %>%
  mutate(year = year_c + mean(df_penguins$year), 
         OLS = predict(lm_OLS, newdata = .),
         ElasticNet = predict(elasticnet_cv, newdata = ., 
                              s = "lambda.1se", alpha = selected_alpha,
                              use.model.frame = T)[,1]) %>%
  select(-year_c) %>%
  pivot_longer(c(OLS, ElasticNet), 
               names_to = "method", values_to = "body_mass_g") 

df_hline <- tibble(y = c(
  coef(elasticnet_cv, s = "lambda.1se", alpha = 0.9)[1,1], 
  coef(lm_OLS)[1]),
       method = c("ElasticNet", "OLS"))

ggplot(df_pred, aes(species_sex, body_mass_g)) +
  geom_point(aes(color = method),
             shape = "-", size = 20) +
  geom_dotplot(data = df_penguins,
               binaxis = "y", dotsize = 0.5, 
               stackdir = "center", alpha = 0.7,
               fill = NaN) +
  geom_hline(aes(yintercept = y, color = method), 
             data = df_hline) +
  scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
  labs(x = "", y = "Body mass (g)", 
       color = "Method")

## compare coefficients, we need to change the contrast coding to sum contrasts!
contr_species_sex <- contr.sum(6)
colnames(contr_species_sex) <- levels(df_penguins$species_sex)[1:5] 
contrasts(df_penguins$species_sex) <- contr_species_sex

lm_OLS_sum_contrast <- lm(body_mass_g ~ species_sex + year_c,
   data = df_penguins)

elasticnet_cv_sum_contr <- cv.glmnet(
  body_mass_g ~ species_sex + year_c, 
  alpha = selected_alpha,
  data = df_penguins, 
  nfolds = 15,
  use.model.frame = T)

tibble(names = names(coef(lm_OLS_sum_contrast)), 
       OLS = coef(lm_OLS_sum_contrast),
       ElasticNet = as.numeric(coef(elasticnet_cv_sum_contr, s = "lambda.1se", 
                              alpha = selected_alpha)))