Solution to the regularization exercise with the crab dataset

1 Solution with output

## load packages
library(dplyr)
library(readr)
library(tidyr)
library(ggplot2)
library(ggfortify)
library(glmnetUtils)

## theme for ggplot
theme_set(theme_classic())
theme_update(text = element_text(size = 14))

## load data set
df_crabs <- read_csv("data/06_crab_size.csv")

## fit ordinary least squares (linear) model and check assumptions
model_formula <- size ~ latitude * (water_temp + air_temp) + air_temp_sd + water_temp_sd
lm_OLS <- lm(model_formula, data = df_crabs)
autoplot(lm_OLS, which = 1:2)

## fit elastic net model with cross validation
alpha_vals <- c(0, 0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1)
elnet_cv <- cva.glmnet(model_formula, 
  alpha = alpha_vals,
  data = df_crabs, 
  nfolds = 20)
plot(elnet_cv)

minlossplot(elnet_cv)

selected_alpha <- 0.9
plot(elnet_cv$modlist[[which(selected_alpha == alpha_vals)]])

## visualize effect of lambda on predictor cofficient values
elnet_single <- glmnet(
  model_formula,
  data = df_crabs, 
  alpha = selected_alpha) 
plot(elnet_single, xvar = "lambda", label = T)

## extract coefficients
coef(elnet_cv, s = "lambda.1se", alpha = selected_alpha)
8 x 1 sparse Matrix of class "dgCMatrix"
                              s1
(Intercept)         26.019422294
latitude             .          
water_temp           .          
air_temp            -0.196900040
latitude:water_temp  0.002558175
latitude:air_temp   -0.018204759
air_temp_sd          .          
water_temp_sd        0.013732613
## compare coefficients
tibble(names = names(coef(lm_OLS)), 
       OLS = coef(lm_OLS),
       elasticnet = as.numeric(coef(elnet_cv, 
                                    s = "lambda.1se", 
                                    alpha = selected_alpha)))
# A tibble: 8 × 3
  names                  OLS elasticnet
  <chr>                <dbl>      <dbl>
1 (Intercept)         72.9     26.0    
2 latitude            -1.07     0      
3 water_temp           6.57     0      
4 air_temp            -8.92    -0.197  
5 air_temp_sd          0.406    0.00256
6 water_temp_sd       -0.189   -0.0182 
7 latitude:water_temp -0.141    0      
8 latitude:air_temp    0.165    0.0137 
## calculate mean, min and max of all predictor variables
df_crabs_summary <- df_crabs %>%
  select(latitude, water_temp, air_temp, air_temp_sd, water_temp_sd) %>%
  pivot_longer(everything()) %>%
  group_by(name) %>%
  summarise(min = min(value),
            max = max(value),
            mean = mean(value))
df_crabs_summary
# A tibble: 5 × 4
  name            min   max  mean
  <chr>         <dbl> <dbl> <dbl>
1 air_temp      10.3  21.8  15.2 
2 air_temp_sd    6.39  9.96  8.65
3 latitude      30    42.7  37.7 
4 water_temp    14.0  24.5  17.6 
5 water_temp_sd  4.84  9.12  7.25
## make predictions
df_mean <- df_crabs_summary %>%
  select(name, mean) %>%
  pivot_wider(names_from = name, values_from = mean) 

df_latitude <- expand_grid(
  select(df_mean, -latitude),
  latitude = c(30, 42.7)) %>%
  mutate(ElasticNet = predict(elnet_cv, 
                              newdata = ., s = "lambda.1se", alpha = selected_alpha)[,1],
         OLS = predict(lm_OLS, newdata = .),
         variable = "latitude") %>%
  pivot_longer(c(ElasticNet, OLS), names_to = "method", values_to = "size")  

df_air_temp <- expand_grid(
  select(df_mean, -air_temp),
  air_temp = c(10.3, 21.8)) %>%
  mutate(ElasticNet = predict(elnet_cv, 
                              newdata = ., 
                              s = "lambda.1se", 
                              alpha = selected_alpha)[,1],
         OLS = predict(lm_OLS, newdata = .),
         variable = "air_temp") %>%
  pivot_longer(c(ElasticNet, OLS), 
               names_to = "method", values_to = "size") 
  
df_water_temp <- expand_grid(
  select(df_mean, -water_temp),
  water_temp = c(14, 24.5)) %>%
  mutate(ElasticNet = predict(elnet_cv, 
                              newdata = ., 
                              s = "lambda.1se", alpha = selected_alpha)[,1],
         OLS = predict(lm_OLS, newdata = .),
         variable = "water_temp") %>%
  pivot_longer(c(ElasticNet, OLS), 
               names_to = "method", values_to = "size")  
    
df_pred <- bind_rows(df_air_temp, df_water_temp, df_latitude) %>%
  pivot_longer(air_temp_sd:air_temp) %>%
  filter(variable == name) %>%
  select(-name) %>%
  mutate(variable = factor(variable, 
                           levels = c("latitude", "water_temp", "air_temp"),
                           labels = c("Latitude (degree)", "Water temperature (°C)", "Air temperature (°C)")) )

df_data <- df_crabs %>%
  select(size, latitude, water_temp, air_temp, air_temp_sd, water_temp_sd) %>%
  pivot_longer(-size, names_to = "variable") %>%
  filter(variable %in% c("latitude", "water_temp", "air_temp")) %>%
  mutate(variable = factor(variable, 
                           levels = c("latitude", "water_temp", "air_temp"),
                           labels = c("Latitude (degree)", "Water temperature (°C)", "Air temperature (°C)")) )
  
ggplot(df_data, aes(value, size)) +
  geom_point(alpha = 0.2) +
  geom_line(aes(color = method), data = df_pred,
            linewidth = 1) +
  facet_wrap(~variable, scales = "free_x") +
  labs(x = "") +
  theme_light() +
  theme(text = element_text(size = 14)) 

## Bonus: predictions for all variables
df_crabs_summary_all <- df_crabs %>%
  select(latitude, water_temp, air_temp, air_temp_sd, water_temp_sd) %>%
  mutate(latitude_water_temp = latitude * water_temp,
         latitude_air_temp = latitude * air_temp) %>%
  pivot_longer(everything()) %>%
  group_by(name) %>%
  summarise(min = min(value),
            max = max(value),
            mean = mean(value)) 

df_mean_all <- df_crabs_summary_all %>%
  select(name, mean) %>%
  pivot_wider(names_from = name, values_from = mean) 

df_prep <- df_crabs_summary_all %>%
  rowwise() %>%
  mutate(df = 
      list(expand_grid(
        variable = name,
        "{name}" := c(min, max),
        select(df_mean_all, -name))))

df_pred_all <- bind_rows(df_prep$df) %>%
  mutate(ElasticNet = predict(elnet_cv, newdata = ., 
                              s = "lambda.1se",
                              alpha = selected_alpha)[ , 1],
         OLS = predict(lm_OLS, newdata = .)) %>%
  pivot_longer(c(ElasticNet, OLS), 
               names_to = "method", values_to = "size")  %>%
  pivot_longer(c(-method, -variable, -size), values_to = "value") %>%
  filter(name == variable) %>%
  select(-name) 

df_data_all <- df_crabs %>%
  select(size, latitude, water_temp, air_temp, air_temp_sd, water_temp_sd) %>%
   mutate(
    latitude_water_temp = latitude * water_temp,
    latitude_air_temp = latitude * air_temp) %>%
  pivot_longer(-size, names_to = "variable")

ggplot(df_data_all, aes(value, size)) +
  geom_point(alpha = 0.1) +
  geom_line(aes(color = method), data = df_pred_all,
            linewidth = 1) +
  facet_wrap(~variable, scales = "free_x") +
  labs(x = "") +
  theme_light() +
  theme(text = element_text(size = 14))   

2 Solution for copy-pasting

## load packages
library(dplyr)
library(readr)
library(tidyr)
library(ggplot2)
library(ggfortify)
library(glmnetUtils)

## theme for ggplot
theme_set(theme_classic())
theme_update(text = element_text(size = 14))

## load data set
df_crabs <- read_csv("data/06_crab_size.csv")

## fit ordinary least squares (linear) model and check assumptions
model_formula <- size ~ latitude * (water_temp + air_temp) + air_temp_sd + water_temp_sd
lm_OLS <- lm(model_formula, data = df_crabs)
autoplot(lm_OLS, which = 1:2)

## fit elastic net model with cross validation
alpha_vals <- c(0, 0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1)
elnet_cv <- cva.glmnet(
  model_formula, 
  alpha = alpha_vals,
  data = df_crabs, 
  nfolds = 20)
plot(elnet_cv)
minlossplot(elnet_cv)
selected_alpha <- 0.9
plot(elnet_cv$modlist[[which(selected_alpha == alpha_vals)]])

## visualize effect of lambda on predictor cofficient values
elnet_single <- glmnet(
  model_formula,
  data = df_crabs, 
  alpha = selected_alpha) 
plot(elnet_single, xvar = "lambda", label = T)

## extract coefficients
coef(elnet_cv, s = "lambda.1se", alpha = selected_alpha)

## compare coefficients
tibble(names = names(coef(lm_OLS)), 
       OLS = coef(lm_OLS),
       elasticnet = as.numeric(coef(elnet_cv, 
                                    s = "lambda.1se", 
                                    alpha = selected_alpha)))

## calculate mean, min and max of all predictor variables
df_crabs_summary <- df_crabs %>%
  select(latitude, water_temp, air_temp, air_temp_sd, water_temp_sd) %>%
  pivot_longer(everything()) %>%
  group_by(name) %>%
  summarise(min = min(value),
            max = max(value),
            mean = mean(value))
df_crabs_summary

## make predictions
df_mean <- df_crabs_summary %>%
  select(name, mean) %>%
  pivot_wider(names_from = name, values_from = mean) 

df_latitude <- expand_grid(
  select(df_mean, -latitude),
  latitude = c(30, 42.7)) %>%
  mutate(ElasticNet = predict(elnet_cv, 
                              newdata = ., s = "lambda.1se", alpha = selected_alpha)[,1],
         OLS = predict(lm_OLS, newdata = .),
         variable = "latitude") %>%
  pivot_longer(c(ElasticNet, OLS), names_to = "method", values_to = "size")  

df_air_temp <- expand_grid(
  select(df_mean, -air_temp),
  air_temp = c(10.3, 21.8)) %>%
  mutate(ElasticNet = predict(elnet_cv, 
                              newdata = ., 
                              s = "lambda.1se", 
                              alpha = selected_alpha)[,1],
         OLS = predict(lm_OLS, newdata = .),
         variable = "air_temp") %>%
  pivot_longer(c(ElasticNet, OLS), 
               names_to = "method", values_to = "size") 
  
df_water_temp <- expand_grid(
  select(df_mean, -water_temp),
  water_temp = c(14, 24.5)) %>%
  mutate(ElasticNet = predict(elnet_cv, 
                              newdata = ., 
                              s = "lambda.1se", alpha = selected_alpha)[,1],
         OLS = predict(lm_OLS, newdata = .),
         variable = "water_temp") %>%
  pivot_longer(c(ElasticNet, OLS), 
               names_to = "method", values_to = "size")  
    
df_pred <- bind_rows(df_air_temp, df_water_temp, df_latitude) %>%
  pivot_longer(air_temp_sd:air_temp) %>%
  filter(variable == name) %>%
  select(-name) %>%
  mutate(variable = factor(variable, 
                           levels = c("latitude", "water_temp", "air_temp"),
                           labels = c("Latitude (degree)", "Water temperature (°C)", "Air temperature (°C)")) )

df_data <- df_crabs %>%
  select(size, latitude, water_temp, air_temp, air_temp_sd, water_temp_sd) %>%
  pivot_longer(-size, names_to = "variable") %>%
  filter(variable %in% c("latitude", "water_temp", "air_temp")) %>%
  mutate(variable = factor(variable, 
                           levels = c("latitude", "water_temp", "air_temp"),
                           labels = c("Latitude (degree)", "Water temperature (°C)", "Air temperature (°C)")) )
  
ggplot(df_data, aes(value, size)) +
  geom_point(alpha = 0.2) +
  geom_line(aes(color = method), data = df_pred,
            linewidth = 1) +
  facet_wrap(~variable, scales = "free_x") +
  labs(x = "") +
  theme_light() +
  theme(text = element_text(size = 14)) 
  
  
## Bonus: predictions for all variables
df_crabs_summary_all <- df_crabs %>%
  select(latitude, water_temp, air_temp, air_temp_sd, water_temp_sd) %>%
  mutate(latitude_water_temp = latitude * water_temp,
         latitude_air_temp = latitude * air_temp) %>%
  pivot_longer(everything()) %>%
  group_by(name) %>%
  summarise(min = min(value),
            max = max(value),
            mean = mean(value)) 

df_mean_all <- df_crabs_summary_all %>%
  select(name, mean) %>%
  pivot_wider(names_from = name, values_from = mean) 

df_prep <- df_crabs_summary_all %>%
  rowwise() %>%
  mutate(df = 
      list(expand_grid(
        variable = name,
        "{name}" := c(min, max),
        select(df_mean_all, -name))))

df_pred_all <- bind_rows(df_prep$df) %>%
  mutate(ElasticNet = predict(elnet_cv, newdata = ., 
                              s = "lambda.1se",
                              alpha = selected_alpha)[ , 1],
         OLS = predict(lm_OLS, newdata = .)) %>%
  pivot_longer(c(ElasticNet, OLS), 
               names_to = "method", values_to = "size")  %>%
  pivot_longer(c(-method, -variable, -size), values_to = "value") %>%
  filter(name == variable) %>%
  select(-name) 

df_data_all <- df_crabs %>%
  select(size, latitude, water_temp, air_temp, air_temp_sd, water_temp_sd) %>%
   mutate(
    latitude_water_temp = latitude * water_temp,
    latitude_air_temp = latitude * air_temp) %>%
  pivot_longer(-size, names_to = "variable")

ggplot(df_data_all, aes(value, size)) +
  geom_point(alpha = 0.1) +
  geom_line(aes(color = method), data = df_pred_all,
            linewidth = 1) +
  facet_wrap(~variable, scales = "free_x") +
  labs(x = "") +
  theme_light() +
  theme(text = element_text(size = 14))