Regularization for linear models in `R`

Day 7

Felix Nößler

Freie Universität Berlin @ Theoretical Ecology

January 19, 2024

Reproduce slides

library(ggplot2)
library(dplyr)
library(tidyr)
library(forcats)
library(stringr)
library(readr)
library(purrr)
library(glmnetUtils)

## theme for ggplot
theme_set(theme_classic())
theme_update(text = element_text(size = 14))

1 A note on the library `glmnetUtils`

glmnet is a package for fitting regularized linear models
glmnetUtils is a package that extends glmnet with some functions as the formula interface

A tutorial on glmnet can be found here

2 Introduction to the dataset

20 years of data on the population size of the Rocky Mountain Apollo butterfly (Parnassius smintheus)
What are wheather covariates that influence the population size of the butterfly?
Rt is the change in population size from year \(t\) to year \(t+1\)
meada is the letter of the meadow where the butterfly was observed
72 wheather covariates

butterfly <- read_csv("data/07_butterfly.csv")

df <- butterfly %>%
  filter(! is.na(Rt)) %>%
  select(-year, -Nt, -logNt) %>%
  select_if(~ all(!is.na(.)))

J. Roland and S. F. Matter, “Pivotal effect of early‐winter temperatures and snowfall on population growth of alpine Parnassius smintheus butterflies,” Ecological Monographs, 2016. doi: 10.1002/ecm.1225.

the dataset is also used in:
A. T. Tredennick, G. Hooker, S. P. Ellner, and P. B. Adler, “A practical guide to selecting models for exploration, inference, and prediction in ecology,” Ecology, 2021. doi: 10.1002/ecy.3336.

3 Find a good value for \(\alpha\)

lm_cva <- cva.glmnet(Rt ~ ., data = df, 
                     alpha = c(0, 0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1))
plot(lm_cva)

look for the colour of the curve with the lowest minimum and then look in the legendbox for the corresponding value of \(\alpha\)
here: \(\alpha > 0.7\) are good values

4 Find a good value for \(\lambda\)

lm_cv <- cv.glmnet(Rt ~ ., data = df, alpha = 0.9, nfolds = 30)
plot(lm_cv)

mean squared error (MSE) with standard error (SE) for different values of \(\lambda\)
the left vertical line is the value of \(\lambda\) with the lowest cross-validated error (lambda.min)
the right vertical line is the value of \(\lambda\) with the lowest cross-validated error + 1 standard error (lambda.1se)

5 Coefficient value as a function of \(\lambda\)

lm1 <- glmnet(Rt ~ ., data = df, alpha = 0.9) 
plot(lm1, xvar = "lambda")

6 Extract coefficients

coef(lm_cv, s = "lambda.1se") # "lambda.1se", "lambda.min"

84 x 1 sparse Matrix of class "dgCMatrix"
                        s1
(Intercept)  -0.1859470136
meadaF        .           
meadag        .           
meadaG        .           
meadaJ        .           
meadaL        .           
meadaM        .           
meadaN        .           
meadaO        .           
meadaR        .           
meadaS        .           
meadaY        .           
maxsnow       .           
Datesnowmelt  .           
jansnow       .           
jansnowfall   .           
janmean       .           
janmeanmax    .           
janmeanmin    .           
janextmax     .           
janextmin     .           
febsnow       .           
febsnowfall   .           
febmeanmin    .           
febextmin     .           
marsnow       .           
marsnowfall   .           
marmean       .           
marmeanmax    0.0001697007
marmeanmin    .           
marextmax     .           
marextmin     .           
aprsnow       .           
aprsnowfall   .           
maysnow       .           
maymean       0.1131836622
maymeanmax    .           
maymeanmin    0.0078090816
mayextmax     .           
mayextmin     .           
junsnow       .           
junmean       .           
junmeanmax    .           
junmeanmin    0.0071916610
junextmax     .           
junextmin     .           
julsnow       .           
julmean       .           
julmeanmax    .           
julmeanmin    .           
julextmax     .           
julextmin     .           
augsnow       0.0124387278
augmean       .           
augmeanmax    .           
augmeanmin    .           
augextmax    -0.0060987645
augextmin     0.0196316088
sepsnow       .           
sepsnowfall   .           
sepmean       .           
sepmeanmax    .           
sepmeanmin    .           
sepextmax     .           
sepextmin    -0.0245992877
octmean       .           
octmeanmax    .           
octmeanmin    0.0335255661
octextmax     .           
octextmin     .           
novsnow       .           
novsnowfall   .           
novmean       .           
novmeanmax    .           
novmeanmin   -0.0302745006
novextmax    -0.0474869699
novextmin     .           
decsnow       .           
decsnowfall   .           
decmean       .           
decmeanmax    .           
decmeanmin    .           
decextmax     .           
decextmin    -0.0064699643

7 Visualize the standardized coefficients

Code

df_std <- df %>% 
  mutate_at(vars(-Rt, -meada), scale) 
  
lm_std <- glmnet(Rt ~ ., data = df_std, 
       alpha = 0.9, 
       lambda = lm_cv$lambda.1se,
       standardize = F) 

c1 <- coef(lm_std)
coef_df <- tibble(name = rownames(c1), value = as.numeric(c1))

coef_continous <- coef_df %>%
  filter(abs(value) > 1e-8) %>%
  filter(name  != "(Intercept)" & ! str_starts(name, "meda")) 

ggplot(coef_continous, aes(x = value, y = fct_reorder(name, value))) +
  geom_col() +
  labs(y = "parameter name", x = "standardized coefficient value")

8 Calculate the \(R^2\)

df_pred <- df %>%
  mutate(Rt_pred = predict(lm_cv, newdata = ., s = "lambda.1se"),
         residual = Rt - Rt_pred,
         residual_total = Rt - mean(Rt))
  
RSS <- sum(df_pred$residual ^ 2)
RSS_tot <- sum(df_pred$residual_total ^ 2)

RSS / RSS_tot

[1] 0.5034184

9 Marginal effect plots