library(dplyr)
library(ggplot2)
library(tidyr)
library(stringr)
## theme for ggplot
theme_set(theme_classic())
theme_update(text = element_text(size = 14))Day 5 and 6
Freie Universität Berlin @ Theoretical Ecology
January 19, 2024
library(dplyr)
library(ggplot2)
library(tidyr)
library(stringr)
## theme for ggplot
theme_set(theme_classic())
theme_update(text = element_text(size = 14))\[
\begin{align}
y &= \alpha + \epsilon \\
\epsilon &\sim N(0, \sigma)
\end{align}
\] - \(y\) is the response variable
- \(\alpha\) is the mean of the group
- \(\epsilon\) is the error term with standard deviation \(\sigma\)
df <- tibble(group = factor(rep(c("α"), 20)),
y = rnorm(20, sd = 1, mean = 0.5))
lm1 <- lm(y ~ 1, data = df)
b <- coef(lm1)[1]
df <- mutate(df, y_pred = b)
df_normal <- tibble(
group = factor(rep("a", 100)),
group_num = unclass(group),
y = seq(2*min(df$y), 1.5*max(df$y), length.out = 100),
y_normal = dnorm(y,
mean = rep(b, 100),
sd = sigma(lm1)))
ggplot(df, aes(group, y)) +
geom_point(aes(group, y_pred), color = "blue", size = 10, shape = 95) +
geom_jitter(width = 0.05, height = 0, size = 2, alpha = 0.5) +
geom_path(aes(group_num + y_normal, y, group = group), data = df_normal) +
geom_ribbon(aes(xmin = group_num, xmax = group_num + y_normal, y = y, group = group),
data = df_normal, alpha = 0.1, fill = "blue") \[ \begin{align} y &= \left\{ \begin{array}{ll} \alpha_a + \epsilon \\ \alpha_b + \epsilon \\ \alpha_c + \epsilon \\ \end{array} \right. \\ \epsilon &\sim N(0, \sigma) \end{align} \] or shorter with \(g\) as the group index: \[ \begin{align} y_g &= \alpha_g + \epsilon \\ \epsilon &\sim N(0, \sigma) \end{align} \]
Examples:
df <- tibble(group = factor(rep(c("a", "b", "c"), 20)),
y = rnorm(60, sd = 1, mean = c(0.5, 3.0, 5.0)))
lm1 <- lm(y ~ group, data = df)
b1 <- coef(lm1)[1]
b2 <- b1 + coef(lm1)[2]
b3 <- b1 + coef(lm1)[3]
df <- df %>%
mutate(y_pred = case_when(group == "a" ~ b1,
group == "b" ~ b2,
group == "c" ~ b3))
df_normal <- tibble(
group = factor(rep(c("a", "b", "c"), each = 100)),
group_num = unclass(group),
y = rep(seq(2*min(df$y), 1.5*max(df$y), length.out = 100), 3),
y_normal = dnorm(y,
mean = rep(c(b1, b2, b3), each = 100),
sd = sigma(lm1)))
ggplot(df, aes(group, y)) +
geom_point(aes(group, y_pred), color = "blue", size = 10, shape = 95) +
geom_jitter(width = 0.05, height = 0, size = 2, alpha = 0.5) +
geom_path(aes(group_num + y_normal, y, group = group), data = df_normal) +
geom_ribbon(aes(xmin = group_num, xmax = group_num + y_normal, y = y, group = group),
data = df_normal, alpha = 0.1, fill = "blue") \[
\begin{align}
y &= \alpha + \beta \cdot x + \epsilon \\
\epsilon &\sim N(0, \sigma)
\end{align}
\] - \(y\) is the dependent variable
- \(x\) is the independent variable
- \(\beta\) is the slope, \(\beta = \frac{\Delta y}{\Delta x}\) (see slope triangle in blue)
- \(\alpha\) is the y-axis intercept (purple)
- \(\sigma\) is the standard deviation of the residuals
Examples:
df <- tibble(x = seq(0, 10, length.out = 20),
y = 0.5 * x + 2 + rnorm(20, sd = 1))
lm1 <- lm(y ~ x, data = df)
alpha <- coef(lm1)[1]
beta <- coef(lm1)[2]
s <- sigma(lm1)
df_quantile <- tibble(
alpha = alpha + qnorm(seq(0.025, 0.975, 0.05), sd = s),
beta = beta)
ggplot(df, aes(x, y)) +
geom_abline(aes(slope = beta, intercept = alpha),
data = df_quantile,
color = "grey", linewidth = 0.5) +
geom_abline(intercept = alpha,
slope = beta,
color = "red", linewidth = 1.5) +
geom_point(x = 0, y = alpha, color = "purple",
size = 2) +
geom_segment(aes(x = 5, xend = 9, y = beta * 5 + alpha,
yend = beta * 5 + alpha),
color = "cornflowerblue", linewidth = 1) +
geom_segment(aes(x = 9, xend = 9, y = beta * 5 + alpha,
yend = beta * 9 + alpha),
color = "cornflowerblue", linewidth = 1) +
geom_segment(aes(x = 0, y = alpha,
xend = 0, yend = 0),
color = "purple", linewidth = 0.5, linetype = "dashed") +
geom_segment(aes(x = 0, y = alpha,
xend = -1, yend = alpha),
color = "purple", linewidth = 0.5,
arrow = arrow(type = "closed", length = unit(0.30, "cm"))) +
annotate("text", x = 7, y = beta * 5 + alpha - 0.2,
label = "Δx") +
annotate("text", x = 9.3, y = beta * 7 + alpha,
label = "Δy") +
annotate("text", x = -0.5, y = alpha + 0.2,
label = "α") +
geom_point(size = 3) +
coord_fixed()\[ \begin{align} y_g &= \alpha_g + \beta \cdot x + \epsilon \\ \epsilon &\sim N(0, \sigma) \end{align} \]
Example:
df <- tibble(group = factor(rep(c("a", "b", "c"), 30)),
x = rnorm(90, sd = 4, mean = 10),
y = x * 0.5 + rep(c(12, 8, 2), 30) + rnorm(90, sd = 1, mean = 0))
lm1 <- lm(y ~ x + group, data = df)
df <- df %>%
mutate(y_pred = predict(lm1))
df_quantile <- df %>%
slice(rep(1:n(), each = 20)) %>%
mutate(
p = rep(seq(0.025, 0.975, 0.05), 90),
q = qnorm(p, sd = sigma(lm1), mean = y_pred)) %>%
arrange(group)
ggplot(df, aes(x,y, color = group)) +
geom_line(aes(y = y_pred), linewidth = 1) +
geom_line(aes(x, q, group = interaction(p, group)), data = df_quantile,
alpha = 0.2, color = "grey1") +
geom_point(size = 2, alpha = 0.8) \[ \begin{align} y_{gk} &= \alpha_g + \alpha_k + \epsilon \\ \epsilon &\sim N(0, \sigma) \end{align} \]
Examples:
df <- tibble(
species = factor(rep(c("species a", "species b", "species c"), 40)),
soil_type = factor(rep(c("loam", "sand"), each = 60)),
biomass = rnorm(120, sd = 5, mean = rep(c(100, 70), each = 60) + rep(c(20, 0, -20), 40))
)
lm1 <- lm(biomass ~ species + soil_type, data = df)
df <- df %>%
mutate(y_pred = predict(lm1))
df_normal <- df %>%
group_by(species, soil_type) %>%
reframe(q = seq(0.8 * min(biomass), 1.2 * max(biomass), 0.01)) %>%
mutate(y_pred = predict(lm1, newdata = .),
y_dens = dnorm(q, mean = y_pred, sd = sigma(lm1)),
group = interaction(species, soil_type),
group_num = unclass(group))
ggplot(df, aes(interaction(species, soil_type), biomass)) +
geom_jitter(width = 0.05, height = 0, size = 1.5, alpha = 0.3) +
geom_point(aes(y = y_pred), color = "blue", size = 8, shape = 95) +
geom_path(aes(group_num + 5 * y_dens, q, group = group), data = df_normal) +
geom_ribbon(aes(xmin = group_num, xmax = group_num + 5 * y_dens, y = q, group = group),
data = df_normal, alpha = 0.1, fill = "blue") +
scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
labs(x = "species, soil type")with the two continuous variables \(x_1\) and \(x_2\): \[ \begin{align} y &= \alpha + \beta_1 \cdot x_1 + \beta_2 \cdot x_2 + \epsilon \\ \epsilon &\sim N(0, \sigma) \end{align} \]
df <- tibble(x1 = seq(0, 10, length.out = 50),
x2 = rnorm(50, mean = 15, sd = 2),
y = rnorm(50, mean = 4 - 0.5 * x1 + 2.0 * x2, sd = 1))
lm2 <- lm(y ~ x1 + x2, data = df)
b <- coef(lm2)[1]
m1 <- coef(lm2)[2]
m2 <- coef(lm2)[3]
df_pred1 <- df %>%
mutate(y_pred = b + m1 * x1 + m2 * mean(x2),
x = x1,
type = "variable: x1; x2 at mean") %>%
select(-x1, -x2)
df_pred1_quantiles <- df_pred1 %>%
slice(rep(1:n(), each = 20)) %>%
mutate(
p = rep(seq(0.025, 0.975, 0.05), 50),
q = qnorm(p, sd = sigma(lm2), mean = y_pred)) %>%
select(p, x, q, type)
df_pred2 <- df %>%
mutate(y_pred = b + m1 * mean(x1) + m2 * x2,
x = x2,
type = "variable: x2; x1 at mean") %>%
select(-x1, -x2)
df_pred2_quantiles <- df_pred2 %>%
slice(rep(1:n(), each = 20)) %>%
mutate(
p = rep(seq(0.025, 0.975, 0.05), 50),
q = qnorm(p, sd = sigma(lm2), mean = y_pred)) %>%
select(p, x, q, type)
df_plot <- bind_rows(df_pred1, df_pred2)
df_plot_quantiles <- bind_rows(df_pred1_quantiles, df_pred2_quantiles)
ggplot(df_plot, aes(x, y)) +
geom_point(alpha = 0.6, size = 2) +
geom_line(aes(y = y_pred), color = "red", linewidth = 1.5) +
geom_line(aes(x, q, group = p), data = df_plot_quantiles,
color = "black", alpha = 0.2) +
facet_wrap(~type, scales = "free_x") +
theme_light() +
theme(text = element_text(size = 14))recall the equation for a linear model with one categorical and one continous predictor variable:
\[ \begin{align} y_g &= \alpha_g + \beta \cdot x + \epsilon \\ \epsilon &\sim N(0, \sigma) \end{align} \]
if you want to include an “interaction” the equation changes to: \[ \begin{align} y_g &= \alpha_g + \beta_g \cdot x + \epsilon \\ \epsilon &\sim N(0, \sigma) \end{align} \]
df <- tibble(group = factor(rep(c("a", "b", "c"), 30)),
x = rnorm(90, sd = 4, mean = 10),
y = x * rep(c(0.2, 0.5, 0.7), 30) + rep(c(12, 8, 2), 30) + rnorm(90, sd = 1, mean = 0))
lm1 <- lm(y ~ x * group, data = df)
df <- df %>%
mutate(y_pred = predict(lm1))
df_quantile <- df %>%
slice(rep(1:n(), each = 20)) %>%
mutate(
p = rep(seq(0.025, 0.975, 0.05), 90),
q = qnorm(p, sd = sigma(lm1), mean = y_pred)) %>%
arrange(group)
ggplot(df, aes(x,y, color = group)) +
geom_line(aes(y = y_pred), linewidth = 1) +
geom_line(aes(x, q, group = interaction(p, group)), data = df_quantile,
alpha = 0.2, color = "grey1") +
geom_point(size = 2, alpha = 0.8) recall the equation for a linear model with two categorical predictor variable:
\[ \begin{align} y_{gk} &= \alpha_g + \alpha_k + \epsilon \\ \epsilon &\sim N(0, \sigma) \end{align} \]
if you want to include an “interaction” the equation changes to: \[ \begin{align} y_{gk} &= \alpha_{gk} + \epsilon \\ \epsilon &\sim N(0, \sigma) \end{align} \]
df <- tibble(
species = factor(rep(c("species a", "species b", "species c"), 40)),
soil_type = factor(rep(c("loam", "sand"), each = 60)),
biomass = rnorm(120, sd = 5, mean = rep(c(100, 70), each = 60) + c(rep(c(20, 0, -20), 20), rep(c(0, 10, 10), 20)) )
)
lm1 <- lm(biomass ~ species * soil_type, data = df)
df <- df %>%
mutate(y_pred = predict(lm1))
df_normal <- df %>%
group_by(species, soil_type) %>%
reframe(q = seq(0.8 * min(biomass), 1.2 * max(biomass), 0.01)) %>%
mutate(y_pred = predict(lm1, newdata = .),
y_dens = dnorm(q, mean = y_pred, sd = sigma(lm1)),
group = interaction(species, soil_type),
group_num = unclass(group))
ggplot(df, aes(interaction(species, soil_type), biomass)) +
geom_jitter(width = 0.05, height = 0, size = 1.5, alpha = 0.3) +
geom_point(aes(y = y_pred), color = "blue", size = 8, shape = 95) +
geom_path(aes(group_num + 5 * y_dens, q, group = group), data = df_normal) +
geom_ribbon(aes(xmin = group_num, xmax = group_num + 5 * y_dens, y = q, group = group),
data = df_normal, alpha = 0.1, fill = "blue") +
scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
labs(x = "species, soil type")recall the equation for a linear model with two continuous predictor variable:
\[ \begin{align} y_{gk} &= \alpha + \beta_1 \cdot x_1 + \beta_2 \cdot x_2 + \epsilon \\ \epsilon &\sim N(0, \sigma) \end{align} \]
if you want to include an “interaction” the equation changes to: \[ \begin{align} y_{gk} &= \alpha + \beta_1 \cdot x_1 + \beta_2 \cdot x_2 + \beta_3\cdot x_1 \cdot x_2 + \epsilon \\ \epsilon &\sim N(0, \sigma) \end{align} \]
df <- tibble(x1 = seq(0, 10, length.out = 50),
x2 = rnorm(50, mean = 150, sd = 100),
y = rnorm(50, mean = 4 - 2 * x1 + -20.0 * x2 + 30.0*x1*x2, sd = 1))
lm2 <- lm(y ~ x1 * x2, data = df)
df_pred1 <- df %>%
slice(rep(1:n(), 3)) %>%
mutate(x2 = rep(c(0.9 * mean(x2), mean(x2), 1.1 * mean(x2)), each = n()/3),
group = rep(c("0.9 · mean", "1.0 · mean", "1.1 · mean"), each = n()/3)) %>%
mutate(
y_pred = predict(lm2, newdata = .),
x = x1,
type = "variable: x1; x2 fixed") %>%
select(-x1, -x2)
df_pred2 <- df %>%
slice(rep(1:n(), 3)) %>%
mutate(x1 = rep(c(0.7 * mean(x1), mean(x1), 1.3 * mean(x1)), each = n()/3),
group = rep(c("0.9 · mean", "1.0 · mean", "1.1 · mean"), each = n()/3)) %>%
mutate(
y_pred = predict(lm2, newdata = .),
x = x2,
type = "variable: x2; x1 fixed") %>%
select(-x1, -x2)
df_plot <- bind_rows(df_pred1, df_pred2)
ggplot(df_plot, aes(x, y)) +
geom_point(alpha = 0.6, size = 2) +
geom_line(aes(y = y_pred, color = group), linewidth = 1) +
facet_wrap(~type, scales = "free_x") +
theme_light() +
theme(text = element_text(size = 14)) +
labs(color = "value of fixed \nvariable")lm1 <- lm(y ~ x, data = df)
df <- df %>%
mutate(
y_pred = predict(lm1),
residual = y - y_pred)
ggplot(df, aes(x, y)) +
geom_rect(aes(xmin = x, xmax = x + abs(residual), ymin = y_pred, ymax = y),
alpha = 0.2) +
geom_point(size = 3) +
geom_line(aes(y = y_pred), color = "red", linewidth = 1) +
geom_segment(aes(xend = x, yend = y_pred), alpha = 0.5) +
coord_fixed()\[ \begin{align} RSS &= \sum_{i=1}^n (y_i - \hat{y}_i)^2 \\ TSS &= \sum_{i=1}^n (y_i - \bar{y})^2 \\ R^2 &= 1 - \frac{RSS}{TSS} \end{align} \]
df <- tibble(x = seq(0, 10, length.out = 20),
y1 = 0.8 * x + 1 + rnorm(20, sd = 2),
y2 = 0.8 * x + 1 + rnorm(20, sd = 0.2))
lm1 <- lm(y1 ~ x, data = df)
lm2 <- lm(y2 ~ x, data = df)
r2_1 <- str_glue("R²: {round(summary(lm1)$r.squared, 3)}")
r2_2 <- str_glue("R²: {round(summary(lm2)$r.squared, 3)}")
df %>%
pivot_longer(cols = c(y1, y2)) %>%
mutate(name = factor(name, labels = c(r2_1, r2_2))) -> df_plot
df %>%
mutate(y1 = predict(lm1),
y2 = predict(lm2)) %>%
pivot_longer(cols = c(y1, y2)) %>%
mutate(name = factor(name, labels = c(r2_1, r2_2))) -> df_pred
ggplot(df_plot, aes(x, value)) +
geom_point(size = 3, alpha = 0.5) +
geom_line(data = df_pred, linewidth = 1, color = "red") +
facet_wrap(~name) +
theme_light() +
theme(text = element_text(size = 14)) +
labs(y = "y")\[ R^2_{\text{adj}} = 1 - \frac{RSS/(n-p-1)}{TSS/(n-1)} \]
with sample size \(n\)
summary function, the standard \(R^2\) is reported as Multiple R-squared and the adjusted \(R^2\) as Adjusted R-squared
df <- tibble(
x = seq(1, 20, length.out = 40),
y = rnorm(40, mean = 20 + 10*x, sd = 20))
lm1 <- lm(y ~ x, data = df)
df_pred <- df %>%
bind_cols(predict(lm1, interval = "prediction",
newdata = .)) %>%
mutate(lwr_qt = qnorm(0.025, mean = fit, sd = sigma(lm1)),
upr_qt = qnorm(0.975, mean = fit, sd = sigma(lm1)))
ggplot(df, aes(x, y)) +
geom_point(size = 4, alpha = 0.7) +
geom_ribbon(aes(ymin = lwr_qt, ymax = upr_qt, color = "quantile"),
data = df_pred, alpha = 0.0, fill = "black") +
geom_ribbon(aes(ymin = lwr, ymax = upr, color = "predictive interval"),
data = df_pred, alpha = 0.05, fill = "red") +
geom_line(aes(y = fit), data = df_pred,
color = "red", linewidth = 1) +
labs(color = "type")df_pred <- df %>%
bind_cols(predict(lm1, interval = "prediction",
newdata = .)) %>%
rename(lwr_pred = lwr, upr_pred = upr) %>%
select(-fit) %>%
bind_cols(predict(lm1, interval = "confidence",
newdata = .))
ggplot(df, aes(x, y)) +
geom_point(size = 4, alpha = 0.7) +
geom_ribbon(aes(ymin = lwr, ymax = upr, color = "confidence interval"),
data = df_pred, alpha = 0.2, fill = "black") +
geom_ribbon(aes(ymin = lwr_pred, ymax = upr_pred, color = "predictive interval"),
data = df_pred, alpha = 0.1, fill = "black") +
geom_line(aes(y = fit), data = df_pred,
color = "red", linewidth = 1) +
labs(color = "type")lb <- c()
ub <- c()
for (i in 1:100) {
x <- rnorm(100, mean = 10, sd = 1)
confidence_interval <- confint(lm(x ~ 1))
lb <- c(lb, confidence_interval[1])
ub <- c(ub, confidence_interval[2])
}
tibble(lb, ub, n = 1:100) %>%
mutate(in_ci = lb < 10 & ub > 10) %>%
ggplot() +
geom_segment(aes(x = lb, xend = ub,
y = n, yend = n,
color = in_ci)) +
geom_vline(xintercept = 10) +
labs(x = "Confidence interval",
color = "Confidence interval\ncontains population\nmean?")| Term | Number of continuous predictor variables | Number of categorical predictor variables |
|---|---|---|
| Simple linear regression | 1 | 0 |
| Multiple linear regression | > 1 | |
| Analysis of variance (ANOVA) | 0 | 1 |
| Analysis of covariance (ANCOVA) | 1 | 1 |
Linear models