Linear models

Day 5 and 6

Felix Nößler

Freie Universität Berlin @ Theoretical Ecology

January 19, 2024

Reproduce slides

library(dplyr)
library(ggplot2)
library(tidyr)
library(stringr)

## theme for ggplot
theme_set(theme_classic())
theme_update(text = element_text(size = 14))

1 Overview

1.1 Introduction

\[ \begin{align} y &= \alpha + \epsilon \\ \epsilon &\sim N(0, \sigma) \end{align} \] - \(y\) is the response variable
- \(\alpha\) is the mean of the group
- \(\epsilon\) is the error term with standard deviation \(\sigma\)

we increase the complexity from a normal distribution to linear models:
- we will add more groups to the categorical predictor variable
- we will add a continuous predictor variable
in both cases: the mean of the normal distribution is shifted
the standard deviation of the residuals stays constant across groups and across values of the continuous predictor variable

Code

df <- tibble(group = factor(rep(c("α"), 20)),
             y = rnorm(20, sd = 1, mean = 0.5))

lm1 <- lm(y ~ 1, data = df)

b <- coef(lm1)[1]

df <- mutate(df, y_pred = b)

df_normal <- tibble(
  group = factor(rep("a", 100)),
  group_num = unclass(group),
  y = seq(2*min(df$y), 1.5*max(df$y), length.out = 100),
  y_normal = dnorm(y, 
                   mean = rep(b, 100),
                   sd = sigma(lm1))) 

ggplot(df, aes(group, y)) +
  geom_point(aes(group, y_pred), color = "blue", size = 10, shape = 95) +
  geom_jitter(width = 0.05, height = 0, size = 2, alpha = 0.5) +
  geom_path(aes(group_num + y_normal, y, group = group), data = df_normal) +
  geom_ribbon(aes(xmin = group_num, xmax = group_num + y_normal, y = y, group = group), 
              data = df_normal, alpha = 0.1, fill = "blue")

1.2 One categorical predictor variable

we have a categorical predictor variable with three groups: \(a\), \(b\), \(c\)

\[ \begin{align} y &= \left\{ \begin{array}{ll} \alpha_a + \epsilon \\ \alpha_b + \epsilon \\ \alpha_c + \epsilon \\ \end{array} \right. \\ \epsilon &\sim N(0, \sigma) \end{align} \] or shorter with \(g\) as the group index: \[ \begin{align} y_g &= \alpha_g + \epsilon \\ \epsilon &\sim N(0, \sigma) \end{align} \]

Examples:

biomass production for mown, grazed, mixed grasslands
bacteria density of control, treatment 1, treatment 2
body size of species 1, species 2, species 3

Code

df <- tibble(group = factor(rep(c("a", "b", "c"), 20)),
             y = rnorm(60, sd = 1, mean = c(0.5, 3.0, 5.0)))

lm1 <- lm(y ~ group, data = df)

b1 <- coef(lm1)[1]
b2 <- b1 + coef(lm1)[2]
b3 <- b1 + coef(lm1)[3]

df <- df %>% 
  mutate(y_pred = case_when(group == "a" ~ b1,
                            group == "b" ~ b2,
                            group == "c" ~ b3))

df_normal <- tibble(
  group = factor(rep(c("a", "b", "c"), each = 100)),
  group_num = unclass(group),
  y = rep(seq(2*min(df$y), 1.5*max(df$y), length.out = 100), 3),
  y_normal = dnorm(y, 
                   mean = rep(c(b1, b2, b3), each = 100),
                   sd = sigma(lm1))) 

ggplot(df, aes(group, y)) +
  geom_point(aes(group, y_pred), color = "blue", size = 10, shape = 95) +
  geom_jitter(width = 0.05, height = 0, size = 2, alpha = 0.5) +
  geom_path(aes(group_num + y_normal, y, group = group), data = df_normal) +
  geom_ribbon(aes(xmin = group_num, xmax = group_num + y_normal, y = y, group = group), 
              data = df_normal, alpha = 0.1, fill = "blue")

1.3 One continous predictor variable

\[ \begin{align} y &= \alpha + \beta \cdot x + \epsilon \\ \epsilon &\sim N(0, \sigma) \end{align} \] - \(y\) is the dependent variable
- \(x\) is the independent variable
- \(\beta\) is the slope, \(\beta = \frac{\Delta y}{\Delta x}\) (see slope triangle in blue)
- \(\alpha\) is the y-axis intercept (purple)
- \(\sigma\) is the standard deviation of the residuals

Examples:

body size of fishes associated to harvest intensity
biomass production of grasslands associated to nitrogen fertilization

Code

df <- tibble(x = seq(0, 10, length.out = 20),
             y = 0.5 * x + 2 + rnorm(20, sd = 1))
lm1 <- lm(y ~ x, data = df)

alpha <- coef(lm1)[1]
beta <- coef(lm1)[2]
s <- sigma(lm1)

df_quantile <- tibble(
  alpha = alpha + qnorm(seq(0.025, 0.975, 0.05), sd = s),
  beta = beta)

ggplot(df, aes(x, y)) +
  geom_abline(aes(slope = beta, intercept = alpha),
              data      = df_quantile,
              color = "grey", linewidth = 0.5) +
  geom_abline(intercept = alpha,
              slope = beta,
              color = "red", linewidth = 1.5) +
  geom_point(x = 0, y = alpha, color = "purple",
             size = 2) +
  geom_segment(aes(x = 5, xend = 9, y = beta * 5 + alpha,
                   yend = beta * 5 + alpha),
               color = "cornflowerblue", linewidth = 1) +
  geom_segment(aes(x = 9, xend = 9, y = beta * 5 + alpha,
                   yend = beta * 9 + alpha),
               color = "cornflowerblue", linewidth = 1) +
  geom_segment(aes(x = 0, y = alpha,
                   xend = 0, yend = 0),
               color = "purple", linewidth = 0.5, linetype = "dashed") +
  geom_segment(aes(x = 0, y = alpha,
                   xend = -1, yend = alpha),
               color = "purple", linewidth = 0.5, 
               arrow = arrow(type = "closed", length = unit(0.30, "cm"))) +
  annotate("text", x = 7, y = beta * 5 + alpha - 0.2,
           label = "Δx") +
  annotate("text", x = 9.3, y = beta * 7 + alpha,
           label = "Δy") +
  annotate("text", x = -0.5, y = alpha + 0.2,
           label = "α") +
  geom_point(size = 3) +
  coord_fixed()

1.4 One continous and one categorical predictor variable

\[ \begin{align} y_g &= \alpha_g + \beta \cdot x + \epsilon \\ \epsilon &\sim N(0, \sigma) \end{align} \]

Example:

body size of fishes associated to harvest intensity for different fish species
biomass production of grasslands associated to nitrogen fertilization for different soil types

Code

df <- tibble(group = factor(rep(c("a", "b", "c"), 30)),
             x = rnorm(90, sd = 4, mean = 10),
             y = x * 0.5 + rep(c(12, 8, 2), 30) + rnorm(90, sd = 1, mean = 0))

lm1 <- lm(y ~ x + group, data = df)

df <- df %>% 
  mutate(y_pred = predict(lm1))

df_quantile <- df %>%
  slice(rep(1:n(), each = 20)) %>%
  mutate(
    p = rep(seq(0.025, 0.975, 0.05), 90),
    q = qnorm(p, sd = sigma(lm1), mean = y_pred)) %>%
  arrange(group)

ggplot(df, aes(x,y, color = group)) +
  geom_line(aes(y = y_pred), linewidth = 1) +
  geom_line(aes(x, q, group = interaction(p, group)), data = df_quantile, 
            alpha = 0.2, color = "grey1") + 
  geom_point(size = 2, alpha = 0.8)

1.5 Two categorical predictor variables

\[ \begin{align} y_{gk} &= \alpha_g + \alpha_k + \epsilon \\ \epsilon &\sim N(0, \sigma) \end{align} \]

when we have a first categorical predictor variable with two groups and a second categorical predictor variable with three groups, we have six groups in total

Examples:

body size of fishes associated to harvest intensity for different fish species and different locations
biomass production associated with different soil types and different species

Code

df <- tibble(
  species = factor(rep(c("species a", "species b", "species c"), 40)),
  soil_type = factor(rep(c("loam", "sand"), each = 60)),
  biomass = rnorm(120, sd = 5, mean = rep(c(100, 70), each = 60) + rep(c(20, 0, -20), 40))
)

lm1 <- lm(biomass ~ species + soil_type, data = df)

df <- df %>% 
  mutate(y_pred = predict(lm1))

df_normal <- df %>%
  group_by(species, soil_type) %>%
  reframe(q = seq(0.8 * min(biomass), 1.2 * max(biomass), 0.01)) %>% 
  mutate(y_pred = predict(lm1, newdata = .),
         y_dens = dnorm(q, mean = y_pred, sd = sigma(lm1)),
         group = interaction(species, soil_type),
         group_num = unclass(group)) 
  
ggplot(df, aes(interaction(species, soil_type), biomass)) +
  geom_jitter(width = 0.05, height = 0, size = 1.5, alpha = 0.3) +
  geom_point(aes(y = y_pred), color = "blue", size = 8, shape = 95) +
  geom_path(aes(group_num + 5 * y_dens, q, group = group), data = df_normal) +
  geom_ribbon(aes(xmin = group_num, xmax = group_num + 5 * y_dens, y = q, group = group), 
              data = df_normal, alpha = 0.1, fill = "blue") +
  scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
  labs(x = "species, soil type")

1.6 Two continous predictor variables

with the two continuous variables \(x_1\) and \(x_2\): \[ \begin{align} y &= \alpha + \beta_1 \cdot x_1 + \beta_2 \cdot x_2 + \epsilon \\ \epsilon &\sim N(0, \sigma) \end{align} \]

Code

df <- tibble(x1 = seq(0, 10, length.out = 50),
             x2 = rnorm(50, mean = 15, sd = 2),
             y = rnorm(50, mean = 4 - 0.5 * x1 + 2.0 * x2, sd = 1))

lm2 <- lm(y ~ x1 + x2, data = df)

b <- coef(lm2)[1]
m1 <- coef(lm2)[2]
m2 <- coef(lm2)[3]

df_pred1 <- df %>% 
  mutate(y_pred = b + m1 * x1 + m2 * mean(x2),
         x = x1,
         type = "variable: x1; x2 at mean") %>% 
  select(-x1, -x2)

df_pred1_quantiles <- df_pred1 %>%
  slice(rep(1:n(), each = 20)) %>%
  mutate(
    p = rep(seq(0.025, 0.975, 0.05), 50),
    q = qnorm(p, sd = sigma(lm2), mean = y_pred)) %>% 
  select(p, x, q, type)

df_pred2 <- df %>% 
  mutate(y_pred = b + m1 * mean(x1) + m2 * x2,
         x = x2,
         type = "variable: x2; x1 at mean") %>% 
  select(-x1, -x2)

df_pred2_quantiles <- df_pred2 %>%
  slice(rep(1:n(), each = 20)) %>%
  mutate(
    p = rep(seq(0.025, 0.975, 0.05), 50),
    q = qnorm(p, sd = sigma(lm2), mean = y_pred)) %>% 
  select(p, x, q, type)

df_plot <- bind_rows(df_pred1, df_pred2)
df_plot_quantiles <- bind_rows(df_pred1_quantiles, df_pred2_quantiles)

ggplot(df_plot, aes(x, y)) +
  geom_point(alpha = 0.6, size = 2) +
  geom_line(aes(y = y_pred), color = "red", linewidth = 1.5) +
  geom_line(aes(x, q, group = p), data = df_plot_quantiles,
            color = "black", alpha = 0.2) +
  facet_wrap(~type, scales = "free_x") +
  theme_light() +
  theme(text = element_text(size = 14))

it is hard to visualize both variables at the same time, for a 3D plot with a regression plane see here

2 Interaction

2.1 Interaction between continous and categorical predictor variables

recall the equation for a linear model with one categorical and one continous predictor variable:

\[ \begin{align} y_g &= \alpha_g + \beta \cdot x + \epsilon \\ \epsilon &\sim N(0, \sigma) \end{align} \]

if you want to include an “interaction” the equation changes to: \[ \begin{align} y_g &= \alpha_g + \beta_g \cdot x + \epsilon \\ \epsilon &\sim N(0, \sigma) \end{align} \]

the slope \(\beta\) is dependent on the group \(g\) of the categorical predictor variable

Code

df <- tibble(group = factor(rep(c("a", "b", "c"), 30)),
             x = rnorm(90, sd = 4, mean = 10),
             y = x * rep(c(0.2, 0.5, 0.7), 30) + rep(c(12, 8, 2), 30) + rnorm(90, sd = 1, mean = 0))

lm1 <- lm(y ~ x * group, data = df)

df <- df %>% 
  mutate(y_pred = predict(lm1))

df_quantile <- df %>%
  slice(rep(1:n(), each = 20)) %>%
  mutate(
    p = rep(seq(0.025, 0.975, 0.05), 90),
    q = qnorm(p, sd = sigma(lm1), mean = y_pred)) %>%
  arrange(group)

ggplot(df, aes(x,y, color = group)) +
  geom_line(aes(y = y_pred), linewidth = 1) +
  geom_line(aes(x, q, group = interaction(p, group)), data = df_quantile, 
            alpha = 0.2, color = "grey1") + 
  geom_point(size = 2, alpha = 0.8)

2.1 Interaction between continous and categorical predictor variables

2.2 Interaction between two categorical predictor variables

recall the equation for a linear model with two categorical predictor variable:

\[ \begin{align} y_{gk} &= \alpha_g + \alpha_k + \epsilon \\ \epsilon &\sim N(0, \sigma) \end{align} \]

if you want to include an “interaction” the equation changes to: \[ \begin{align} y_{gk} &= \alpha_{gk} + \epsilon \\ \epsilon &\sim N(0, \sigma) \end{align} \]

now, for each combination of the two categorical predictor variables we have a different intercept \(\alpha_{gk}\)
for example: 5 groups, 3 groups: \(5 \cdot 3 = 15\) different intercepts, before we had only \(5 + 3 = 8\) different intercepts

Code

df <- tibble(
  species = factor(rep(c("species a", "species b", "species c"), 40)),
  soil_type = factor(rep(c("loam", "sand"), each = 60)),
  biomass = rnorm(120, sd = 5, mean = rep(c(100, 70), each = 60) + c(rep(c(20, 0, -20), 20), rep(c(0, 10, 10), 20))  )
)

lm1 <- lm(biomass ~ species * soil_type, data = df)

df <- df %>% 
  mutate(y_pred = predict(lm1))

df_normal <- df %>%
  group_by(species, soil_type) %>%
  reframe(q = seq(0.8 * min(biomass), 1.2 * max(biomass), 0.01)) %>% 
  mutate(y_pred = predict(lm1, newdata = .),
         y_dens = dnorm(q, mean = y_pred, sd = sigma(lm1)),
         group = interaction(species, soil_type),
         group_num = unclass(group)) 
  
ggplot(df, aes(interaction(species, soil_type), biomass)) +
  geom_jitter(width = 0.05, height = 0, size = 1.5, alpha = 0.3) +
  geom_point(aes(y = y_pred), color = "blue", size = 8, shape = 95) +
  geom_path(aes(group_num + 5 * y_dens, q, group = group), data = df_normal) +
  geom_ribbon(aes(xmin = group_num, xmax = group_num + 5 * y_dens, y = q, group = group), 
              data = df_normal, alpha = 0.1, fill = "blue") +
  scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
  labs(x = "species, soil type")

2.2 Interaction between two categorical predictor variables

2.3 Interaction between two continous predictor variables

recall the equation for a linear model with two continuous predictor variable:

\[ \begin{align} y_{gk} &= \alpha + \beta_1 \cdot x_1 + \beta_2 \cdot x_2 + \epsilon \\ \epsilon &\sim N(0, \sigma) \end{align} \]

if you want to include an “interaction” the equation changes to: \[ \begin{align} y_{gk} &= \alpha + \beta_1 \cdot x_1 + \beta_2 \cdot x_2 + \beta_3\cdot x_1 \cdot x_2 + \epsilon \\ \epsilon &\sim N(0, \sigma) \end{align} \]

now, there is a third slope \(\beta_3\) that describes the interaction between the two continuous predictor variables

Code

df <- tibble(x1 = seq(0, 10, length.out = 50),
             x2 = rnorm(50, mean = 150, sd = 100),
             y = rnorm(50, mean = 4 - 2 * x1 + -20.0 * x2 + 30.0*x1*x2, sd = 1))

lm2 <- lm(y ~ x1 * x2, data = df)

df_pred1 <- df %>% 
  slice(rep(1:n(), 3)) %>%
  mutate(x2 = rep(c(0.9 * mean(x2), mean(x2), 1.1 * mean(x2)), each = n()/3),
         group = rep(c("0.9 · mean", "1.0 · mean", "1.1 · mean"), each = n()/3)) %>% 
  mutate(
    y_pred = predict(lm2, newdata = .),
    x = x1,
    type = "variable: x1; x2 fixed") %>% 
  select(-x1, -x2)

df_pred2 <- df %>% 
  slice(rep(1:n(), 3)) %>%
  mutate(x1 = rep(c(0.7 * mean(x1), mean(x1), 1.3 * mean(x1)), each = n()/3),
         group = rep(c("0.9 · mean", "1.0 · mean", "1.1 · mean"), each = n()/3)) %>% 
  mutate(
    y_pred = predict(lm2, newdata = .),
    x = x2,
    type = "variable: x2; x1 fixed") %>% 
  select(-x1, -x2)

df_plot <- bind_rows(df_pred1, df_pred2)

ggplot(df_plot, aes(x, y)) +
  geom_point(alpha = 0.6, size = 2) +
  geom_line(aes(y = y_pred, color = group), linewidth = 1) +
  facet_wrap(~type, scales = "free_x") +
  theme_light() +
  theme(text = element_text(size = 14)) +
  labs(color = "value of fixed \nvariable")

2.3 Interaction between two continous predictor variables

3 Parameter estimation

Code

df <- tibble(x = seq(0, 10, length.out = 20),
             y = 0.8 * x + 1 + rnorm(20, sd = 2))

ggplot(df, aes(x, y)) +
  geom_point(size = 3)

What is the best line?

3.1 Method of least squares

we want to minimize the sum of squared residuals \[ \sum_{i=1}^n (y_i - \hat{y}_i)^2 \]
the parameter estimate obtained by the method of the least squares equals the maximum likelihood estimate (if all assumptions are met)
there is an analytical solution available for linear regression models

Code

lm1 <- lm(y ~ x, data = df)

df <- df %>% 
  mutate(
    y_pred = predict(lm1),
    residual = y - y_pred) 

ggplot(df, aes(x, y)) +
  geom_rect(aes(xmin = x, xmax = x + abs(residual), ymin = y_pred, ymax = y),
            alpha = 0.2) +
  geom_point(size = 3) +
  geom_line(aes(y = y_pred), color = "red", linewidth = 1) +
  geom_segment(aes(xend = x, yend = y_pred), alpha = 0.5) +
  coord_fixed()

4 Goodness of fit

how well does the model fit the data?
the coefficient of determination \(R^2\) is a measure of the goodness of fit

\[ \begin{align} RSS &= \sum_{i=1}^n (y_i - \hat{y}_i)^2 \\ TSS &= \sum_{i=1}^n (y_i - \bar{y})^2 \\ R^2 &= 1 - \frac{RSS}{TSS} \end{align} \]

it compares the residual sum of squares (\(RSS\)) to the total sum of squares (\(TSS\))
it can be interpreted as the proportion of variance explained by the model

4.1 Visualization of \(R^2\)

Code

df <- tibble(x = seq(0, 10, length.out = 20),
             y1 = 0.8 * x + 1 + rnorm(20, sd = 2),
             y2 = 0.8 * x + 1 + rnorm(20, sd = 0.2))
lm1 <- lm(y1 ~ x, data = df)
lm2 <- lm(y2 ~ x, data = df)

r2_1 <- str_glue("R²: {round(summary(lm1)$r.squared, 3)}")
r2_2 <- str_glue("R²: {round(summary(lm2)$r.squared, 3)}")

df %>%
  pivot_longer(cols = c(y1, y2)) %>%
  mutate(name = factor(name, labels = c(r2_1, r2_2))) -> df_plot

df %>%
  mutate(y1 = predict(lm1),
         y2 = predict(lm2)) %>%
  pivot_longer(cols = c(y1, y2)) %>%
  mutate(name = factor(name, labels = c(r2_1, r2_2))) -> df_pred

ggplot(df_plot, aes(x, value)) +
  geom_point(size = 3, alpha = 0.5) +
  geom_line(data = df_pred, linewidth = 1, color = "red") +
  facet_wrap(~name) +
  theme_light() +
  theme(text = element_text(size = 14)) +
  labs(y = "y")

4.2 Adjusted \(R^2\)

\(R^2\) increases with the number of predictor variables
the adjusted \(R^2\) penalizes the number of predictor variables \(p\):

\[ R^2_{\text{adj}} = 1 - \frac{RSS/(n-p-1)}{TSS/(n-1)} \]

with sample size \(n\)

in the summary function, the standard \(R^2\) is reported as Multiple R-squared and the adjusted \(R^2\) as Adjusted R-squared

5 Uncertainty quantification

uncertainty in the indivdual data points: prediction interval
uncertainty in the expected value of the regression: confidence interval

5.1 Predictive interval

simple/naive method: use the quantile function (inverse cdf) of the Normal distribution
problem: we are not sure if the estimates of the mean and the standard deviation are correct, therefore there is a correction included

Code

df <- tibble(
  x = seq(1, 20, length.out = 40),
  y = rnorm(40, mean = 20 + 10*x, sd = 20))

lm1 <- lm(y ~ x, data = df)

df_pred <- df %>%
  bind_cols(predict(lm1, interval = "prediction",
                    newdata = .)) %>%
  mutate(lwr_qt = qnorm(0.025, mean = fit, sd = sigma(lm1)),
         upr_qt = qnorm(0.975, mean = fit, sd = sigma(lm1))) 

ggplot(df, aes(x, y)) +
  geom_point(size = 4, alpha = 0.7) +
  geom_ribbon(aes(ymin = lwr_qt, ymax = upr_qt, color = "quantile"), 
              data = df_pred, alpha = 0.0, fill = "black") +
  geom_ribbon(aes(ymin = lwr, ymax = upr, color = "predictive interval"), 
              data = df_pred, alpha = 0.05, fill = "red") +
    geom_line(aes(y = fit), data = df_pred, 
              color = "red", linewidth = 1) +
    labs(color = "type")

5.2 Confidence interval

the confidence interval (band) represents the uncertainty of the expected value (mean) of the regression
it is always smaller than the predictive interval

Code

df_pred <- df %>%
  bind_cols(predict(lm1, interval = "prediction",
                    newdata = .)) %>%
  rename(lwr_pred = lwr, upr_pred = upr) %>%
  select(-fit) %>%
  bind_cols(predict(lm1, interval = "confidence",
                    newdata = .)) 

ggplot(df, aes(x, y)) +
  geom_point(size = 4, alpha = 0.7) +
  geom_ribbon(aes(ymin = lwr, ymax = upr, color = "confidence interval"), 
              data = df_pred, alpha = 0.2, fill = "black") +
  geom_ribbon(aes(ymin = lwr_pred, ymax = upr_pred, color = "predictive interval"), 
              data = df_pred, alpha = 0.1, fill = "black") +
    geom_line(aes(y = fit), data = df_pred, 
              color = "red", linewidth = 1) +
    labs(color = "type")

5.3 Confidence interval

Code

lb <- c()
ub <- c()
for (i in 1:100) {
  x <- rnorm(100, mean = 10, sd = 1)
  confidence_interval <- confint(lm(x ~ 1))
  
  lb <- c(lb, confidence_interval[1])
  ub <- c(ub, confidence_interval[2])
}

tibble(lb, ub, n = 1:100) %>% 
  mutate(in_ci = lb < 10 & ub > 10) %>% 
  ggplot() + 
    geom_segment(aes(x = lb, xend = ub, 
                     y = n, yend = n, 
                     color = in_ci)) +
    geom_vline(xintercept = 10) +
    labs(x = "Confidence interval",
         color = "Confidence interval\ncontains population\nmean?")

if we sample several times from the population,
and calculate the 95 % confidence intervals,
then 95 % of these intervals should contain the true value (here: population mean) of \(x\)

6 Terminology

Term	Number of continuous predictor variables	Number of categorical predictor variables
Simple linear regression	1	0
Multiple linear regression	> 1
Analysis of variance (ANOVA)	0	1
Analysis of covariance (ANCOVA)	1	1

you don’t have to use this terminology. It would be enough to say that you use a linear model with one continuous and one categorical predictor variable with Normal distributed residuals.
we cover only models with one dependent variable in this course. If you have more than one dependent variable, you can use multivariate linear models (MANOVA, MANCOVA) or use structural equation models (SEM)