Solution for tidying the penguins data set

1 Solution with R output

1.1 Task 1: What are variables and observations?

library(tidyr)
library(dplyr)
library(readr)

# Read the data
penguins_2007 <- read_csv("data/03_penguins_messy_2007.csv")
penguins_2008 <- read_csv("data/03_penguins_messy_2008.csv")
penguins_2009 <- read_csv("data/03_penguins_messy_2009.csv")

# Have a look at the data sets
penguins_2007
# A tibble: 440 × 4
   penguinID spec_island      size_measure       value
       <dbl> <chr>            <chr>              <dbl>
 1         1 Adelie.Torgersen bill_length_mm      39.1
 2         1 Adelie.Torgersen bill_depth_mm       18.7
 3         1 Adelie.Torgersen flipper_length_mm  181  
 4         1 Adelie.Torgersen body_mass_g       3750  
 5         2 Adelie.Torgersen bill_length_mm      39.5
 6         2 Adelie.Torgersen bill_depth_mm       17.4
 7         2 Adelie.Torgersen flipper_length_mm  186  
 8         2 Adelie.Torgersen body_mass_g       3800  
 9         3 Adelie.Torgersen bill_length_mm      40.3
10         3 Adelie.Torgersen bill_depth_mm       18  
# ℹ 430 more rows
penguins_2008
# A tibble: 456 × 4
   penguinID spec_island   size_measure       value
       <dbl> <chr>         <chr>              <dbl>
 1        51 Adelie.Biscoe bill_length_mm      39.6
 2        51 Adelie.Biscoe bill_depth_mm       17.7
 3        51 Adelie.Biscoe flipper_length_mm  186  
 4        51 Adelie.Biscoe body_mass_g       3500  
 5        52 Adelie.Biscoe bill_length_mm      40.1
 6        52 Adelie.Biscoe bill_depth_mm       18.9
 7        52 Adelie.Biscoe flipper_length_mm  188  
 8        52 Adelie.Biscoe body_mass_g       4300  
 9        53 Adelie.Biscoe bill_length_mm      35  
10        53 Adelie.Biscoe bill_depth_mm       17.9
# ℹ 446 more rows
penguins_2009
# A tibble: 480 × 4
   penguinID spec_island   size_measure       value
       <dbl> <chr>         <chr>              <dbl>
 1       101 Adelie.Biscoe bill_length_mm      35  
 2       101 Adelie.Biscoe bill_depth_mm       17.9
 3       101 Adelie.Biscoe flipper_length_mm  192  
 4       101 Adelie.Biscoe body_mass_g       3725  
 5       102 Adelie.Biscoe bill_length_mm      41  
 6       102 Adelie.Biscoe bill_depth_mm       20  
 7       102 Adelie.Biscoe flipper_length_mm  203  
 8       102 Adelie.Biscoe body_mass_g       4725  
 9       103 Adelie.Biscoe bill_length_mm      37.7
10       103 Adelie.Biscoe bill_depth_mm       16  
# ℹ 470 more rows

All the three tables have the same structure. Each table includes the 7 variables penguinID, species, island, bill_length_mm, bill_depth_mm, flipper_length_mm, and body_mass_g. Because each of the three tables is for a specific year, for a single table you should also include the variable year.

The observations are the penguin individuals.

1.2 Task 2: Tidy up

The tidying can be done in different order. One option is to first add the variable year to all three tables and then combine them by row

# Add year variable
penguins_2007 <- penguins_2007 %>% mutate(year = 2007)
penguins_2008 <- penguins_2008 %>% mutate(year = 2008)
penguins_2009 <- penguins_2009 %>% mutate(year = 2009)

# Combine by row
penguins1 <- bind_rows(penguins_2007, penguins_2008, penguins_2009)
penguins1
# A tibble: 1,376 × 5
   penguinID spec_island      size_measure       value  year
       <dbl> <chr>            <chr>              <dbl> <dbl>
 1         1 Adelie.Torgersen bill_length_mm      39.1  2007
 2         1 Adelie.Torgersen bill_depth_mm       18.7  2007
 3         1 Adelie.Torgersen flipper_length_mm  181    2007
 4         1 Adelie.Torgersen body_mass_g       3750    2007
 5         2 Adelie.Torgersen bill_length_mm      39.5  2007
 6         2 Adelie.Torgersen bill_depth_mm       17.4  2007
 7         2 Adelie.Torgersen flipper_length_mm  186    2007
 8         2 Adelie.Torgersen body_mass_g       3800    2007
 9         3 Adelie.Torgersen bill_length_mm      40.3  2007
10         3 Adelie.Torgersen bill_depth_mm       18    2007
# ℹ 1,366 more rows

One problem is that species and island are combined into one column. So we separate them

penguins2 <- penguins1 %>% 
  separate_wider_delim(spec_island, delim = ".", names = c("species", "island"))
penguins2
# A tibble: 1,376 × 6
   penguinID species island    size_measure       value  year
       <dbl> <chr>   <chr>     <chr>              <dbl> <dbl>
 1         1 Adelie  Torgersen bill_length_mm      39.1  2007
 2         1 Adelie  Torgersen bill_depth_mm       18.7  2007
 3         1 Adelie  Torgersen flipper_length_mm  181    2007
 4         1 Adelie  Torgersen body_mass_g       3750    2007
 5         2 Adelie  Torgersen bill_length_mm      39.5  2007
 6         2 Adelie  Torgersen bill_depth_mm       17.4  2007
 7         2 Adelie  Torgersen flipper_length_mm  186    2007
 8         2 Adelie  Torgersen body_mass_g       3800    2007
 9         3 Adelie  Torgersen bill_length_mm      40.3  2007
10         3 Adelie  Torgersen bill_depth_mm       18    2007
# ℹ 1,366 more rows

The last problem is that the bill, flipper and body mass measurements are all in one column instead of four columns. This can be fixed with pivot_wider.

penguins_tidy <- penguins2 %>% 
  pivot_wider(names_from = size_measure, values_from = value)
penguins_tidy
# A tibble: 344 × 8
   penguinID species island  year bill_length_mm bill_depth_mm flipper_length_mm
       <dbl> <chr>   <chr>  <dbl>          <dbl>         <dbl>             <dbl>
 1         1 Adelie  Torge…  2007           39.1          18.7               181
 2         2 Adelie  Torge…  2007           39.5          17.4               186
 3         3 Adelie  Torge…  2007           40.3          18                 195
 4         4 Adelie  Torge…  2007           NA            NA                  NA
 5         5 Adelie  Torge…  2007           36.7          19.3               193
 6         6 Adelie  Torge…  2007           39.3          20.6               190
 7         7 Adelie  Torge…  2007           38.9          17.8               181
 8         8 Adelie  Torge…  2007           39.2          19.6               195
 9         9 Adelie  Torge…  2007           34.1          18.1               193
10        10 Adelie  Torge…  2007           42            20.2               190
# ℹ 334 more rows
# ℹ 1 more variable: body_mass_g <dbl>

Voila, this is a tidy version of the penguin data!

2 Solution as one script without output

library(tidyr)
library(dplyr)
library(readr)

# Read the data
penguins_2007 <- read_csv("data/03_penguins_messy_2007.csv")
penguins_2008 <- read_csv("data/03_penguins_messy_2008.csv")
penguins_2009 <- read_csv("data/03_penguins_messy_2009.csv")

# Have a look at the data sets
penguins_2007
penguins_2008
penguins_2009

# Add year variable
penguins_2007 <- penguins_2007 %>% mutate(year = 2007)
penguins_2008 <- penguins_2008 %>% mutate(year = 2008)
penguins_2009 <- penguins_2009 %>% mutate(year = 2009)

# Combine by row
penguins1 <- bind_rows(penguins_2007, penguins_2008, penguins_2009)

# Separate species and island
penguins2 <- penguins1 %>% 
  separate_wider_delim(spec_island, delim = ".", names = c("species", "island"))

# Widen to extra columns for the size measurements (bill, flipper, mass)
penguins_tidy1 <- penguins2 %>% 
  pivot_wider(names_from = size_measure, values_from = value)


# This can be also done in one step with several pipes
penguins_tidy2 <- bind_rows(penguins_2007, penguins_2008, penguins_2009) %>%
  separate_wider_delim(spec_island, delim = ".", names = c("species", "island")) %>% 
  pivot_wider(names_from = size_measure, values_from = value)