Packages we’ll use

library(dplyr)
library(readxl)

Income data

The income dataset is from the 1994 Census database. It is also known as “Census Income” or “Adult” dataset. Details of this dataset can be found at UCI Machine Learning Repository

Government debt data (quarterly)

Load data

income <- read.table("data/income.data", 
                               sep = ',', 
                               fill = F, 
                               strip.white = T) |> 
  as_tibble()

colnames(income) <- c('age', 'workclass', 'fnlwgt', 'education',
                      'education_num', 'marital_status', 'occupation',
                      'relationship', 'race', 'sex', 'capital_gain',
                      'capital_loss', 'hours_per_week', 'native_country',
                      'income')

debt <- read_xlsx("data/eu_debt.xlsx") |> 
  as_tibble()

Data

glimpse(income)
## Rows: 32,561
## Columns: 15
## $ age            <int> 39, 50, 38, 53, 28, 37, 49, 52, 31,…
## $ workclass      <chr> "State-gov", "Self-emp-not-inc", "P…
## $ fnlwgt         <int> 77516, 83311, 215646, 234721, 33840…
## $ education      <chr> "Bachelors", "Bachelors", "HS-grad"…
## $ education_num  <int> 13, 13, 9, 7, 13, 14, 5, 9, 14, 13,…
## $ marital_status <chr> "Never-married", "Married-civ-spous…
## $ occupation     <chr> "Adm-clerical", "Exec-managerial", …
## $ relationship   <chr> "Not-in-family", "Husband", "Not-in…
## $ race           <chr> "White", "White", "White", "Black",…
## $ sex            <chr> "Male", "Male", "Male", "Male", "Fe…
## $ capital_gain   <int> 2174, 0, 0, 0, 0, 0, 0, 0, 14084, 5…
## $ capital_loss   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ hours_per_week <int> 40, 13, 40, 40, 40, 40, 16, 45, 50,…
## $ native_country <chr> "United-States", "United-States", "…
## $ income         <chr> "<=50K", "<=50K", "<=50K", "<=50K",…

Data

head(debt)
## # A tibble: 6 × 29
##   Time    Belgium Bulgaria Czechia Denmark
##   <chr>     <dbl>    <dbl>   <dbl>   <dbl>
## 1 2013-Q1  424169   13466. 1846559  839117
## 2 2013-Q2  428088   13467. 1813822  867358
## 3 2013-Q3  425495   12958. 1792178  881525
## 4 2013-Q4  414432   13979. 1840247  849938
## 5 2014-Q1  435327   15022. 1838037  860911
## 6 2014-Q2  438938   15924. 1817985  866649
## # ℹ 24 more variables:
## #   `Germany (until 1990 former territory of the FRG)` <dbl>,
## #   Estonia <dbl>, Ireland <dbl>, Greece <dbl>,
## #   Spain <dbl>, France <dbl>, Croatia <dbl>, Italy <dbl>,
## #   Cyprus <dbl>, Latvia <dbl>, Lithuania <dbl>,
## #   Luxembourg <dbl>, Hungary <dbl>, Malta <dbl>,
## #   Netherlands <dbl>, Austria <dbl>, Poland <dbl>, …

Summary statistics

Summarising data with base

  • summary()
  • mean(), median(), min(), max(), sum(), sd()
  • quantile()
  • rowMeans(), colMeans(), rowSums(), colSums()

Summarising data with base R

summary

summary(debt |> select(1:5))
##      Time              Belgium          Bulgaria    
##  Length:40          Min.   :414432   Min.   :12958  
##  Class :character   1st Qu.:447396   1st Qu.:23371  
##  Mode  :character   Median :464539   Median :24712  
##                     Mean   :479389   Mean   :25019  
##                     3rd Qu.:515357   3rd Qu.:28210  
##                     Max.   :577640   Max.   :37848  
##     Czechia           Denmark      
##  Min.   :1734602   Min.   :759305  
##  1st Qu.:1815918   1st Qu.:786276  
##  Median :1835851   Median :840391  
##  Mean   :2021238   Mean   :842800  
##  3rd Qu.:2155653   3rd Qu.:872297  
##  Max.   :2997083   Max.   :981323

Summarising data with base R

mean, median, max, min, sum

mean(debt$Netherlands); median(debt$Netherlands)
## [1] 435453
## [1] 442088.5
max(debt$Netherlands); min(debt$Netherlands); sum(debt$Netherlands)
## [1] 480065
## [1] 394732
## [1] 17418122

Summarising data with base R

sd, var

sd(debt$Netherlands); var(debt$Netherlands)
## [1] 20902.49
## [1] 436914204

Summarising data with base R

quantile

quantile(debt$Netherlands, 0.5)
##      50% 
## 442088.5
median(debt$Netherlands)
## [1] 442088.5
quantile(debt$Netherlands, 0.95)
##      95% 
## 457949.9

Summarising data with base R

Row operations

debt |> 
  mutate(sums = rowSums(debt |> select(-Time))) |> 
  select(Time, sums)
## # A tibble: 40 × 2
##   Time         sums
##   <chr>       <dbl>
## 1 2013-Q1 39282664.
## 2 2013-Q2 39355802.
## 3 2013-Q3 39193867.
## 4 2013-Q4 39304172.
## # ℹ 36 more rows

Summarising data with base R

Row operations

debt |> 
  mutate(means = rowMeans(debt |> select(-Time))) |> 
  select(Time, means)
## # A tibble: 40 × 2
##   Time       means
##   <chr>      <dbl>
## 1 2013-Q1 1402952.
## 2 2013-Q2 1405564.
## 3 2013-Q3 1399781.
## 4 2013-Q4 1403720.
## # ℹ 36 more rows

Summarising data with base R

Column operations

debt |> select(-Time) |> colSums() |> head(3)
##  Belgium Bulgaria  Czechia 
## 19175551  1000742 80849536
debt |> select(-Time) |> colMeans() |> head(3)
##    Belgium   Bulgaria    Czechia 
##  479388.78   25018.54 2021238.40

Summarise using tidyverse

summarise

income |> 
  summarise(n_individuals = n(),
            mean_age= mean(age, na.rm = T),
            max_age = max(age, na.rm = T))
## # A tibble: 1 × 3
##   n_individuals mean_age max_age
##           <int>    <dbl>   <int>
## 1         32561     38.6      90

Summarise and grouping

group_by

income |> 
  group_by(race, sex) |> 
  summarise(n_individuals = n(),
            mean_age= mean(age, na.rm = T),
            max_age = max(age, na.rm = T))
## `summarise()` has grouped output by 'race'. You can
## override using the `.groups` argument.
## # A tibble: 10 × 5
## # Groups:   race [5]
##   race               sex    n_individuals mean_age max_age
##   <chr>              <chr>          <int>    <dbl>   <int>
## 1 Amer-Indian-Eskimo Female           119     37.1      80
## 2 Amer-Indian-Eskimo Male             192     37.2      82
## 3 Asian-Pac-Islander Female           346     35.1      75
## 4 Asian-Pac-Islander Male             693     39.1      90
## # ℹ 6 more rows

group_by implies that computations along the length of the dataset are performed within specified groups (here race and sex)

Understand group_by()

income |> 
  group_by(sex) |> 
  mutate(mean_hours = mean(hours_per_week, na.rm = TRUE),
         low_hours = ifelse(hours_per_week < 37, TRUE, FALSE),
         .keep = "used") |> 
  arrange(hours_per_week)
## # A tibble: 32,561 × 4
## # Groups:   sex [2]
##   sex    hours_per_week mean_hours low_hours
##   <chr>           <int>      <dbl> <lgl>    
## 1 Male                1       42.4 TRUE     
## 2 Male                1       42.4 TRUE     
## 3 Female              1       36.4 TRUE     
## 4 Female              1       36.4 TRUE     
## # ℹ 32,557 more rows

Note that the groups remain attached to the data until one uses ungroup()

Understand group_by()

  • Groups remain attached to the dataset until we remove them
    • Can be done with ungroup()
    • A summarise removes the latest grouping layer:
income |> group_by(sex, race) |> summarise(n_individuals = n())
## # A tibble: 10 × 3
## # Groups:   sex [2]
##   sex    race               n_individuals
##   <chr>  <chr>                      <int>
## 1 Female Amer-Indian-Eskimo           119
## 2 Female Asian-Pac-Islander           346
## 3 Female Black                       1555
## 4 Female Other                        109
## # ℹ 6 more rows

Understand group_by()

grouped <- income |> 
  group_by(sex, race) |> # Data is grouped by sex and race 
  summarise(n_individuals = n())  # Data is grouped by sex

grouped
## # A tibble: 10 × 3
## # Groups:   sex [2]
##   sex    race               n_individuals
##   <chr>  <chr>                      <int>
## 1 Female Amer-Indian-Eskimo           119
## 2 Female Asian-Pac-Islander           346
## 3 Female Black                       1555
## 4 Female Other                        109
## # ℹ 6 more rows
grouped |> summarise(mean_individuals = mean(n_individuals)) # Data is no longer grouped
## # A tibble: 2 × 2
##   sex    mean_individuals
##   <chr>             <dbl>
## 1 Female            2154.
## 2 Male              4358

Understand group_by()

If we “forget” a group by, strange things can happen

income |> 
  group_by(sex) |> 
  mutate(mean_hours = mean(hours_per_week, na.rm = TRUE),
         low_hours = ifelse(hours_per_week < 37, TRUE, FALSE)) |> 
  select(age, marital_status, low_hours)
## Adding missing grouping variables: `sex`
## # A tibble: 32,561 × 4
## # Groups:   sex [2]
##   sex     age marital_status     low_hours
##   <chr> <int> <chr>              <lgl>    
## 1 Male     39 Never-married      FALSE    
## 2 Male     50 Married-civ-spouse TRUE     
## 3 Male     38 Divorced           FALSE    
## 4 Male     53 Married-civ-spouse FALSE    
## # ℹ 32,557 more rows

Even though we did not select the sex variable this is still displayed in the data because we did not remove the grouping.

Understand group_by()

Use ungroup to be safe.

income |> 
  group_by(sex) |> 
  mutate(mean_hours = mean(hours_per_week, na.rm = TRUE),
         low_hours = ifelse(hours_per_week < 37, TRUE, FALSE)) |> 
  ungroup() |> 
  select(age, marital_status, low_hours)
## # A tibble: 32,561 × 3
##     age marital_status     low_hours
##   <int> <chr>              <lgl>    
## 1    39 Never-married      FALSE    
## 2    50 Married-civ-spouse TRUE     
## 3    38 Divorced           FALSE    
## 4    53 Married-civ-spouse FALSE    
## # ℹ 32,557 more rows

Exercises