library(dplyr) library(readxl)
library(dplyr) library(readxl)
The income dataset is from the 1994 Census database. It is also known as “Census Income” or “Adult” dataset. Details of this dataset can be found at UCI Machine Learning Repository
The government data is retrieved from eurostat: gov_10q_ggdebt data.
income <- read.table("data/income.data", sep = ',', fill = F, strip.white = T) |> as_tibble() colnames(income) <- c('age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income') debt <- read_xlsx("data/eu_debt.xlsx") |> as_tibble()
glimpse(income)
## Rows: 32,561 ## Columns: 15 ## $ age <int> 39, 50, 38, 53, 28, 37, 49, 52, 31,… ## $ workclass <chr> "State-gov", "Self-emp-not-inc", "P… ## $ fnlwgt <int> 77516, 83311, 215646, 234721, 33840… ## $ education <chr> "Bachelors", "Bachelors", "HS-grad"… ## $ education_num <int> 13, 13, 9, 7, 13, 14, 5, 9, 14, 13,… ## $ marital_status <chr> "Never-married", "Married-civ-spous… ## $ occupation <chr> "Adm-clerical", "Exec-managerial", … ## $ relationship <chr> "Not-in-family", "Husband", "Not-in… ## $ race <chr> "White", "White", "White", "Black",… ## $ sex <chr> "Male", "Male", "Male", "Male", "Fe… ## $ capital_gain <int> 2174, 0, 0, 0, 0, 0, 0, 0, 14084, 5… ## $ capital_loss <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,… ## $ hours_per_week <int> 40, 13, 40, 40, 40, 40, 16, 45, 50,… ## $ native_country <chr> "United-States", "United-States", "… ## $ income <chr> "<=50K", "<=50K", "<=50K", "<=50K",…
head(debt)
## # A tibble: 6 × 29 ## Time Belgium Bulgaria Czechia Denmark ## <chr> <dbl> <dbl> <dbl> <dbl> ## 1 2013-Q1 424169 13466. 1846559 839117 ## 2 2013-Q2 428088 13467. 1813822 867358 ## 3 2013-Q3 425495 12958. 1792178 881525 ## 4 2013-Q4 414432 13979. 1840247 849938 ## 5 2014-Q1 435327 15022. 1838037 860911 ## 6 2014-Q2 438938 15924. 1817985 866649 ## # ℹ 24 more variables: ## # `Germany (until 1990 former territory of the FRG)` <dbl>, ## # Estonia <dbl>, Ireland <dbl>, Greece <dbl>, ## # Spain <dbl>, France <dbl>, Croatia <dbl>, Italy <dbl>, ## # Cyprus <dbl>, Latvia <dbl>, Lithuania <dbl>, ## # Luxembourg <dbl>, Hungary <dbl>, Malta <dbl>, ## # Netherlands <dbl>, Austria <dbl>, Poland <dbl>, …
summary()
mean()
, median()
, min()
, max()
, sum()
, sd()
quantile()
rowMeans()
, colMeans()
, rowSums()
, colSums()
summary
summary(debt |> select(1:5))
## Time Belgium Bulgaria ## Length:40 Min. :414432 Min. :12958 ## Class :character 1st Qu.:447396 1st Qu.:23371 ## Mode :character Median :464539 Median :24712 ## Mean :479389 Mean :25019 ## 3rd Qu.:515357 3rd Qu.:28210 ## Max. :577640 Max. :37848 ## Czechia Denmark ## Min. :1734602 Min. :759305 ## 1st Qu.:1815918 1st Qu.:786276 ## Median :1835851 Median :840391 ## Mean :2021238 Mean :842800 ## 3rd Qu.:2155653 3rd Qu.:872297 ## Max. :2997083 Max. :981323
mean, median, max, min, sum
mean(debt$Netherlands); median(debt$Netherlands)
## [1] 435453
## [1] 442088.5
max(debt$Netherlands); min(debt$Netherlands); sum(debt$Netherlands)
## [1] 480065
## [1] 394732
## [1] 17418122
sd, var
sd(debt$Netherlands); var(debt$Netherlands)
## [1] 20902.49
## [1] 436914204
quantile
quantile(debt$Netherlands, 0.5)
## 50% ## 442088.5
median(debt$Netherlands)
## [1] 442088.5
quantile(debt$Netherlands, 0.95)
## 95% ## 457949.9
Row operations
debt |> mutate(sums = rowSums(debt |> select(-Time))) |> select(Time, sums)
## # A tibble: 40 × 2 ## Time sums ## <chr> <dbl> ## 1 2013-Q1 39282664. ## 2 2013-Q2 39355802. ## 3 2013-Q3 39193867. ## 4 2013-Q4 39304172. ## # ℹ 36 more rows
Row operations
debt |> mutate(means = rowMeans(debt |> select(-Time))) |> select(Time, means)
## # A tibble: 40 × 2 ## Time means ## <chr> <dbl> ## 1 2013-Q1 1402952. ## 2 2013-Q2 1405564. ## 3 2013-Q3 1399781. ## 4 2013-Q4 1403720. ## # ℹ 36 more rows
Column operations
debt |> select(-Time) |> colSums() |> head(3)
## Belgium Bulgaria Czechia ## 19175551 1000742 80849536
debt |> select(-Time) |> colMeans() |> head(3)
## Belgium Bulgaria Czechia ## 479388.78 25018.54 2021238.40
summarise
income |> summarise(n_individuals = n(), mean_age= mean(age, na.rm = T), max_age = max(age, na.rm = T))
## # A tibble: 1 × 3 ## n_individuals mean_age max_age ## <int> <dbl> <int> ## 1 32561 38.6 90
group_by
income |> group_by(race, sex) |> summarise(n_individuals = n(), mean_age= mean(age, na.rm = T), max_age = max(age, na.rm = T))
## `summarise()` has grouped output by 'race'. You can ## override using the `.groups` argument.
## # A tibble: 10 × 5 ## # Groups: race [5] ## race sex n_individuals mean_age max_age ## <chr> <chr> <int> <dbl> <int> ## 1 Amer-Indian-Eskimo Female 119 37.1 80 ## 2 Amer-Indian-Eskimo Male 192 37.2 82 ## 3 Asian-Pac-Islander Female 346 35.1 75 ## 4 Asian-Pac-Islander Male 693 39.1 90 ## # ℹ 6 more rows
group_by
implies that computations along the length of the dataset are performed within specified groups (here race and sex)
income |> group_by(sex) |> mutate(mean_hours = mean(hours_per_week, na.rm = TRUE), low_hours = ifelse(hours_per_week < 37, TRUE, FALSE), .keep = "used") |> arrange(hours_per_week)
## # A tibble: 32,561 × 4 ## # Groups: sex [2] ## sex hours_per_week mean_hours low_hours ## <chr> <int> <dbl> <lgl> ## 1 Male 1 42.4 TRUE ## 2 Male 1 42.4 TRUE ## 3 Female 1 36.4 TRUE ## 4 Female 1 36.4 TRUE ## # ℹ 32,557 more rows
Note that the groups remain attached to the data until one uses ungroup()
ungroup()
summarise
removes the latest grouping layer:income |> group_by(sex, race) |> summarise(n_individuals = n())
## # A tibble: 10 × 3 ## # Groups: sex [2] ## sex race n_individuals ## <chr> <chr> <int> ## 1 Female Amer-Indian-Eskimo 119 ## 2 Female Asian-Pac-Islander 346 ## 3 Female Black 1555 ## 4 Female Other 109 ## # ℹ 6 more rows
grouped <- income |> group_by(sex, race) |> # Data is grouped by sex and race summarise(n_individuals = n()) # Data is grouped by sex grouped
## # A tibble: 10 × 3 ## # Groups: sex [2] ## sex race n_individuals ## <chr> <chr> <int> ## 1 Female Amer-Indian-Eskimo 119 ## 2 Female Asian-Pac-Islander 346 ## 3 Female Black 1555 ## 4 Female Other 109 ## # ℹ 6 more rows
grouped |> summarise(mean_individuals = mean(n_individuals)) # Data is no longer grouped
## # A tibble: 2 × 2 ## sex mean_individuals ## <chr> <dbl> ## 1 Female 2154. ## 2 Male 4358
If we “forget” a group by, strange things can happen
income |> group_by(sex) |> mutate(mean_hours = mean(hours_per_week, na.rm = TRUE), low_hours = ifelse(hours_per_week < 37, TRUE, FALSE)) |> select(age, marital_status, low_hours)
## Adding missing grouping variables: `sex`
## # A tibble: 32,561 × 4 ## # Groups: sex [2] ## sex age marital_status low_hours ## <chr> <int> <chr> <lgl> ## 1 Male 39 Never-married FALSE ## 2 Male 50 Married-civ-spouse TRUE ## 3 Male 38 Divorced FALSE ## 4 Male 53 Married-civ-spouse FALSE ## # ℹ 32,557 more rows
Even though we did not select the sex variable this is still displayed in the data because we did not remove the grouping.
Use ungroup
to be safe.
income |> group_by(sex) |> mutate(mean_hours = mean(hours_per_week, na.rm = TRUE), low_hours = ifelse(hours_per_week < 37, TRUE, FALSE)) |> ungroup() |> select(age, marital_status, low_hours)
## # A tibble: 32,561 × 3 ## age marital_status low_hours ## <int> <chr> <lgl> ## 1 39 Never-married FALSE ## 2 50 Married-civ-spouse TRUE ## 3 38 Divorced FALSE ## 4 53 Married-civ-spouse FALSE ## # ℹ 32,557 more rows