First we read in the data from the excel sheet using read_xls
Here are are telling it:
library(readxl)
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
passengers <- read_xls(
path = here::here("data/WebAirport_FY_1986-2017.xls"),
sheet = 3,
skip = 6,
# this gives us much easier names to work with
.name_repair = janitor::make_clean_names
)
passengers
## # A tibble: 3,479 x 14
## x airport year rank inbound outbound total inbound_2 outbound_2
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 TOTA… TOTAL … 1985… - 1.45e7 14543928 2.91e7 2767413 2656964
## 2 TOTA… TOTAL … 1986… - 1.50e7 15001877 3.00e7 3154920 3040061
## 3 TOTA… TOTAL … 1987… - 1.62e7 16181355 3.24e7 3692976 3518767
## 4 TOTA… TOTAL … 1988… - 1.65e7 16497826 3.30e7 4022634 3907954
## 5 TOTA… TOTAL … 1989… - 1.19e7 11861644 2.37e7 4175315 4077454
## 6 TOTA… TOTAL … 1990… - 1.65e7 16536767 3.31e7 4257165 4167346
## 7 TOTA… TOTAL … 1991… - 2.05e7 20525193 4.11e7 4539042 4503847
## 8 TOTA… TOTAL … 1992… - 2.09e7 20923187 4.18e7 4902693 4856372
## 9 TOTA… TOTAL … 1993… - 2.27e7 22668912 4.53e7 5340017 5281959
## 10 TOTA… TOTAL … 1994… - 2.51e7 25091227 5.02e7 5830311 5735442
## # … with 3,469 more rows, and 5 more variables: total_2 <dbl>, inbound_3 <dbl>,
## # outbound_3 <dbl>, total_3 <dbl>, x_2 <chr>
Tidy the data, to produce a data set with these columns - airport: all of the airports. - year - type_of_flight: DOMESTIC, INTERNATIONAL - bound: IN or OUT
tidy_passengers <- passengers %>%
select(-x,
-x_2,
-rank,
-starts_with("total")) %>%
rename(inbound_dom = inbound,
inbound_intl = inbound_2,
inbound_totl = inbound_3,
outbound_dom = outbound,
outbound_intl = outbound_2,
outbound_totl =outbound_3) %>%
pivot_longer(cols = contains("bound"),
names_to = "where",
values_to = "amount") %>%
separate(col = where,
into = c("bound", "type")) %>%
filter(type != "totl")
tidy_passengers
## # A tibble: 13,916 x 5
## airport year bound type amount
## <chr> <chr> <chr> <chr> <dbl>
## 1 TOTAL AUSTRALIA 1985-86 inbound dom 14543928
## 2 TOTAL AUSTRALIA 1985-86 outbound dom 14543928
## 3 TOTAL AUSTRALIA 1985-86 inbound intl 2767413
## 4 TOTAL AUSTRALIA 1985-86 outbound intl 2656964
## 5 TOTAL AUSTRALIA 1986-87 inbound dom 15001877
## 6 TOTAL AUSTRALIA 1986-87 outbound dom 15001877
## 7 TOTAL AUSTRALIA 1986-87 inbound intl 3154920
## 8 TOTAL AUSTRALIA 1986-87 outbound intl 3040061
## 9 TOTAL AUSTRALIA 1987-88 inbound dom 16181355
## 10 TOTAL AUSTRALIA 1987-88 outbound dom 16181355
## # … with 13,906 more rows
# gather(key = where,
# value = amount,
# starts_with("IN"),
# starts_with("OUT"))