tb_raw, which contains the read in data.First, load library(tidyverse), and read in the data
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.0 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# tb_raw <- read_csv("data/TB_notifications_2018-03-18.csv")
tb_raw <- read_csv(
here::here("data/TB_notifications_2018-03-18.csv"))
## Parsed with column specification:
## cols(
## .default = col_double(),
## country = col_character(),
## iso2 = col_character(),
## iso3 = col_character(),
## iso_numeric = col_character(),
## g_whoregion = col_character(),
## new_sn_sexunk04 = col_logical(),
## new_sn_sexunk514 = col_logical(),
## new_sn_sexunk014 = col_logical(),
## new_sn_sexunk15plus = col_logical(),
## new_ep_m04 = col_logical(),
## new_ep_sexunkageunk = col_logical(),
## rdxsurvey_newinc = col_logical(),
## rdxsurvey_newinc_rdx = col_logical()
## )
## See spec(...) for full column specifications.
## Warning: 103 parsing failures.
## row col expected actual file
## 1186 new_sn_sexunk04 1/0/T/F/TRUE/FALSE 22 '/cloud/project/data/TB_notifications_2018-03-18.csv'
## 1186 new_sn_sexunk514 1/0/T/F/TRUE/FALSE 33 '/cloud/project/data/TB_notifications_2018-03-18.csv'
## 1186 new_sn_sexunk014 1/0/T/F/TRUE/FALSE 55 '/cloud/project/data/TB_notifications_2018-03-18.csv'
## 1186 new_sn_sexunk15plus 1/0/T/F/TRUE/FALSE 632 '/cloud/project/data/TB_notifications_2018-03-18.csv'
## 1187 new_sn_sexunk04 1/0/T/F/TRUE/FALSE 23 '/cloud/project/data/TB_notifications_2018-03-18.csv'
## .... ................... .................. ...... .....................................................
## See problems(...) for more details.
tb_raw
## # A tibble: 7,891 x 175
## country iso2 iso3 iso_numeric g_whoregion year new_sp new_sn new_su new_ep
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Afghan… AF AFG 004 EMR 1980 NA NA NA NA
## 2 Afghan… AF AFG 004 EMR 1981 NA NA NA NA
## 3 Afghan… AF AFG 004 EMR 1982 NA NA NA NA
## 4 Afghan… AF AFG 004 EMR 1983 NA NA NA NA
## 5 Afghan… AF AFG 004 EMR 1984 NA NA NA NA
## 6 Afghan… AF AFG 004 EMR 1985 NA NA NA NA
## 7 Afghan… AF AFG 004 EMR 1986 NA NA NA NA
## 8 Afghan… AF AFG 004 EMR 1987 NA NA NA NA
## 9 Afghan… AF AFG 004 EMR 1988 NA NA NA NA
## 10 Afghan… AF AFG 004 EMR 1989 NA NA NA NA
## # … with 7,881 more rows, and 165 more variables: new_oth <dbl>, ret_rel <dbl>,
## # ret_taf <dbl>, ret_tad <dbl>, ret_oth <dbl>, newret_oth <dbl>,
## # new_labconf <dbl>, new_clindx <dbl>, ret_rel_labconf <dbl>,
## # ret_rel_clindx <dbl>, ret_rel_ep <dbl>, ret_nrel <dbl>,
## # notif_foreign <dbl>, c_newinc <dbl>, new_sp_m04 <dbl>, new_sp_m514 <dbl>,
## # new_sp_m014 <dbl>, new_sp_m1524 <dbl>, new_sp_m2534 <dbl>,
## # new_sp_m3544 <dbl>, new_sp_m4554 <dbl>, new_sp_m5564 <dbl>,
## # new_sp_m65 <dbl>, new_sp_mu <dbl>, new_sp_f04 <dbl>, new_sp_f514 <dbl>,
## # new_sp_f014 <dbl>, new_sp_f1524 <dbl>, new_sp_f2534 <dbl>,
## # new_sp_f3544 <dbl>, new_sp_f4554 <dbl>, new_sp_f5564 <dbl>,
## # new_sp_f65 <dbl>, new_sp_fu <dbl>, new_sn_m04 <dbl>, new_sn_m514 <dbl>,
## # new_sn_m014 <dbl>, new_sn_m1524 <dbl>, new_sn_m2534 <dbl>,
## # new_sn_m3544 <dbl>, new_sn_m4554 <dbl>, new_sn_m5564 <dbl>,
## # new_sn_m65 <dbl>, new_sn_m15plus <dbl>, new_sn_mu <dbl>, new_sn_f04 <dbl>,
## # new_sn_f514 <dbl>, new_sn_f014 <dbl>, new_sn_f1524 <dbl>,
## # new_sn_f2534 <dbl>, new_sn_f3544 <dbl>, new_sn_f4554 <dbl>,
## # new_sn_f5564 <dbl>, new_sn_f65 <dbl>, new_sn_f15plus <dbl>,
## # new_sn_fu <dbl>, new_sn_sexunk04 <lgl>, new_sn_sexunk514 <lgl>,
## # new_sn_sexunk014 <lgl>, new_sn_sexunk15plus <lgl>, new_ep_m04 <lgl>,
## # new_ep_m514 <dbl>, new_ep_m014 <dbl>, new_ep_m1524 <dbl>,
## # new_ep_m2534 <dbl>, new_ep_m3544 <dbl>, new_ep_m4554 <dbl>,
## # new_ep_m5564 <dbl>, new_ep_m65 <dbl>, new_ep_m15plus <dbl>,
## # new_ep_mu <dbl>, new_ep_f04 <dbl>, new_ep_f514 <dbl>, new_ep_f014 <dbl>,
## # new_ep_f1524 <dbl>, new_ep_f2534 <dbl>, new_ep_f3544 <dbl>,
## # new_ep_f4554 <dbl>, new_ep_f5564 <dbl>, new_ep_f65 <dbl>,
## # new_ep_f15plus <dbl>, new_ep_fu <dbl>, new_ep_sexunk04 <dbl>,
## # new_ep_sexunk514 <dbl>, new_ep_sexunk014 <dbl>, new_ep_sexunk15plus <dbl>,
## # new_ep_sexunkageunk <lgl>, rel_in_agesex_flg <dbl>, newrel_m04 <dbl>,
## # newrel_m514 <dbl>, newrel_m014 <dbl>, newrel_m1524 <dbl>,
## # newrel_m2534 <dbl>, newrel_m3544 <dbl>, newrel_m4554 <dbl>,
## # newrel_m5564 <dbl>, newrel_m65 <dbl>, newrel_m15plus <dbl>,
## # newrel_mu <dbl>, newrel_f04 <dbl>, …
country, year, and those that start with new_sp_. See ?select for more informationfilter the observations to be between 1996 and 2012.tb_short.tb_short <- tb_raw %>%
select(country, # select variables country and year
year,
starts_with("new_sp_")) %>%
filter(year > 1996,
year < 2012) # before 2012
tb_short
## # A tibble: 3,202 x 22
## country year new_sp_m04 new_sp_m514 new_sp_m014 new_sp_m1524 new_sp_m2534
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Afghan… 1997 NA NA 0 10 6
## 2 Afghan… 1998 NA NA 30 129 128
## 3 Afghan… 1999 NA NA 8 55 55
## 4 Afghan… 2000 NA NA 52 228 183
## 5 Afghan… 2001 NA NA 129 379 349
## 6 Afghan… 2002 NA NA 90 476 481
## 7 Afghan… 2003 NA NA 127 511 436
## 8 Afghan… 2004 NA NA 139 537 568
## 9 Afghan… 2005 NA NA 151 606 560
## 10 Afghan… 2006 NA NA 193 837 791
## # … with 3,192 more rows, and 15 more variables: new_sp_m3544 <dbl>,
## # new_sp_m4554 <dbl>, new_sp_m5564 <dbl>, new_sp_m65 <dbl>, new_sp_mu <dbl>,
## # new_sp_f04 <dbl>, new_sp_f514 <dbl>, new_sp_f014 <dbl>, new_sp_f1524 <dbl>,
## # new_sp_f2534 <dbl>, new_sp_f3544 <dbl>, new_sp_f4554 <dbl>,
## # new_sp_f5564 <dbl>, new_sp_f65 <dbl>, new_sp_fu <dbl>
top_ntb_short %>% top_n(20) # place 20 inside top_n
## Selecting by new_sp_fu
## # A tibble: 26 x 22
## country year new_sp_m04 new_sp_m514 new_sp_m014 new_sp_m1524 new_sp_m2534
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Argent… 2008 11 58 69 633 611
## 2 Argent… 2009 8 36 44 546 483
## 3 Argent… 2011 50 93 143 664 657
## 4 Bosnia… 2006 0 0 0 40 58
## 5 Brazil 2010 130 168 298 4405 6381
## 6 Centra… 2010 23 55 78 379 633
## 7 Centra… 2011 14 56 70 362 576
## 8 Egypt 2008 0 13 13 581 640
## 9 Greece 2007 0 1 1 21 22
## 10 Italy 2005 7 1 8 93 191
## # … with 16 more rows, and 15 more variables: new_sp_m3544 <dbl>,
## # new_sp_m4554 <dbl>, new_sp_m5564 <dbl>, new_sp_m65 <dbl>, new_sp_mu <dbl>,
## # new_sp_f04 <dbl>, new_sp_f514 <dbl>, new_sp_f014 <dbl>, new_sp_f1524 <dbl>,
## # new_sp_f2534 <dbl>, new_sp_f3544 <dbl>, new_sp_f4554 <dbl>,
## # new_sp_f5564 <dbl>, new_sp_f65 <dbl>, new_sp_fu <dbl>
pivot_longer the data into columns into columns named stuff and count
tb_longer <- tb_short %>%
pivot_longer(cols = starts_with("new_sp_"),
names_to = "stuff",
values_to = "count")
tb_longer
## # A tibble: 64,040 x 4
## country year stuff count
## <chr> <dbl> <chr> <dbl>
## 1 Afghanistan 1997 new_sp_m04 NA
## 2 Afghanistan 1997 new_sp_m514 NA
## 3 Afghanistan 1997 new_sp_m014 0
## 4 Afghanistan 1997 new_sp_m1524 10
## 5 Afghanistan 1997 new_sp_m2534 6
## 6 Afghanistan 1997 new_sp_m3544 3
## 7 Afghanistan 1997 new_sp_m4554 5
## 8 Afghanistan 1997 new_sp_m5564 2
## 9 Afghanistan 1997 new_sp_m65 0
## 10 Afghanistan 1997 new_sp_mu NA
## # … with 64,030 more rows
stuff, into three columns named “stuff1”, “stuff2”, and “genderage”tb_sep_step1 <- tb_longer %>%
separate(col = stuff,
into = c("stuff1", "stuff2", "genderage"),
sep = "_")
tb_sep_step1
## # A tibble: 64,040 x 6
## country year stuff1 stuff2 genderage count
## <chr> <dbl> <chr> <chr> <chr> <dbl>
## 1 Afghanistan 1997 new sp m04 NA
## 2 Afghanistan 1997 new sp m514 NA
## 3 Afghanistan 1997 new sp m014 0
## 4 Afghanistan 1997 new sp m1524 10
## 5 Afghanistan 1997 new sp m2534 6
## 6 Afghanistan 1997 new sp m3544 3
## 7 Afghanistan 1997 new sp m4554 5
## 8 Afghanistan 1997 new sp m5564 2
## 9 Afghanistan 1997 new sp m65 0
## 10 Afghanistan 1997 new sp mu NA
## # … with 64,030 more rows
genderage into two columns named “gender” and “age”, separating on the first position.tb_sep_step2 <- tb_sep_step1 %>%
separate(col = genderage,
into = c("gender", "age"),
sep = 1)
tb_sep_step2
## # A tibble: 64,040 x 7
## country year stuff1 stuff2 gender age count
## <chr> <dbl> <chr> <chr> <chr> <chr> <dbl>
## 1 Afghanistan 1997 new sp m 04 NA
## 2 Afghanistan 1997 new sp m 514 NA
## 3 Afghanistan 1997 new sp m 014 0
## 4 Afghanistan 1997 new sp m 1524 10
## 5 Afghanistan 1997 new sp m 2534 6
## 6 Afghanistan 1997 new sp m 3544 3
## 7 Afghanistan 1997 new sp m 4554 5
## 8 Afghanistan 1997 new sp m 5564 2
## 9 Afghanistan 1997 new sp m 65 0
## 10 Afghanistan 1997 new sp m u NA
## # … with 64,030 more rows
Drop the columns named stuff1 and stuff2
tb_tidy <- tb_sep_step2 %>%
select(- stuff1,
- stuff2) # drop stuff1 and stuff2
tb_tidy
## # A tibble: 64,040 x 5
## country year gender age count
## <chr> <dbl> <chr> <chr> <dbl>
## 1 Afghanistan 1997 m 04 NA
## 2 Afghanistan 1997 m 514 NA
## 3 Afghanistan 1997 m 014 0
## 4 Afghanistan 1997 m 1524 10
## 5 Afghanistan 1997 m 2534 6
## 6 Afghanistan 1997 m 3544 3
## 7 Afghanistan 1997 m 4554 5
## 8 Afghanistan 1997 m 5564 2
## 9 Afghanistan 1997 m 65 0
## 10 Afghanistan 1997 m u NA
## # … with 64,030 more rows