class: center middle main-title section-title-1 # .kjh-yellow[Working with<br />] .kjh-lblue[`tidycensus`] .class-info[ **Week 07** .light[Kieran Healy<br> Duke University, Spring 2023] ] --- layout: true class: title title-1 --- # Load our libraries .SMALL[ ```r library(here) # manage file paths library(socviz) # data and some useful things, especially %nin% library(tidyverse) # your friend and mine library(scales) # Convenient scale labels library(tidycensus) # Tidily talk to the Census library(sf) # Draw maps with ggplot ``` ``` ## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE ``` ```r # Don't needlessly download geo files multiple times options(tigris_use_cache = TRUE) ``` ] --- # Census Data - Make sure you have a Census API key. Load it after you load `tidycensus` ```r library(tidycensus) census_api_key("1234567890987654321") ``` - Get one from <https://api.census.gov/data/key_signup.html> --- ## Decennial Census Variables There are a lot of them: ```r census_vars <- load_variables(year = 2010, dataset = "sf1", cache = TRUE) census_vars ``` ``` ## # A tibble: 8,959 × 3 ## name label concept ## <chr> <chr> <chr> ## 1 H001001 Total HOUSING UNITS ## 2 H002001 Total URBAN AND RURAL ## 3 H002002 Total!!Urban URBAN AND RURAL ## 4 H002003 Total!!Urban!!Inside urbanized areas URBAN AND RURAL ## 5 H002004 Total!!Urban!!Inside urban clusters URBAN AND RURAL ## 6 H002005 Total!!Rural URBAN AND RURAL ## 7 H002006 Total!!Not defined for this file URBAN AND RURAL ## 8 H003001 Total OCCUPANCY STATUS ## 9 H003002 Total!!Occupied OCCUPANCY STATUS ## 10 H003003 Total!!Vacant OCCUPANCY STATUS ## # … with 8,949 more rows ``` --- # Decennial Census Variables - Median Age by State ```r age10 <- get_decennial(geography = "state", variables = "P013001", year = 2010) age10 ``` ``` ## # A tibble: 52 × 4 ## GEOID NAME variable value ## <chr> <chr> <chr> <dbl> ## 1 01 Alabama P013001 37.9 ## 2 02 Alaska P013001 33.8 ## 3 04 Arizona P013001 35.9 ## 4 05 Arkansas P013001 37.4 ## 5 06 California P013001 35.2 ## 6 22 Louisiana P013001 35.8 ## 7 21 Kentucky P013001 38.1 ## 8 08 Colorado P013001 36.1 ## 9 09 Connecticut P013001 40 ## 10 10 Delaware P013001 38.8 ## # … with 42 more rows ``` --- # Some Data by County for North Carolina ```r # Census variable names popvars <- c("P005003", "P005004", "P005006", "P004003") # Get a county-level dataset for NC with these variables # The summary value is the total population. nc <- get_decennial(geography = "county", variables = popvars, year = 2010, summary_var = "P001001", state = "NC") |> mutate(pct = 100 * (value / summary_value)) nc ``` ``` ## # A tibble: 400 × 6 ## GEOID NAME variable value summary_value pct ## <chr> <chr> <chr> <dbl> <dbl> <dbl> ## 1 37007 Anson County, North Carolina P005003 12344 26948 45.8 ## 2 37011 Avery County, North Carolina P005003 16029 17797 90.1 ## 3 37003 Alexander County, North Carolina P005003 32671 37198 87.8 ## 4 37015 Bertie County, North Carolina P005003 7393 21282 34.7 ## 5 37013 Beaufort County, North Carolina P005003 31705 47759 66.4 ## 6 37005 Alleghany County, North Carolina P005003 9862 11155 88.4 ## 7 37001 Alamance County, North Carolina P005003 101718 151131 67.3 ## 8 37009 Ashe County, North Carolina P005003 25420 27281 93.2 ## 9 37017 Bladen County, North Carolina P005003 19242 35190 54.7 ## 10 37019 Brunswick County, North Carolina P005003 86818 107431 80.8 ## # … with 390 more rows ``` --- # We can do this a little more elegantly ```r # Census variable names popvars <- tribble( ~variable, ~name, "P005003", "nh_white", "P005004", "nh_black", "P005006", "nh_asian", "P004003", "hispanic") popvars ``` ``` ## # A tibble: 4 × 2 ## variable name ## <chr> <chr> ## 1 P005003 nh_white ## 2 P005004 nh_black ## 3 P005006 nh_asian ## 4 P004003 hispanic ``` --- # We can do this a little more elegantly ```r # Get a county-level dataset for NC with these variables # The summary value is the total population. nc <- get_decennial(geography = "county", * variables = popvars$variable, year = 2010, summary_var = "P001001", state = "NC") |> mutate(pct = 100 * (value / summary_value)) nc ``` ``` ## # A tibble: 400 × 6 ## GEOID NAME variable value summary_value pct ## <chr> <chr> <chr> <dbl> <dbl> <dbl> ## 1 37007 Anson County, North Carolina P005003 12344 26948 45.8 ## 2 37011 Avery County, North Carolina P005003 16029 17797 90.1 ## 3 37003 Alexander County, North Carolina P005003 32671 37198 87.8 ## 4 37015 Bertie County, North Carolina P005003 7393 21282 34.7 ## 5 37013 Beaufort County, North Carolina P005003 31705 47759 66.4 ## 6 37005 Alleghany County, North Carolina P005003 9862 11155 88.4 ## 7 37001 Alamance County, North Carolina P005003 101718 151131 67.3 ## 8 37009 Ashe County, North Carolina P005003 25420 27281 93.2 ## 9 37017 Bladen County, North Carolina P005003 19242 35190 54.7 ## 10 37019 Brunswick County, North Carolina P005003 86818 107431 80.8 ## # … with 390 more rows ``` --- # We can do this a little more elegantly ```r nc |> left_join(popvars, by = "variable") |> relocate(name, .after = variable) |> select(-variable) |> pivot_wider(names_from = name, names_glue = "{name}_{.value}", values_from = c(value, pct)) |> mutate(NAME = str_remove(NAME, " County.*")) ``` ``` ## # A tibble: 100 × 11 ## GEOID NAME summa…¹ nh_wh…² nh_bl…³ nh_as…⁴ hispa…⁵ nh_wh…⁶ nh_bl…⁷ nh_as…⁸ ## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 37007 Anson 26948 12344 13038 281 812 45.8 48.4 1.04 ## 2 37011 Avery 17797 16029 706 56 797 90.1 3.97 0.315 ## 3 37003 Alexan… 37198 32671 2025 357 1601 87.8 5.44 0.960 ## 4 37015 Bertie 21282 7393 13252 103 267 34.7 62.3 0.484 ## 5 37013 Beaufo… 47759 31705 12105 158 3166 66.4 25.3 0.331 ## 6 37005 Allegh… 11155 9862 133 53 1004 88.4 1.19 0.475 ## 7 37001 Alaman… 151131 101718 27985 1806 16639 67.3 18.5 1.19 ## 8 37009 Ashe 27281 25420 148 105 1311 93.2 0.543 0.385 ## 9 37017 Bladen 35190 19242 12202 67 2502 54.7 34.7 0.190 ## 10 37019 Brunsw… 107431 86818 12120 560 5549 80.8 11.3 0.521 ## # … with 90 more rows, 1 more variable: hispanic_pct <dbl>, and abbreviated ## # variable names ¹summary_value, ²nh_white_value, ³nh_black_value, ## # ⁴nh_asian_value, ⁵hispanic_value, ⁶nh_white_pct, ⁷nh_black_pct, ## # ⁸nh_asian_pct ``` --- # Widened table Assign this to `nc_wide` and we have this: ```r nc_wide ``` ``` ## # A tibble: 100 × 11 ## GEOID NAME summa…¹ nh_wh…² nh_bl…³ nh_as…⁴ hispa…⁵ nh_wh…⁶ nh_bl…⁷ nh_as…⁸ ## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 37007 Anson 26948 12344 13038 281 812 45.8 48.4 1.04 ## 2 37011 Avery 17797 16029 706 56 797 90.1 3.97 0.315 ## 3 37003 Alexan… 37198 32671 2025 357 1601 87.8 5.44 0.960 ## 4 37015 Bertie 21282 7393 13252 103 267 34.7 62.3 0.484 ## 5 37013 Beaufo… 47759 31705 12105 158 3166 66.4 25.3 0.331 ## 6 37005 Allegh… 11155 9862 133 53 1004 88.4 1.19 0.475 ## 7 37001 Alaman… 151131 101718 27985 1806 16639 67.3 18.5 1.19 ## 8 37009 Ashe 27281 25420 148 105 1311 93.2 0.543 0.385 ## 9 37017 Bladen 35190 19242 12202 67 2502 54.7 34.7 0.190 ## 10 37019 Brunsw… 107431 86818 12120 560 5549 80.8 11.3 0.521 ## # … with 90 more rows, 1 more variable: hispanic_pct <dbl>, and abbreviated ## # variable names ¹summary_value, ²nh_white_value, ³nh_black_value, ## # ⁴nh_asian_value, ⁵hispanic_value, ⁶nh_white_pct, ⁷nh_black_pct, ## # ⁸nh_asian_pct ``` --- # Top 20 Counties by Percent Hispanic .pull-left.w45[ ```r nc_wide %>% slice_max(order_by = hispanic_pct, n = 20) %>% ggplot(mapping = aes(x = hispanic_pct, y = reorder(NAME, hispanic_pct))) + geom_point() + labs(x = "Percent Hispanic", y = NULL, title = "North Carolina Counties") ``` ] -- .pull-right.w55[ <img src="07b-slides_files/figure-html/codefig-hispanicplot-1.png" width="460" style="display: block; margin: auto;" /> ] --- # Log population vs Percent Hispanic .pull-left.w45[ ```r nc_wide %>% ggplot(mapping = aes(x = summary_value, y = hispanic_pct)) + geom_point() + scale_x_log10(labels = label_number(scale_cut = cut_short_scale())) + labs(x = "Log Population", y = "Percent Hispanic", title = "North Carolina Counties") ``` ] -- .pull-right.w55[ <img src="07b-slides_files/figure-html/codefig-hispvpop-1.png" width="460" style="display: block; margin: auto;" /> ] --- # Log population vs Percent Black .pull-left.w45[ ```r nc_wide %>% ggplot(mapping = aes(x = summary_value, y = nh_black_pct)) + geom_point() + scale_x_log10(labels = label_number(scale_cut = cut_short_scale())) + labs(x = "Log Population", y = "Percent Black", title = "North Carolina Counties") ``` ] -- .pull-right.w55[ <img src="07b-slides_files/figure-html/codefig-blackvpop-1.png" width="460" style="display: block; margin: auto;" /> ] --- # Percent White vs Percent Black .pull-left.w45[ ```r nc_wide %>% ggplot(mapping = aes(x = nh_white_pct, y = nh_black_pct)) + geom_point() + labs(x = "Percent White", y = "Percent Black", title = "North Carolina Counties") ``` ] -- .pull-right.w55[ <img src="07b-slides_files/figure-html/codefig-blackvwhite-1.png" width="460" style="display: block; margin: auto;" /> ] --- # Population Pyramids - We can ask the Census for its estimates of the age breakdown of the population, using the `get_estimates()` function. ```r usa <- get_estimates( geography = "us", product = "characteristics", breakdown = c("SEX", "AGEGROUP"), breakdown_labels = TRUE, year = 2019 ) usa ``` ``` ## # A tibble: 96 × 5 ## GEOID NAME value SEX AGEGROUP ## <chr> <chr> <dbl> <chr> <fct> ## 1 1 United States 328239523 Both sexes All ages ## 2 1 United States 19576683 Both sexes Age 0 to 4 years ## 3 1 United States 20195895 Both sexes Age 5 to 9 years ## 4 1 United States 161657324 Male All ages ## 5 1 United States 10009207 Male Age 0 to 4 years ## 6 1 United States 10322762 Male Age 5 to 9 years ## 7 1 United States 166582199 Female All ages ## 8 1 United States 9567476 Female Age 0 to 4 years ## 9 1 United States 7528626 Female Age 70 to 74 years ## 10 1 United States 20798268 Both sexes Age 10 to 14 years ## # … with 86 more rows ``` --- # Population Pyramids ```r # For the %nin% operator make sure you have # library(socviz) above usa_pyr <- usa |> filter(AGEGROUP %nin% "All ages", str_detect(AGEGROUP, "Age"), SEX %nin% "Both sexes") |> mutate(value = ifelse(SEX == "Male", -value, value)) usa_pyr ``` ``` ## # A tibble: 36 × 5 ## GEOID NAME value SEX AGEGROUP ## <chr> <chr> <dbl> <chr> <fct> ## 1 1 United States -10009207 Male Age 0 to 4 years ## 2 1 United States -10322762 Male Age 5 to 9 years ## 3 1 United States 9567476 Female Age 0 to 4 years ## 4 1 United States 7528626 Female Age 70 to 74 years ## 5 1 United States 9873133 Female Age 5 to 9 years ## 6 1 United States -10618261 Male Age 10 to 14 years ## 7 1 United States 10180007 Female Age 10 to 14 years ## 8 1 United States -11064752 Male Age 20 to 24 years ## 9 1 United States 10568188 Female Age 20 to 24 years ## 10 1 United States -12004570 Male Age 25 to 29 years ## # … with 26 more rows ``` --- # Population Pyramids .pull-left.w45[ ```r usa_pyr |> ggplot(mapping = aes(x = value, y = AGEGROUP, fill = SEX)) + geom_col(width = 0.95, alpha = 0.75) + labs(x = "Count", y = NULL, title= "US Population Pyramid") ``` ] -- .pull-right.w55[ <img src="07b-slides_files/figure-html/codefig-usapyrbasic-1.png" width="576" style="display: block; margin: auto;" /> ] --- # Population Pyramids .pull-left.w45[ ```r usa_pyr |> ggplot(mapping = aes(x = value, y = AGEGROUP, fill = SEX)) + geom_col(width = 0.95, alpha = 0.75) + # In the two scale functions we use ~ label_number() # and ~ str_remove_all() as anonymous functions. Slightly # more advanced usage than normal ... scale_x_continuous( labels = ~ label_number(scale = 0.001, suffix = "k")(abs(.x))) + scale_y_discrete( labels = ~ str_remove_all(.x, "Age | years")) + labs(x = "Count", y = NULL, title= "US Population Pyramid") + theme(legend.position = "bottom") ``` ] -- .pull-right.w55[ <img src="07b-slides_files/figure-html/codefig-usapyr-1.png" width="576" style="display: block; margin: auto;" /> ] --- layout: false class: main-title main-title-inv # .middle.squish4.huge.kjh-orange[Midterm Postmortem]