Example 07: Social Data and the State

Setup

Code
library(here)      # manage file paths
here() starts at /Users/kjhealy/Documents/courses/vsd
Code
library(socviz)    # data and some useful functions
library(tidyverse) # your friend and mine
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.2     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Code
library(tidycensus) # Tidily talk to the Census
library(sf)        # Draw maps with ggplot
Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
Code
# Cache downloaded geographic files so that later requests reuse them;
# don't needlessly download geo files multiple times
options(tigris_use_cache = TRUE)

Make sure you have a Census API key

As we discussed in class, you’ll need an API key to talk to the Census. Get one here:

https://api.census.gov/data/key_signup.html

Follow the instructions and confirm the key via email when you get it. Keep the email.

Load your key with the census_api_key() function. In the chunk below, remove the leading # to uncomment the line, then paste your key (in quotes) in place of “YOUR API KEY GOES HERE”.

Code
# census_api_key("YOUR API KEY GOES HERE")

Median Age by State

Code
# Median age for every state, taken from the 2010 decennial Census.
# A single variable code is requested, so each state is one row.
age10 <- get_decennial(
  geography = "state",
  variables = "P013001",
  year = 2010
)
Getting data from the 2010 decennial Census
Using Census Summary File 1
Code
# Print the tibble to check what came back
age10
# A tibble: 52 × 4
   GEOID NAME        variable value
   <chr> <chr>       <chr>    <dbl>
 1 01    Alabama     P013001   37.9
 2 02    Alaska      P013001   33.8
 3 04    Arizona     P013001   35.9
 4 05    Arkansas    P013001   37.4
 5 06    California  P013001   35.2
 6 22    Louisiana   P013001   35.8
 7 21    Kentucky    P013001   38.1
 8 08    Colorado    P013001   36.1
 9 09    Connecticut P013001   40  
10 10    Delaware    P013001   38.8
# ℹ 42 more rows

Decennial Census Variables

There are a lot of them.

Code
# Fetch the catalogue of variables in the 2010 Summary File 1 ("sf1").
# cache = TRUE keeps a local copy of the catalogue for future calls.
census_vars <- load_variables(
  year = 2010,
  dataset = "sf1",
  cache = TRUE
)

# One row per variable: its code (name), label, and concept
census_vars
# A tibble: 8,959 × 3
   name    label                                concept         
   <chr>   <chr>                                <chr>           
 1 H001001 Total                                HOUSING UNITS   
 2 H002001 Total                                URBAN AND RURAL 
 3 H002002 Total!!Urban                         URBAN AND RURAL 
 4 H002003 Total!!Urban!!Inside urbanized areas URBAN AND RURAL 
 5 H002004 Total!!Urban!!Inside urban clusters  URBAN AND RURAL 
 6 H002005 Total!!Rural                         URBAN AND RURAL 
 7 H002006 Total!!Not defined for this file     URBAN AND RURAL 
 8 H003001 Total                                OCCUPANCY STATUS
 9 H003002 Total!!Occupied                      OCCUPANCY STATUS
10 H003003 Total!!Vacant                        OCCUPANCY STATUS
# ℹ 8,949 more rows

Some Core Population Measures by County for North Carolina

Code
# Codes of the population counts we want; look them up in the
# load_variables() catalogue to see what each one measures
popvars <- c("P005003", "P005004", "P005006", "P004003")

# County-level counts for North Carolina, one row per county per variable.
# summary_var adds a summary_value column (P001001) to every row, which we
# use as the denominator to turn each count into a percentage.
nc <- get_decennial(
  geography = "county",
  state = "NC",
  variables = popvars,
  summary_var = "P001001",
  year = 2010
) |>
  mutate(pct = (value / summary_value) * 100)
Getting data from the 2010 decennial Census
Using Census Summary File 1
Code
# Print the long-format county table, including the new pct column
nc
# A tibble: 400 × 6
   GEOID NAME                             variable  value summary_value   pct
   <chr> <chr>                            <chr>     <dbl>         <dbl> <dbl>
 1 37007 Anson County, North Carolina     P005003   12344         26948  45.8
 2 37011 Avery County, North Carolina     P005003   16029         17797  90.1
 3 37003 Alexander County, North Carolina P005003   32671         37198  87.8
 4 37015 Bertie County, North Carolina    P005003    7393         21282  34.7
 5 37013 Beaufort County, North Carolina  P005003   31705         47759  66.4
 6 37005 Alleghany County, North Carolina P005003    9862         11155  88.4
 7 37001 Alamance County, North Carolina  P005003  101718        151131  67.3
 8 37009 Ashe County, North Carolina      P005003   25420         27281  93.2
 9 37017 Bladen County, North Carolina    P005003   19242         35190  54.7
10 37019 Brunswick County, North Carolina P005003   86818        107431  80.8
# ℹ 390 more rows

ACS Data

Code
# Get median HH income by county for NC
nc_inc <- get_acs(geography = "county",
        variables = c(medincome = "B19013_001"),
        state = "NC",
        year = 2020)
Getting data from the 2016-2020 5-year ACS
Code
# Print the result: an estimate and its margin of error (moe) per county
nc_inc
# A tibble: 100 × 5
   GEOID NAME                             variable  estimate   moe
   <chr> <chr>                            <chr>        <dbl> <dbl>
 1 37001 Alamance County, North Carolina  medincome    51580  1749
 2 37003 Alexander County, North Carolina medincome    51329  3341
 3 37005 Alleghany County, North Carolina medincome    37158  5049
 4 37007 Anson County, North Carolina     medincome    39799  5189
 5 37009 Ashe County, North Carolina      medincome    43030  2327
 6 37011 Avery County, North Carolina     medincome    42695  3551
 7 37013 Beaufort County, North Carolina  medincome    48051  3675
 8 37015 Bertie County, North Carolina    medincome    35042  3090
 9 37017 Bladen County, North Carolina    medincome    37188  4231
10 37019 Brunswick County, North Carolina medincome    59763  1672
# ℹ 90 more rows

With geographical information:

Code
# Same county income request as before, but geometry = TRUE also
# returns geographic shapes so the result can be drawn as a map
nc_inc <- get_acs(
  geography = "county",
  state = "NC",
  variables = c(medincome = "B19013_001"),
  year = 2020,
  geometry = TRUE
)
Code
# Now we have a `geometry` column and spatial feature information
# (geometry type, bounding box, and coordinate reference system)
nc_inc
Simple feature collection with 100 features and 5 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: -84.32187 ymin: 33.84232 xmax: -75.46062 ymax: 36.58812
Geodetic CRS:  NAD83
First 10 features:
   GEOID                               NAME  variable estimate   moe
1  37039    Cherokee County, North Carolina medincome    40793  2333
2  37089   Henderson County, North Carolina medincome    58928  2699
3  37171       Surry County, North Carolina medincome    44979  3011
4  37131 Northampton County, North Carolina medincome    38969  2388
5  37177     Tyrrell County, North Carolina medincome    38250 13010
6  37043        Clay County, North Carolina medincome    42160  5900
7  37075      Graham County, North Carolina medincome    42207  6008
8  37031    Carteret County, North Carolina medincome    57871  2532
9  37007       Anson County, North Carolina medincome    39799  5189
10 37183        Wake County, North Carolina medincome    83567  1282
                         geometry
1  MULTIPOLYGON (((-84.31749 3...
2  MULTIPOLYGON (((-82.74289 3...
3  MULTIPOLYGON (((-80.97364 3...
4  MULTIPOLYGON (((-77.90008 3...
5  MULTIPOLYGON (((-76.4056 35...
6  MULTIPOLYGON (((-84.00582 3...
7  MULTIPOLYGON (((-84.03815 3...
8  MULTIPOLYGON (((-76.33014 3...
9  MULTIPOLYGON (((-80.3153 34...
10 MULTIPOLYGON (((-78.9877 35...

Draw a Map

Code
# Choropleth of median household income by NC county, with county labels
nc_inc |>
  mutate(
    # Short label: drop everything from " County" onward in the full name
    slab = str_remove(NAME, " County.*"),
    # Express income in thousands of dollars for a compact legend
    estimate = estimate / 1000
  ) |>
  ggplot(aes(fill = estimate)) +
  # County polygons, outlined in white
  geom_sf(colour = "white") +
  # County names. Label placement relies on st_point_on_surface(), which
  # warns that results may be inexact for longitude/latitude data.
  geom_sf_text(
    mapping = aes(label = slab),
    colour = "white",
    size = rel(0.9)
  ) +
  scale_fill_continuous(labels = scales::label_dollar()) +
  labs(fill = "Median HH Income ('000s)") +
  # Legend keys with the title above and the value labels underneath
  guides(
    fill = guide_legend(
      title.position = "top",
      label.position = "bottom",
      keywidth = rel(2.2)
    )
  ) +
  theme(legend.position = "bottom")
Warning in st_point_on_surface.sfc(sf::st_zm(x)): st_point_on_surface may not
give correct results for longitude/latitude data