Setup

Code
library(here)       # manage file paths
here() starts at /Users/kjhealy/Documents/courses/vsd
Code
library(socviz)     # data and some useful functions
library(tidyverse)  # your friend and mine
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Code
library(tidycensus) # Tidily interact with the US Census
library(maps)       # Some basic maps

Attaching package: 'maps'

The following object is masked from 'package:purrr':

    map
Code
library(sf)         # Make maps in ggplot
Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
Code
library(tigris)     # Talk to the Census's TIGER data
To enable caching of data, set `options(tigris_use_cache = TRUE)`
in your R script or .Rprofile.
Code
library(colorspace) # Palettes
library(nycdogs)    # New York City dog license data

Data

Mapping

Joining tables, and using geom_polygon()

Remember, we use geom_polygon() as a kind of illustration of what’s happening conceptually, not as our go-to method for mapping.

Code
theme_map <- function(base_size=9, base_family="") {
    require(grid)
    theme_bw(base_size=base_size, base_family=base_family) %+replace%
        theme(axis.line=element_blank(),
              axis.text=element_blank(),
              axis.ticks=element_blank(),
              axis.title=element_blank(),
              panel.background=element_blank(),
              panel.border=element_blank(),
              panel.grid=element_blank(),
              panel.spacing=unit(0, "lines"),
              plot.background=element_blank(),
              legend.justification = c(0,0),
              legend.position = c(0,0)
              )
}

head(county_map)
     long      lat order  hole piece            group    id
1 1225889 -1275020     1 FALSE     1 0500000US01001.1 01001
2 1235324 -1274008     2 FALSE     1 0500000US01001.1 01001
3 1244873 -1272331     3 FALSE     1 0500000US01001.1 01001
4 1244129 -1267515     4 FALSE     1 0500000US01001.1 01001
5 1272010 -1262889     5 FALSE     1 0500000US01001.1 01001
6 1276797 -1295514     6 FALSE     1 0500000US01001.1 01001
Code
dim(county_map)
[1] 191382      7
Code
head(county_data)
     id           name state census_region      pop_dens   pop_dens4
1     0           <NA>  <NA>          <NA> [   50,  100) [ 45,  118)
2 01000              1    AL         South [   50,  100) [ 45,  118)
3 01001 Autauga County    AL         South [   50,  100) [ 45,  118)
4 01003 Baldwin County    AL         South [  100,  500) [118,71672]
5 01005 Barbour County    AL         South [   10,   50) [ 17,   45)
6 01007    Bibb County    AL         South [   10,   50) [ 17,   45)
    pop_dens6   pct_black       pop female white black travel_time  land_area
1 [ 82,  215) [10.0,15.0) 318857056   50.8  77.7  13.2        25.5 3531905.43
2 [ 82,  215) [25.0,50.0)   4849377   51.5  69.8  26.6        24.2   50645.33
3 [ 82,  215) [15.0,25.0)     55395   51.5  78.1  18.4        26.2     594.44
4 [ 82,  215) [ 5.0,10.0)    200111   51.2  87.3   9.5        25.9    1589.78
5 [ 25,   45) [25.0,50.0)     26887   46.5  50.2  47.6        24.6     884.88
6 [ 25,   45) [15.0,25.0)     22506   46.0  76.3  22.1        27.6     622.58
  hh_income su_gun4 su_gun6 fips votes_dem_2016 votes_gop_2016 total_votes_2016
1     53046    <NA>    <NA>    0             NA             NA               NA
2     43253    <NA>    <NA> 1000             NA             NA               NA
3     53682 [11,54] [10,12) 1001           5908          18110            24661
4     50221 [11,54] [10,12) 1003          18409          72780            94090
5     32911 [ 5, 8) [ 7, 8) 1005           4848           5431            10390
6     36447 [11,54] [10,12) 1007           1874           6733             8748
  per_dem_2016 per_gop_2016 diff_2016 per_dem_2012 per_gop_2012 diff_2012
1           NA           NA        NA           NA           NA        NA
2           NA           NA        NA           NA           NA        NA
3    0.2395685    0.7343579     12202    0.2657577    0.7263374     11012
4    0.1956531    0.7735147     54371    0.2156657    0.7738975     47443
5    0.4666025    0.5227141       583    0.5125229    0.4833755       334
6    0.2142204    0.7696616      4859    0.2621857    0.7306638      3931
  winner partywinner16 winner12 partywinner12 flipped
1   <NA>          <NA>     <NA>          <NA>    <NA>
2   <NA>          <NA>     <NA>          <NA>    <NA>
3  Trump    Republican   Romney    Republican      No
4  Trump    Republican   Romney    Republican      No
5  Trump    Republican    Obama      Democrat     Yes
6  Trump    Republican   Romney    Republican      No
Code
dim(county_data)
[1] 3195   32
Code
county_full <- left_join(county_map, county_data, by = "id")

p <- ggplot(data = county_full,
            mapping = aes(x = long, y = lat,
                          fill = pop_dens, 
                          group = group))

p1 <- p + geom_polygon(color = "gray70", size = 0.1) + coord_equal()
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
Code
p2 <- p1 + scale_fill_brewer(palette="Blues",
                             labels = c("0-10", "10-50", "50-100",
                                        "100-500", "500-1,000",
                                        "1,000-5,000", ">5,000"))

p2 + labs(fill = "Population per\nsquare mile") + theme_map() +
    guides(fill = guide_legend(nrow = 1)) + 
    theme(legend.position = "bottom")
Loading required package: grid

Using simple features and geom_sf()

The simple features model and associated geom_sf() is a much more compact and efficient way to draw maps.

Code
nyc_fb <- nyc_license |>
    group_by(zip_code, breed_rc) |>
    tally() |>
    mutate(freq = n / sum(n),
           pct = round(freq*100, 2)) |>
    filter(breed_rc == "French Bulldog")

nyc_fb
# A tibble: 192 × 5
# Groups:   zip_code [192]
   zip_code breed_rc           n   freq    pct
      <int> <chr>          <int>  <dbl>  <dbl>
 1     6851 French Bulldog     1 1      100   
 2     7030 French Bulldog     2 0.333   33.3 
 3     7087 French Bulldog     1 0.2     20   
 4     7302 French Bulldog     1 0.0909   9.09
 5     7310 French Bulldog     1 1      100   
 6    10001 French Bulldog   126 0.0338   3.38
 7    10002 French Bulldog    92 0.0197   1.97
 8    10003 French Bulldog   138 0.0235   2.35
 9    10004 French Bulldog    28 0.0458   4.58
10    10005 French Bulldog    61 0.0471   4.71
# ℹ 182 more rows
Code
fb_map <- left_join(nyc_zips, nyc_fb)
Joining with `by = join_by(zip_code)`
Code
theme_nymap <- function(base_size=9, base_family="") {
    require(grid)
    theme_bw(base_size=base_size, base_family=base_family) %+replace%
        theme(axis.line=element_blank(),
              axis.text=element_blank(),
              axis.ticks=element_blank(),
              axis.title=element_blank(),
              panel.background=element_blank(),
              panel.border=element_blank(),
              panel.grid=element_blank(),
              panel.spacing=unit(0, "lines"),
              plot.background=element_blank(),
              legend.justification = c(0,0),
              legend.position = c(0.1, 0.6), 
              legend.direction = "horizontal"
        )
}

fb_map |> 
  select(zip_code, po_name, breed_rc:pct) |> 
  pull(pct)
  [1]  1.15  0.23    NA  0.21  1.13  1.38    NA  0.80  0.89  1.39  0.92    NA
 [13]  0.38  0.84  0.64  0.19  0.56  1.28  0.68    NA    NA  1.52  0.73  0.20
 [25]  0.74  1.37    NA  0.80  0.43  0.90  0.17  1.05  0.52  0.66  0.29  0.30
 [37]  0.47  0.44  0.64  0.51  0.31  0.78  0.31  0.82  0.33  0.53  0.37  0.31
 [49]  0.30  1.11  1.64  0.78  1.30  0.66  0.19  0.33  0.33  0.77  0.85  0.83
 [61]  1.04  0.64  0.36  0.43  1.24  0.83  1.37  0.37  0.45  0.25  1.85  0.29
 [73]  0.57  0.90  1.20  0.18  1.29  0.67  1.19  0.67  0.49  1.10    NA  0.77
 [85]    NA  1.51  0.91  1.22    NA  1.94  1.02  1.37  1.23  1.88  0.75  0.96
 [97]  0.64  0.56  0.80  1.92  0.30  1.87  1.10  0.69  0.59  3.41  0.64  3.06
[109]  0.55  3.83    NA    NA  1.97  3.38  4.30  3.25  1.17  3.74  2.68  0.43
[121]  3.54  2.35  1.36  1.97  0.55  2.31  2.71  3.69  2.33  4.57  1.71  0.83
[133]  3.08  1.32  4.02  0.11  4.71    NA  4.58  0.20  2.35  4.58  1.79  0.31
[145]  0.60  4.58  4.58    NA  0.90  1.01  0.98    NA  0.32  0.83  1.05  0.60
[157]  0.98  1.03    NA  0.19  0.11  0.37  0.65  0.35  0.33  0.07  0.45  1.26
[169]  0.09  1.08  0.83  0.46  0.09  0.95    NA  0.52  0.64  0.77  0.59    NA
[181]  0.72  0.09  0.64  0.68  0.09  0.43    NA  0.81  0.26    NA  0.60  0.58
[193]  0.90  2.26    NA    NA    NA    NA    NA    NA    NA 50.00    NA  2.89
[205]    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
[217]    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
[229]    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA  1.51    NA
[241]    NA    NA    NA    NA    NA    NA    NA 50.00    NA    NA    NA    NA
[253]    NA    NA    NA    NA  0.75  1.68  1.64  2.57    NA  5.67
Code
fb_map |> 
  filter(n > 1) |> 
  ggplot(mapping = aes(fill = pct)) +
    geom_sf(color = "gray80", size = 0.1) +
    scale_fill_viridis_c(option = "A") +
    labs(fill = "Percent of All Licensed Dogs") +
    # This next bit is a hack--we're just positioning the boxes
    # relative to the latitude/longitude coordinates
    annotate(geom = "text", x = -74.145 + 0.029, y = 40.82-0.012, 
           label = "New York City's French Bulldogs", size = 6) + 
    annotate(geom = "text", x = -74.1468 + 0.029, y = 40.8075-0.012, 
           label = "By Zip Code. Based on Licensing Data", size = 5) + 
    theme_nymap() + 
    guides(fill = guide_legend(title.position = "top", 
                               label.position = "bottom",
                             keywidth = 1, nrow = 1))

Keeping zero-count rows

We’ll also fix the color here.

Code
nyc_license |>
  filter(extract_year == 2018) |> 
    group_by(zip_code, breed_rc) |>
    tally() |>
    mutate(freq = n / sum(n),
           pct = round(freq*100, 2)) |>
    filter(breed_rc == "French Bulldog")
# A tibble: 161 × 5
# Groups:   zip_code [161]
   zip_code breed_rc           n   freq   pct
      <int> <chr>          <int>  <dbl> <dbl>
 1    10001 French Bulldog    27 0.0293  2.93
 2    10002 French Bulldog    20 0.0171  1.71
 3    10003 French Bulldog    36 0.0257  2.57
 4    10004 French Bulldog     9 0.0562  5.62
 5    10005 French Bulldog    15 0.0469  4.69
 6    10006 French Bulldog     8 0.0559  5.59
 7    10007 French Bulldog    17 0.0675  6.75
 8    10009 French Bulldog    51 0.0249  2.49
 9    10010 French Bulldog    31 0.0284  2.84
10    10011 French Bulldog    88 0.0426  4.26
# ℹ 151 more rows
Code
nyc_fb <- nyc_license |>
    group_by(zip_code, breed_rc) |>
    tally() |>
    ungroup() |>
    complete(zip_code, breed_rc, 
             fill = list(n = 0)) |>
    mutate(freq = n / sum(n),
           pct = round(freq*100, 2)) |>
    filter(breed_rc == "French Bulldog")


fb_map <- left_join(nyc_zips, nyc_fb)
Joining with `by = join_by(zip_code)`
Code
fb_map
Simple feature collection with 262 features and 15 fields
Geometry type: POLYGON
Dimension:     XY
Bounding box:  xmin: -74.25576 ymin: 40.49584 xmax: -73.6996 ymax: 40.91517
Geodetic CRS:  WGS 84
# A tibble: 262 × 16
   objectid zip_code po_name     state borough st_fips cty_fips bld_gpostal_code
      <int>    <int> <chr>       <chr> <chr>   <chr>   <chr>               <int>
 1        1    11372 Jackson He… NY    Queens  36      081                     0
 2        2    11004 Glen Oaks   NY    Queens  36      081                     0
 3        3    11040 New Hyde P… NY    Queens  36      081                     0
 4        4    11426 Bellerose   NY    Queens  36      081                     0
 5        5    11365 Fresh Mead… NY    Queens  36      081                     0
 6        6    11373 Elmhurst    NY    Queens  36      081                     0
 7        7    11001 Floral Park NY    Queens  36      081                     0
 8        8    11375 Forest Hil… NY    Queens  36      081                     0
 9        9    11427 Queens Vil… NY    Queens  36      081                     0
10       10    11374 Rego Park   NY    Queens  36      081                     0
# ℹ 252 more rows
# ℹ 8 more variables: shape_leng <dbl>, shape_area <dbl>, x_id <chr>,
#   geometry <POLYGON [°]>, breed_rc <chr>, n <int>, freq <dbl>, pct <dbl>
Code
fb_map |> 
  ggplot(mapping = aes(fill = pct)) +
    geom_sf(color = "gray80", size = 0.1) +
    scale_fill_continuous_sequential(palette = "Oranges") +
   labs(fill = "Percent of All Licensed Dogs in the City") +
  annotate(geom = "text", x = -74.145 + 0.029, y = 40.82-0.012, 
           label = "New York City's French Bulldogs", size = 6) + 
  annotate(geom = "text", x = -74.1468 + 0.029, y = 40.8075-0.012, 
           label = "By Zip Code. Based on Licensing Data", size = 5) + 
    theme_nymap() + 
    guides(fill = guide_legend(title.position = "top", 
                               label.position = "bottom",
                             keywidth = 1, nrow = 1))

Census data

Population components example

Code
# From the Census. Remember to load your API key!
us_components <- get_estimates(geography = "state", 
                               product = "components")
Code
us_components
# A tibble: 676 × 5
   GEOID NAME    variable          year    value
   <chr> <chr>   <chr>            <int>    <dbl>
 1 01    Alabama BIRTHS            2022 58280   
 2 01    Alabama DEATHS            2022 66870   
 3 01    Alabama NATURALCHG        2022 -8590   
 4 01    Alabama INTERNATIONALMIG  2022  4597   
 5 01    Alabama DOMESTICMIG       2022 28609   
 6 01    Alabama NETMIG            2022 33206   
 7 01    Alabama RESIDUAL          2022  -166   
 8 01    Alabama RBIRTH            2022    11.5 
 9 01    Alabama RDEATH            2022    13.2 
10 01    Alabama RNATURALCHG       2022    -1.70
# ℹ 666 more rows
Code
unique(us_components$variable)
 [1] "BIRTHS"            "DEATHS"            "NATURALCHG"       
 [4] "INTERNATIONALMIG"  "DOMESTICMIG"       "NETMIG"           
 [7] "RESIDUAL"          "RBIRTH"            "RDEATH"           
[10] "RNATURALCHG"       "RINTERNATIONALMIG" "RDOMESTICMIG"     
[13] "RNETMIG"          
Code
net_migration <- get_estimates(geography = "county",
                               variables = "RNETMIG",
                               year = 2019,
                               geometry = TRUE,
                               resolution = "20m") |>
  shift_geometry() # puts Alaska and Hawaii in the bottom left
Code
net_migration
Simple feature collection with 3142 features and 4 fields
Geometry type: GEOMETRY
Dimension:     XY
Bounding box:  xmin: -3112200 ymin: -1697728 xmax: 2258154 ymax: 1558935
Projected CRS: USA_Contiguous_Albers_Equal_Area_Conic
# A tibble: 3,142 × 5
   GEOID NAME                         variable  value                   geometry
   <chr> <chr>                        <chr>     <dbl>         <MULTIPOLYGON [m]>
 1 29227 Worth County, Missouri       RNETMIG   -8.91 (((114835.6 345071.6, 123…
 2 31061 Franklin County, Nebraska    RNETMIG  -14.4  (((-267685.1 323958.5, -2…
 3 36013 Chautauqua County, New York  RNETMIG   -3.54 (((1324221 647717.4, 1334…
 4 37181 Vance County, North Carolina RNETMIG   -3.25 (((1544260 32202.52, 1547…
 5 47183 Weakley County, Tennessee    RNETMIG   -1.02 (((625934.5 -98887.34, 63…
 6 44003 Kent County, Rhode Island    RNETMIG    2.29 (((1977965 726702.3, 2004…
 7 08101 Pueblo County, Colorado      RNETMIG    6.15 (((-783174.5 122269, -773…
 8 17175 Stark County, Illinois       RNETMIG  -10.6  (((500559 424779.4, 51023…
 9 29169 Pulaski County, Missouri     RNETMIG    4.42 (((312851.7 46166.36, 312…
10 19151 Pocahontas County, Iowa      RNETMIG  -12.2  (((88185.95 606331.9, 126…
# ℹ 3,132 more rows
Code
order <- c("-15 and below", "-15 to -5", 
            "-5 to +5", "+5 to +15", "+15 and up")
  
net_migration <- net_migration |>
    mutate(groups = case_when(
      value > 15 ~ "+15 and up",
      value > 5 ~ "+5 to +15",
      value > -5 ~ "-5 to +5",
      value > -15 ~ "-15 to -5",
      TRUE ~ "-15 and below"
    )) |>
    mutate(groups = factor(groups, levels = order))
  
  state_overlay <- states(
    cb = TRUE,
    resolution = "20m") |>
    filter(GEOID != "72") |>
    shift_geometry()
Retrieving data for the year 2021

  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=======                                                               |   9%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |===========================================                           |  62%
  |                                                                            
  |========================================================              |  79%
  |                                                                            
  |======================================================================| 100%
Code
ggplot() +
    geom_sf(data = net_migration, 
            mapping = aes(fill = groups, color = groups), 
            size = 0.1) +
    geom_sf(data = state_overlay, 
            fill = NA, color = "black", size = 0.1) +
    scale_fill_brewer(palette = "PuOr", direction = -1) +
    scale_color_brewer(palette = "PuOr", direction = -1, guide = "none") +
    coord_sf(datum = NA) +
    theme_minimal() +
    labs(title = "Net migration per 1000 residents by county",
         subtitle = "US Census Bureau 2019 Population Estimates",
         fill = "Rate",
         caption = "Data acquired with the R tidycensus package")