Visualizing Social Data
Toggle Dark/Light/Auto mode Toggle Dark/Light/Auto mode Toggle Dark/Light/Auto mode

Code

Mapping

Joining tables, and using geom_polygon()

library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

## ✔ ggplot2 3.3.5     ✔ purrr   0.3.4
## ✔ tibble  3.1.6     ✔ dplyr   1.0.8
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1

## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::edition_get()   masks testthat::edition_get()
## ✖ dplyr::filter()        masks stats::filter()
## ✖ purrr::is_null()       masks testthat::is_null()
## ✖ dplyr::lag()           masks stats::lag()
## ✖ readr::local_edition() masks testthat::local_edition()
## ✖ dplyr::matches()       masks tidyr::matches(), testthat::matches()
library(socviz)
## 
## Attaching package: 'socviz'

## The following object is masked from 'package:kjhutils':
## 
##     %nin%
theme_map <- function(base_size=9, base_family="") {
    require(grid)
    theme_bw(base_size=base_size, base_family=base_family) %+replace%
        theme(axis.line=element_blank(),
              axis.text=element_blank(),
              axis.ticks=element_blank(),
              axis.title=element_blank(),
              panel.background=element_blank(),
              panel.border=element_blank(),
              panel.grid=element_blank(),
              panel.spacing=unit(0, "lines"),
              plot.background=element_blank(),
              legend.justification = c(0,0),
              legend.position = c(0,0)
              )
}

head(county_map)
##      long      lat order  hole piece            group    id
## 1 1225889 -1275020     1 FALSE     1 0500000US01001.1 01001
## 2 1235324 -1274008     2 FALSE     1 0500000US01001.1 01001
## 3 1244873 -1272331     3 FALSE     1 0500000US01001.1 01001
## 4 1244129 -1267515     4 FALSE     1 0500000US01001.1 01001
## 5 1272010 -1262889     5 FALSE     1 0500000US01001.1 01001
## 6 1276797 -1295514     6 FALSE     1 0500000US01001.1 01001
dim(county_map)
## [1] 191382      7
head(county_data)
##      id           name state census_region      pop_dens   pop_dens4
## 1     0           <NA>  <NA>          <NA> [   50,  100) [ 45,  118)
## 2 01000              1    AL         South [   50,  100) [ 45,  118)
## 3 01001 Autauga County    AL         South [   50,  100) [ 45,  118)
## 4 01003 Baldwin County    AL         South [  100,  500) [118,71672]
## 5 01005 Barbour County    AL         South [   10,   50) [ 17,   45)
## 6 01007    Bibb County    AL         South [   10,   50) [ 17,   45)
##     pop_dens6   pct_black       pop female white black travel_time  land_area
## 1 [ 82,  215) [10.0,15.0) 318857056   50.8  77.7  13.2        25.5 3531905.43
## 2 [ 82,  215) [25.0,50.0)   4849377   51.5  69.8  26.6        24.2   50645.33
## 3 [ 82,  215) [15.0,25.0)     55395   51.5  78.1  18.4        26.2     594.44
## 4 [ 82,  215) [ 5.0,10.0)    200111   51.2  87.3   9.5        25.9    1589.78
## 5 [ 25,   45) [25.0,50.0)     26887   46.5  50.2  47.6        24.6     884.88
## 6 [ 25,   45) [15.0,25.0)     22506   46.0  76.3  22.1        27.6     622.58
##   hh_income su_gun4 su_gun6 fips votes_dem_2016 votes_gop_2016 total_votes_2016
## 1     53046    <NA>    <NA>    0             NA             NA               NA
## 2     43253    <NA>    <NA> 1000             NA             NA               NA
## 3     53682 [11,54] [10,12) 1001           5908          18110            24661
## 4     50221 [11,54] [10,12) 1003          18409          72780            94090
## 5     32911 [ 5, 8) [ 7, 8) 1005           4848           5431            10390
## 6     36447 [11,54] [10,12) 1007           1874           6733             8748
##   per_dem_2016 per_gop_2016 diff_2016 per_dem_2012 per_gop_2012 diff_2012
## 1           NA           NA        NA           NA           NA        NA
## 2           NA           NA        NA           NA           NA        NA
## 3    0.2395685    0.7343579     12202    0.2657577    0.7263374     11012
## 4    0.1956531    0.7735147     54371    0.2156657    0.7738975     47443
## 5    0.4666025    0.5227141       583    0.5125229    0.4833755       334
## 6    0.2142204    0.7696616      4859    0.2621857    0.7306638      3931
##   winner partywinner16 winner12 partywinner12 flipped
## 1   <NA>          <NA>     <NA>          <NA>    <NA>
## 2   <NA>          <NA>     <NA>          <NA>    <NA>
## 3  Trump    Republican   Romney    Republican      No
## 4  Trump    Republican   Romney    Republican      No
## 5  Trump    Republican    Obama      Democrat     Yes
## 6  Trump    Republican   Romney    Republican      No
dim(county_data)
## [1] 3195   32
county_full <- left_join(county_map, county_data, by = "id")

p <- ggplot(data = county_full,
            mapping = aes(x = long, y = lat,
                          fill = pop_dens, 
                          group = group))

p1 <- p + geom_polygon(color = "gray70", size = 0.1) + coord_equal()

p2 <- p1 + scale_fill_brewer(palette="Blues",
                             labels = c("0-10", "10-50", "50-100",
                                        "100-500", "500-1,000",
                                        "1,000-5,000", ">5,000"))

p2 + labs(fill = "Population per\nsquare mile") + theme_map() +
    guides(fill = guide_legend(nrow = 1)) + 
    theme(legend.position = "bottom")

Using simple features and geom_sf()

library(tidyverse)
library(socviz)
library(colorspace)
library(sf)
## Linking to GEOS 3.9.1, GDAL 3.2.3, PROJ 7.2.1; sf_use_s2() is TRUE
library(nycdogs)


nyc_fb <- nyc_license %>%
    group_by(zip_code, breed_rc) %>%
    tally() %>%
    mutate(freq = n / sum(n),
           pct = round(freq*100, 2)) %>%
    filter(breed_rc == "French Bulldog")

nyc_fb
## # A tibble: 192 × 5
## # Groups:   zip_code [192]
##    zip_code breed_rc           n   freq    pct
##       <int> <chr>          <int>  <dbl>  <dbl>
##  1     6851 French Bulldog     1 1      100   
##  2     7030 French Bulldog     2 0.333   33.3 
##  3     7087 French Bulldog     1 0.2     20   
##  4     7302 French Bulldog     1 0.0909   9.09
##  5     7310 French Bulldog     1 1      100   
##  6    10001 French Bulldog   126 0.0338   3.38
##  7    10002 French Bulldog    92 0.0197   1.97
##  8    10003 French Bulldog   138 0.0235   2.35
##  9    10004 French Bulldog    28 0.0458   4.58
## 10    10005 French Bulldog    61 0.0471   4.71
## # … with 182 more rows
fb_map <- left_join(nyc_zips, nyc_fb)
## Joining, by = "zip_code"
theme_nymap <- function(base_size=9, base_family="") {
    require(grid)
    theme_bw(base_size=base_size, base_family=base_family) %+replace%
        theme(axis.line=element_blank(),
              axis.text=element_blank(),
              axis.ticks=element_blank(),
              axis.title=element_blank(),
              panel.background=element_blank(),
              panel.border=element_blank(),
              panel.grid=element_blank(),
              panel.spacing=unit(0, "lines"),
              plot.background=element_blank(),
              legend.justification = c(0,0),
              legend.position = c(0.1, 0.6), 
              legend.direction = "horizontal"
        )
}

fb_map %>% 
  select(zip_code, po_name, breed_rc:pct) %>% 
  pull(pct)
##   [1]  1.15  0.23    NA  0.21  1.13  1.38    NA  0.80  0.89  1.39  0.92    NA
##  [13]  0.38  0.84  0.64  0.19  0.56  1.28  0.68    NA    NA  1.52  0.73  0.20
##  [25]  0.74  1.37    NA  0.80  0.43  0.90  0.17  1.05  0.52  0.66  0.29  0.30
##  [37]  0.47  0.44  0.64  0.51  0.31  0.78  0.31  0.82  0.33  0.53  0.37  0.31
##  [49]  0.30  1.11  1.64  0.78  1.30  0.66  0.19  0.33  0.33  0.77  0.85  0.83
##  [61]  1.04  0.64  0.36  0.43  1.24  0.83  1.37  0.37  0.45  0.25  1.85  0.29
##  [73]  0.57  0.90  1.20  0.18  1.29  0.67  1.19  0.67  0.49  1.10    NA  0.77
##  [85]    NA  1.51  0.91  1.22    NA  1.94  1.02  1.37  1.23  1.88  0.75  0.96
##  [97]  0.64  0.56  0.80  1.92  0.30  1.87  1.10  0.69  0.59  3.41  0.64  3.06
## [109]  0.55  3.83    NA    NA  1.97  3.38  4.30  3.25  1.17  3.74  2.68  0.43
## [121]  3.54  2.35  1.36  1.97  0.55  2.31  2.71  3.69  2.33  4.57  1.71  0.83
## [133]  3.08  1.32  4.02  0.11  4.71    NA  4.58  0.20  2.35  4.58  1.79  0.31
## [145]  0.60  4.58  4.58    NA  0.90  1.01  0.98    NA  0.32  0.83  1.05  0.60
## [157]  0.98  1.03    NA  0.19  0.11  0.37  0.65  0.35  0.33  0.07  0.45  1.26
## [169]  0.09  1.08  0.83  0.46  0.09  0.95    NA  0.52  0.64  0.77  0.59    NA
## [181]  0.72  0.09  0.64  0.68  0.09  0.43    NA  0.81  0.26    NA  0.60  0.58
## [193]  0.90  2.26    NA    NA    NA    NA    NA    NA    NA 50.00    NA  2.89
## [205]    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## [217]    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
## [229]    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA  1.51    NA
## [241]    NA    NA    NA    NA    NA    NA    NA 50.00    NA    NA    NA    NA
## [253]    NA    NA    NA    NA  0.75  1.68  1.64  2.57    NA  5.67
fb_map %>% 
  filter(n > 1) %>% 
  ggplot(mapping = aes(fill = pct)) +
    geom_sf(color = "gray80", size = 0.1) +
    scale_fill_viridis_c(option = "A") +
    labs(fill = "Percent of All Licensed Dogs") +
    # This next bit is a hack--we're just positioning the boxes
    # relative to the latitude/longitude coordinates
    annotate(geom = "text", x = -74.145 + 0.029, y = 40.82-0.012, 
           label = "New York City's French Bulldogs", size = 6) + 
    annotate(geom = "text", x = -74.1468 + 0.029, y = 40.8075-0.012, 
           label = "By Zip Code. Based on Licensing Data", size = 5) + 
    theme_nymap() + 
    guides(fill = guide_legend(title.position = "top", 
                               label.position = "bottom",
                             keywidth = 1, nrow = 1))

Keeping zero-count rows

We’ll also fix the color here.

nyc_license %>%
  filter(extract_year == 2018) %>% 
    group_by(zip_code, breed_rc) %>%
    tally() %>%
    mutate(freq = n / sum(n),
           pct = round(freq*100, 2)) %>%
    filter(breed_rc == "French Bulldog")
## # A tibble: 161 × 5
## # Groups:   zip_code [161]
##    zip_code breed_rc           n   freq   pct
##       <int> <chr>          <int>  <dbl> <dbl>
##  1    10001 French Bulldog    27 0.0293  2.93
##  2    10002 French Bulldog    20 0.0171  1.71
##  3    10003 French Bulldog    36 0.0257  2.57
##  4    10004 French Bulldog     9 0.0562  5.62
##  5    10005 French Bulldog    15 0.0469  4.69
##  6    10006 French Bulldog     8 0.0559  5.59
##  7    10007 French Bulldog    17 0.0675  6.75
##  8    10009 French Bulldog    51 0.0249  2.49
##  9    10010 French Bulldog    31 0.0284  2.84
## 10    10011 French Bulldog    88 0.0426  4.26
## # … with 151 more rows
nyc_fb <- nyc_license %>%
    group_by(zip_code, breed_rc) %>%
    tally() %>%
    ungroup() %>%
    complete(zip_code, breed_rc, 
             fill = list(n = 0)) %>%
    mutate(freq = n / sum(n),
           pct = round(freq*100, 2)) %>%
    filter(breed_rc == "French Bulldog")


fb_map <- left_join(nyc_zips, nyc_fb)
## Joining, by = "zip_code"
fb_map
## Simple feature collection with 262 features and 15 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -74.25576 ymin: 40.49584 xmax: -73.6996 ymax: 40.91517
## Geodetic CRS:  WGS 84
## # A tibble: 262 × 16
##    objectid zip_code po_name     state borough st_fips cty_fips bld_gpostal_code
##       <int>    <int> <chr>       <chr> <chr>   <chr>   <chr>               <int>
##  1        1    11372 Jackson He… NY    Queens  36      081                     0
##  2        2    11004 Glen Oaks   NY    Queens  36      081                     0
##  3        3    11040 New Hyde P… NY    Queens  36      081                     0
##  4        4    11426 Bellerose   NY    Queens  36      081                     0
##  5        5    11365 Fresh Mead… NY    Queens  36      081                     0
##  6        6    11373 Elmhurst    NY    Queens  36      081                     0
##  7        7    11001 Floral Park NY    Queens  36      081                     0
##  8        8    11375 Forest Hil… NY    Queens  36      081                     0
##  9        9    11427 Queens Vil… NY    Queens  36      081                     0
## 10       10    11374 Rego Park   NY    Queens  36      081                     0
## # … with 252 more rows, and 8 more variables: shape_leng <dbl>,
## #   shape_area <dbl>, x_id <chr>, geometry <POLYGON [°]>, breed_rc <chr>,
## #   n <int>, freq <dbl>, pct <dbl>
fb_map %>% ggplot(mapping = aes(fill = pct)) +
    geom_sf(color = "gray80", size = 0.1) +
    scale_fill_continuous_sequential(palette = "Oranges") +
   labs(fill = "Percent of All Licensed Dogs in the City") +
  annotate(geom = "text", x = -74.145 + 0.029, y = 40.82-0.012, 
           label = "New York City's French Bulldogs", size = 6) + 
  annotate(geom = "text", x = -74.1468 + 0.029, y = 40.8075-0.012, 
           label = "By Zip Code. Based on Licensing Data", size = 5) + 
    theme_nymap() + 
    guides(fill = guide_legend(title.position = "top", 
                               label.position = "bottom",
                             keywidth = 1, nrow = 1))

Census data

library(tidyverse)
library(colorspace)
library(tigris)
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
library(sf)

library(tidycensus)
library(cartogram)

Population components example

us_components <- get_estimates(geography = "state", 
                               product = "components")
us_components
## # A tibble: 624 × 4
##    NAME           GEOID variable  value
##    <chr>          <chr> <chr>     <dbl>
##  1 Mississippi    28    BIRTHS    35978
##  2 Missouri       29    BIRTHS    71297
##  3 Montana        30    BIRTHS    11618
##  4 Nebraska       31    BIRTHS    25343
##  5 Nevada         32    BIRTHS    35932
##  6 New Hampshire  33    BIRTHS    12004
##  7 New Jersey     34    BIRTHS    99501
##  8 New Mexico     35    BIRTHS    23125
##  9 New York       36    BIRTHS   222924
## 10 North Carolina 37    BIRTHS   119203
## # … with 614 more rows
unique(us_components$variable)
##  [1] "BIRTHS"            "DEATHS"            "DOMESTICMIG"      
##  [4] "INTERNATIONALMIG"  "NATURALINC"        "NETMIG"           
##  [7] "RBIRTH"            "RDEATH"            "RDOMESTICMIG"     
## [10] "RINTERNATIONALMIG" "RNATURALINC"       "RNETMIG"
net_migration <- get_estimates(geography = "county",
                               variables = "RNETMIG",
                               year = 2019,
                               geometry = TRUE,
                               resolution = "20m") %>%
  shift_geometry() # puts Alaska and Hawaii in the bottom left
net_migration
## Simple feature collection with 3142 features and 4 fields
## Geometry type: GEOMETRY
## Dimension:     XY
## Bounding box:  xmin: -3112200 ymin: -1697728 xmax: 2258154 ymax: 1558935
## Projected CRS: USA_Contiguous_Albers_Equal_Area_Conic
## # A tibble: 3,142 × 5
##    GEOID NAME                         variable  value                   geometry
##    <chr> <chr>                        <chr>     <dbl>         <MULTIPOLYGON [m]>
##  1 29227 Worth County, Missouri       RNETMIG   -8.91 (((114835.6 345071.6, 123…
##  2 31061 Franklin County, Nebraska    RNETMIG  -14.4  (((-267685.1 323958.5, -2…
##  3 36013 Chautauqua County, New York  RNETMIG   -3.54 (((1324221 647717.4, 1334…
##  4 37181 Vance County, North Carolina RNETMIG   -3.25 (((1544260 32202.52, 1547…
##  5 47183 Weakley County, Tennessee    RNETMIG   -1.02 (((625934.5 -98887.34, 63…
##  6 44003 Kent County, Rhode Island    RNETMIG    2.29 (((1977965 726702.3, 2004…
##  7 08101 Pueblo County, Colorado      RNETMIG    6.15 (((-783174.5 122269, -773…
##  8 17175 Stark County, Illinois       RNETMIG  -10.6  (((500559 424779.4, 51023…
##  9 29169 Pulaski County, Missouri     RNETMIG    4.42 (((312851.7 46166.36, 312…
## 10 19151 Pocahontas County, Iowa      RNETMIG  -12.2  (((88185.95 606331.9, 126…
## # … with 3,132 more rows
order <- c("-15 and below", "-15 to -5", 
            "-5 to +5", "+5 to +15", "+15 and up")
  
net_migration <- net_migration %>%
    mutate(groups = case_when(
      value > 15 ~ "+15 and up",
      value > 5 ~ "+5 to +15",
      value > -5 ~ "-5 to +5",
      value > -15 ~ "-15 to -5",
      TRUE ~ "-15 and below"
    )) %>%
    mutate(groups = factor(groups, levels = order))
  
  state_overlay <- states(
    cb = TRUE,
    resolution = "20m") %>%
    filter(GEOID != "72") %>%
    shift_geometry()
## Retrieving data for the year 2020
ggplot() +
    geom_sf(data = net_migration, 
            mapping = aes(fill = groups, color = groups), 
            size = 0.1) +
    geom_sf(data = state_overlay, 
            fill = NA, color = "black", size = 0.1) +
    scale_fill_brewer(palette = "PuOr", direction = -1) +
    scale_color_brewer(palette = "PuOr", direction = -1, guide = "none") +
    coord_sf(datum = NA) +
    theme_minimal() +
    labs(title = "Net migration per 1000 residents by county",
         subtitle = "US Census Bureau 2019 Population Estimates",
         fill = "Rate",
         caption = "Data acquired with the R tidycensus package")