Visualizing Social Data
Toggle Dark/Light/Auto mode

Code

Getting started with the nycomplaints data

Setup

library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

## ✔ ggplot2 3.3.5     ✔ purrr   0.3.4
## ✔ tibble  3.1.6     ✔ dplyr   1.0.8
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1

## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::edition_get()   masks testthat::edition_get()
## ✖ dplyr::filter()        masks stats::filter()
## ✖ purrr::is_null()       masks testthat::is_null()
## ✖ dplyr::lag()           masks stats::lag()
## ✖ readr::local_edition() masks testthat::local_edition()
## ✖ dplyr::matches()       masks tidyr::matches(), testthat::matches()
library(socviz)
## 
## Attaching package: 'socviz'

## The following object is masked from 'package:kjhutils':
## 
##     %nin%
library(lubridate) # Date manipulation
## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(tidycensus)
library(sf)
## Linking to GEOS 3.9.1, GDAL 3.2.3, PROJ 7.2.1; sf_use_s2() is TRUE
## Turn on the tigris shapefile cache option; the get_acs(geometry = TRUE)
## calls below fetch map geometry, so this presumably avoids re-downloading
## the same shapefiles on every run (see the tigris/tidycensus docs).
options(tigris_use_cache = TRUE)

## See: https://kjhealy.github.io/nycomplaints
## for some documentation.
## remotes::install_github("kjhealy/nycomplaints")
library(nycomplaints)

Some helpful tables

## Lookup table for the Census/ACS variables used below:
## `varname` is the official ACS variable code, `clean_name` is the short
## column name it is renamed to after the data is reshaped.
acs_vars <- tibble(
  varname = c(
    "B01003_001",
    "B01001B_001",
    "B01001A_001",
    "B01001H_001",
    "B01001I_001",
    "B01001D_001",
    "B19013_001"
  ),
  clean_name = c(
    "pop",
    "black",
    "white",
    "nh_white",
    "hispanic",
    "asian",
    "median_hh_inc"
  )
)

## Crosswalk between county names as they appear in the Census/ACS data
## and the matching NYC borough names used in the complaints data.
ny_county_boros <- tibble(
  county = c(
    "New York County, New York",
    "Queens County, New York",
    "Kings County, New York",
    "Bronx County, New York",
    "Richmond County, New York"
  ),
  borough = c(
    "Manhattan",
    "Queens",
    "Brooklyn",
    "Bronx",
    "Staten Island"
  )
)

ny_county_boros
## # A tibble: 5 × 2
##   county                    borough      
##   <chr>                     <chr>        
## 1 New York County, New York Manhattan    
## 2 Queens County, New York   Queens       
## 3 Kings County, New York    Brooklyn     
## 4 Bronx County, New York    Bronx        
## 5 Richmond County, New York Staten Island
## ZCTA (Zip Code Tabulation Area) data for New York State,
## including geometry (i.e. map data).
## Steps: download long-format estimates, drop the margin-of-error column,
## spread to one column per ACS variable, swap the ACS codes for the short
## names in `acs_vars`, and convert back to an sf object.
zip_data <- get_acs(
  geography = "zcta",
  variables = acs_vars$varname,
  state = "NY",
  year = 2019,
  county = NULL,
  geometry = TRUE
) %>%
  rename(zip = GEOID) %>%
  select(-moe) %>%
  tibble::as_tibble() %>%
  pivot_wider(
    names_from = variable,
    values_from = estimate
  ) %>%
  ## Named vector of the form c(clean_name = "ACS code") drives the rename
  rename(all_of(setNames(acs_vars$varname, acs_vars$clean_name))) %>%
  sf::st_as_sf()
## Getting data from the 2015-2019 5-year ACS
zip_data
## Simple feature collection with 1794 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -79.76215 ymin: 40.4961 xmax: -71.85621 ymax: 45.0129
## Geodetic CRS:  NAD83
## # A tibble: 1,794 × 10
##    zip   NAME                       geometry white black asian nh_white hispanic
##    <chr> <chr>            <MULTIPOLYGON [°]> <dbl> <dbl> <dbl>    <dbl>    <dbl>
##  1 11718 ZCTA5 11… (((-73.27225 40.73045, -…  2902    57     9     2694      216
##  2 14802 ZCTA5 14… (((-77.80317 42.25379, -…  3453   371   204     3260      334
##  3 11219 ZCTA5 11… (((-74.0127 40.62902, -7… 59594  1062 21241    54475    11831
##  4 13343 ZCTA5 13… (((-75.45905 43.70434, -…  1675     0     0     1665       10
##  5 13619 ZCTA5 13… (((-75.80203 43.9531, -7…  9585   294    76     9360      350
##  6 12751 ZCTA5 12… (((-74.68103 41.68604, -…   873    92    23      727      218
##  7 10021 ZCTA5 10… (((-73.95926 40.75928, -… 37414   294  3802    36034     3053
##  8 12814 ZCTA5 12… (((-73.53708 43.63908, -…  1165    61     0     1165       48
##  9 10044 ZCTA5 10… (((-73.96159 40.74982, -…  5341  1567  4264     4673     1603
## 10 11213 ZCTA5 11… (((-73.94782 40.66878, -… 16302 43727  1552    14719     7954
## # … with 1,784 more rows, and 2 more variables: pop <dbl>, median_hh_inc <dbl>
## Data for the 5 NYC Boroughs, with Borough names patched in.
## The ACS release is pinned with `year` so the result does not silently
## change with the default year of whichever tidycensus version is installed.
## 2020 is the 2016-2020 5-year ACS; note that zip_data is built with
## year = 2019, so the two tables come from different releases.
county_data <- get_acs(geography = "county", 
        variables = acs_vars$varname, 
        state = "NY", 
        year = 2020,
        geometry = TRUE) %>% 
  rename(fips = GEOID, 
         county = NAME) %>% 
  ## Drop margins of error so each variable pivots to a single column
  select(-moe) %>% 
  tibble::as_tibble() %>% 
  pivot_wider(names_from = variable, 
              values_from = c(estimate)) %>% 
  ## Replace the ACS codes with the short names from acs_vars; the columns
  ## are selected in acs_vars$varname order so they pair up with clean_name
  rename_with(~ acs_vars$clean_name, 
              all_of(acs_vars$varname)) %>% 
  ## Keep only the five NYC counties and attach their borough names
  filter(county %in% ny_county_boros$county) %>% 
  left_join(ny_county_boros, by = "county") %>% 
  select(fips, county, borough, everything()) %>% 
  sf::st_as_sf()
## Getting data from the 2016-2020 5-year ACS
county_data
## Simple feature collection with 5 features and 10 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -74.25609 ymin: 40.4961 xmax: -73.70036 ymax: 40.91771
## Geodetic CRS:  NAD83
## # A tibble: 5 × 11
##   fips  county   borough                  geometry  white  black  asian nh_white
##   <chr> <chr>    <chr>          <MULTIPOLYGON [°]>  <dbl>  <dbl>  <dbl>    <dbl>
## 1 36061 New Yor… Manhat… (((-73.9995 40.70033, -7… 8.99e5 233476 198678   763202
## 2 36047 Kings C… Brookl… (((-74.04201 40.62605, -… 1.10e6 806746 306741   938573
## 3 36005 Bronx C… Bronx   (((-73.77336 40.85945, -… 3.07e5 497301  55362   128717
## 4 36081 Queens … Queens  (((-73.96262 40.73903, -… 8.14e5 409975 588875   559778
## 5 36085 Richmon… Staten… (((-74.1617 40.64586, -7… 3.40e5  48623  47605   286462
## # … with 3 more variables: hispanic <dbl>, pop <dbl>, median_hh_inc <dbl>
## Explore other ACS variables.
## Stored under its own name: assigning the catalog to `acs_vars` would
## overwrite the varname/clean_name lookup table that the get_acs() pipelines
## read, so re-running them afterwards would fail.
acs_var_catalog <- load_variables(year = 2019, dataset = "acs5")

Some helpful patterns

## Count and Arrange: number of complaints of each type, most common first
nycomplaints %>%
  count(complaint_type, sort = TRUE)
## # A tibble: 57 × 2
##    complaint_type            n
##    <chr>                 <int>
##  1 Housing and Buildings 59207
##  2 Transportation        33916
##  3 Finance               23395
##  4 Immigration           13486
##  5 Sanitation            12336
##  6 General Welfare       12322
##  7 Parks                  9019
##  8 Public Safety          8935
##  9 Environment            7352
## 10 Utilities              7158
## # … with 47 more rows
## Group and count:
## the 3 most frequent complaint types within each borough.
## The result stays grouped by borough, so slice_max() works per borough.
nycomplaints %>%
  group_by(borough) %>%
  count(complaint_type) %>%
  slice_max(n, n = 3)
## # A tibble: 18 × 3
## # Groups:   borough [6]
##    borough       complaint_type            n
##    <chr>         <chr>                 <int>
##  1 Bronx         Housing and Buildings  5694
##  2 Bronx         Finance                3140
##  3 Bronx         Immigration            2334
##  4 Brooklyn      Housing and Buildings 12228
##  5 Brooklyn      Finance                8204
##  6 Brooklyn      Transportation         7103
##  7 Manhattan     Housing and Buildings 28766
##  8 Manhattan     Transportation         5137
##  9 Manhattan     General Welfare        3588
## 10 Queens        Transportation         8201
## 11 Queens        Housing and Buildings  7493
## 12 Queens        Finance                3638
## 13 Staten Island Transportation        10754
## 14 Staten Island Sanitation             5229
## 15 Staten Island Finance                4611
## 16 <NA>          Housing and Buildings  3097
## 17 <NA>          Immigration            2139
## 18 <NA>          Transportation         1839
## Group and calculate proportions: the single most common complaint type
## in each boro, with its share of that boro's complaints.
nycomplaints %>%
  group_by(borough) %>%
  count(complaint_type) %>%
  ## Still grouped by borough here, so sum(n) is the borough total
  mutate(prop = n / sum(n)) %>%
  slice_max(prop, n = 1)
## # A tibble: 6 × 4
## # Groups:   borough [6]
##   borough       complaint_type            n  prop
##   <chr>         <chr>                 <int> <dbl>
## 1 Bronx         Housing and Buildings  5694 0.308
## 2 Brooklyn      Housing and Buildings 12228 0.192
## 3 Manhattan     Housing and Buildings 28766 0.472
## 4 Queens        Transportation         8201 0.172
## 5 Staten Island Transportation        10754 0.326
## 6 <NA>          Housing and Buildings  3097 0.192
## Join tables:
## attach the county_data columns to the aggregated complaints data,
## matching on borough. With this you can calculate e.g. complaints
## per capita within boros. The same approach works for zip codes.
county_complaints <- nycomplaints %>%
  group_by(borough) %>%
  count(complaint_type) %>%
  ## Share of each complaint type within its borough
  mutate(prop = n / sum(n)) %>%
  ## Keep only the top complaint type per borough
  slice_max(prop, n = 1) %>%
  left_join(county_data, by = "borough")

county_complaints
## # A tibble: 6 × 14
## # Groups:   borough [6]
##   borough      complaint_type     n  prop fips  county                  geometry
##   <chr>        <chr>          <int> <dbl> <chr> <chr>         <MULTIPOLYGON [°]>
## 1 Bronx        Housing and B…  5694 0.308 36005 Bronx… (((-73.77336 40.85945, -…
## 2 Brooklyn     Housing and B… 12228 0.192 36047 Kings… (((-74.04201 40.62605, -…
## 3 Manhattan    Housing and B… 28766 0.472 36061 New Y… (((-73.9995 40.70033, -7…
## 4 Queens       Transportation  8201 0.172 36081 Queen… (((-73.96262 40.73903, -…
## 5 Staten Isla… Transportation 10754 0.326 36085 Richm… (((-74.1617 40.64586, -7…
## 6 <NA>         Housing and B…  3097 0.192 <NA>  <NA>                       EMPTY
## # … with 7 more variables: white <dbl>, black <dbl>, asian <dbl>,
## #   nh_white <dbl>, hispanic <dbl>, pop <dbl>, median_hh_inc <dbl>