Visualizing Social Data
Toggle Dark/Light/Auto mode Toggle Dark/Light/Auto mode Toggle Dark/Light/Auto mode

Code

library(here)
## here() starts at /Users/kjhealy/Documents/courses/vsd
library(socviz)
## 
## Attaching package: 'socviz'

## The following object is masked from 'package:kjhutils':
## 
##     %nin%
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

## ✔ ggplot2 3.3.5     ✔ purrr   0.3.4
## ✔ tibble  3.1.6     ✔ dplyr   1.0.8
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1

## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::edition_get()   masks testthat::edition_get()
## ✖ dplyr::filter()        masks stats::filter()
## ✖ purrr::is_null()       masks testthat::is_null()
## ✖ dplyr::lag()           masks stats::lag()
## ✖ readr::local_edition() masks testthat::local_edition()
## ✖ dplyr::matches()       masks tidyr::matches(), testthat::matches()
library(tidycensus)
library(tigris)
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
library(sf)
## Linking to GEOS 3.9.1, GDAL 3.2.3, PROJ 7.2.1; sf_use_s2() is TRUE
library(babynames)

theme_map <- function(base_size=9, base_family="") {
    require(grid)
    theme_bw(base_size=base_size, base_family=base_family) %+replace%
        theme(axis.line=element_blank(),
              axis.text=element_blank(),
              axis.ticks=element_blank(),
              axis.title=element_blank(),
              panel.background=element_blank(),
              panel.border=element_blank(),
              panel.grid=element_blank(),
              panel.spacing=unit(0, "lines"),
              plot.background=element_blank(),
              legend.justification = c(0,0),
              legend.position = c(0,0)
              )
}

Census Data

Population components example again

us_components <- get_estimates(geography = "state", 
                               product = "components")
us_components
## # A tibble: 624 × 4
##    NAME           GEOID variable  value
##    <chr>          <chr> <chr>     <dbl>
##  1 Mississippi    28    BIRTHS    35978
##  2 Missouri       29    BIRTHS    71297
##  3 Montana        30    BIRTHS    11618
##  4 Nebraska       31    BIRTHS    25343
##  5 Nevada         32    BIRTHS    35932
##  6 New Hampshire  33    BIRTHS    12004
##  7 New Jersey     34    BIRTHS    99501
##  8 New Mexico     35    BIRTHS    23125
##  9 New York       36    BIRTHS   222924
## 10 North Carolina 37    BIRTHS   119203
## # … with 614 more rows
unique(us_components$variable)
##  [1] "BIRTHS"            "DEATHS"            "DOMESTICMIG"      
##  [4] "INTERNATIONALMIG"  "NATURALINC"        "NETMIG"           
##  [7] "RBIRTH"            "RDEATH"            "RDOMESTICMIG"     
## [10] "RINTERNATIONALMIG" "RNATURALINC"       "RNETMIG"
net_migration <- get_estimates(geography = "county",
                               variables = "RNETMIG",
                               year = 2019,
                               geometry = TRUE,
                               resolution = "20m") %>%
  shift_geometry() # puts Alaska and Hawaii in the bottom left
net_migration
## Simple feature collection with 3142 features and 4 fields
## Geometry type: GEOMETRY
## Dimension:     XY
## Bounding box:  xmin: -3112200 ymin: -1697728 xmax: 2258154 ymax: 1558935
## Projected CRS: USA_Contiguous_Albers_Equal_Area_Conic
## # A tibble: 3,142 × 5
##    GEOID NAME                         variable  value                   geometry
##    <chr> <chr>                        <chr>     <dbl>         <MULTIPOLYGON [m]>
##  1 29227 Worth County, Missouri       RNETMIG   -8.91 (((114835.6 345071.6, 123…
##  2 31061 Franklin County, Nebraska    RNETMIG  -14.4  (((-267685.1 323958.5, -2…
##  3 36013 Chautauqua County, New York  RNETMIG   -3.54 (((1324221 647717.4, 1334…
##  4 37181 Vance County, North Carolina RNETMIG   -3.25 (((1544260 32202.52, 1547…
##  5 47183 Weakley County, Tennessee    RNETMIG   -1.02 (((625934.5 -98887.34, 63…
##  6 44003 Kent County, Rhode Island    RNETMIG    2.29 (((1977965 726702.3, 2004…
##  7 08101 Pueblo County, Colorado      RNETMIG    6.15 (((-783174.5 122269, -773…
##  8 17175 Stark County, Illinois       RNETMIG  -10.6  (((500559 424779.4, 51023…
##  9 29169 Pulaski County, Missouri     RNETMIG    4.42 (((312851.7 46166.36, 312…
## 10 19151 Pocahontas County, Iowa      RNETMIG  -12.2  (((88185.95 606331.9, 126…
## # … with 3,132 more rows
order <- c("-15 and below", "-15 to -5", 
            "-5 to +5", "+5 to +15", "+15 and up")
  
net_migration <- net_migration %>%
    mutate(groups = case_when(
      value > 15 ~ "+15 and up",
      value > 5 ~ "+5 to +15",
      value > -5 ~ "-5 to +5",
      value > -15 ~ "-15 to -5",
      TRUE ~ "-15 and below"
    )) %>%
    mutate(groups = factor(groups, levels = order))
  
  state_overlay <- states(
    cb = TRUE,
    resolution = "20m") %>%
    filter(GEOID != "72") %>%
    shift_geometry()
## Retrieving data for the year 2020
ggplot() +
    geom_sf(data = net_migration, 
            mapping = aes(fill = groups, color = groups), 
            size = 0.1) +
    geom_sf(data = state_overlay, 
            fill = NA, color = "black", size = 0.1) +
    scale_fill_brewer(palette = "PuOr", direction = -1) +
    scale_color_brewer(palette = "PuOr", direction = -1, guide = "none") +
    coord_sf(datum = NA) +
    theme_minimal() +
    labs(title = "Net migration per 1000 residents by county",
         subtitle = "US Census Bureau 2019 Population Estimates",
         fill = "Rate",
         caption = "Data acquired with the R tidycensus package")

Manipulating Tibbles with Spatial Data

## North Carolina shapefile / polygons, included in sf package
nc <- st_read(system.file("shape/nc.shp", package = "sf"), quiet = T)

nc
## Simple feature collection with 100 features and 14 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -84.32385 ymin: 33.88199 xmax: -75.45698 ymax: 36.58965
## Geodetic CRS:  NAD27
## First 10 features:
##     AREA PERIMETER CNTY_ CNTY_ID        NAME  FIPS FIPSNO CRESS_ID BIR74 SID74
## 1  0.114     1.442  1825    1825        Ashe 37009  37009        5  1091     1
## 2  0.061     1.231  1827    1827   Alleghany 37005  37005        3   487     0
## 3  0.143     1.630  1828    1828       Surry 37171  37171       86  3188     5
## 4  0.070     2.968  1831    1831   Currituck 37053  37053       27   508     1
## 5  0.153     2.206  1832    1832 Northampton 37131  37131       66  1421     9
## 6  0.097     1.670  1833    1833    Hertford 37091  37091       46  1452     7
## 7  0.062     1.547  1834    1834      Camden 37029  37029       15   286     0
## 8  0.091     1.284  1835    1835       Gates 37073  37073       37   420     0
## 9  0.118     1.421  1836    1836      Warren 37185  37185       93   968     4
## 10 0.124     1.428  1837    1837      Stokes 37169  37169       85  1612     1
##    NWBIR74 BIR79 SID79 NWBIR79                       geometry
## 1       10  1364     0      19 MULTIPOLYGON (((-81.47276 3...
## 2       10   542     3      12 MULTIPOLYGON (((-81.23989 3...
## 3      208  3616     6     260 MULTIPOLYGON (((-80.45634 3...
## 4      123   830     2     145 MULTIPOLYGON (((-76.00897 3...
## 5     1066  1606     3    1197 MULTIPOLYGON (((-77.21767 3...
## 6      954  1838     5    1237 MULTIPOLYGON (((-76.74506 3...
## 7      115   350     2     139 MULTIPOLYGON (((-76.00897 3...
## 8      254   594     2     371 MULTIPOLYGON (((-76.56251 3...
## 9      748  1190     2     844 MULTIPOLYGON (((-78.30876 3...
## 10     160  2038     5     176 MULTIPOLYGON (((-80.02567 3...

The spatial information is built in to the GEOMETRY column. geom_sf() understands this.

nc %>%   
  ggplot() +
  geom_sf() 

To show some quantity, map something to the fill aesthetic.

nc %>%   
  ggplot() +
  geom_sf(mapping = aes(fill = NWBIR74)) 

If you have e.g. county or state data in a table, join it to the table with spatial information.

nc_components <- get_estimates(geography = "county",
                               product = "components",
                               state = "North Carolina",
                               geometry = FALSE) # Don't download the spatial information

nc_components

unique(nc_components$variable)

# Get the data in the right shape and
# Make sure we have a column to join on
nc_components <- nc_components %>% 
  mutate(variable = str_to_lower(variable)) %>%
  rename(FIPS = GEOID,
         full_name = NAME) %>% 
  pivot_wider(names_from = "variable", 
              values_from = "value") 

nc_components

# Now join
nc <- nc %>% 
  left_join(nc_components, by = "FIPS")


nc

Now we can plot any of these colunmns.

order <- c("-15 and below", "-15 to -5", 
            "-5 to +5", "+5 to +15", "+15 and up")
  
nc <- nc %>%
    mutate(rnm_grp = case_when(
      rnetmig > 15 ~ "+15 and up",
      rnetmig > 5 ~ "+5 to +15",
      rnetmig > -5 ~ "-5 to +5",
      rnetmig > -15 ~ "-15 to -5",
      TRUE ~ "-15 and below"
    )) %>%
    mutate(rnm_grp = factor(rnm_grp, 
                            levels = order))
  
ggplot() +
  geom_sf(data = nc, 
            mapping = aes(fill = rnm_grp), 
            size = 0.1, 
          color = "gray30") +
    scale_fill_brewer(palette = "PuOr", direction = -1) +
    theme_map() +
    labs(title = "Net migration per 1000 residents by county",
         subtitle = "US Census Bureau 2019 Population Estimates",
         fill = "Rate",
         caption = "Data acquired with the R tidycensus package | @kjhealy")

The Babyname data

# It's already ordered/arranged
babynames
## # A tibble: 1,924,665 × 5
##     year sex   name          n   prop
##    <dbl> <chr> <chr>     <int>  <dbl>
##  1  1880 F     Mary       7065 0.0724
##  2  1880 F     Anna       2604 0.0267
##  3  1880 F     Emma       2003 0.0205
##  4  1880 F     Elizabeth  1939 0.0199
##  5  1880 F     Minnie     1746 0.0179
##  6  1880 F     Margaret   1578 0.0162
##  7  1880 F     Ida        1472 0.0151
##  8  1880 F     Alice      1414 0.0145
##  9  1880 F     Bertha     1320 0.0135
## 10  1880 F     Sarah      1288 0.0132
## # … with 1,924,655 more rows
# But we could make that explicit too
babynames %>% 
  group_by(year, sex) %>% 
  slice_max(order_by = prop, n = 3)
## # A tibble: 828 × 5
## # Groups:   year, sex [276]
##     year sex   name        n   prop
##    <dbl> <chr> <chr>   <int>  <dbl>
##  1  1880 F     Mary     7065 0.0724
##  2  1880 F     Anna     2604 0.0267
##  3  1880 F     Emma     2003 0.0205
##  4  1880 M     John     9655 0.0815
##  5  1880 M     William  9532 0.0805
##  6  1880 M     James    5927 0.0501
##  7  1881 F     Mary     6919 0.0700
##  8  1881 F     Anna     2698 0.0273
##  9  1881 F     Emma     2034 0.0206
## 10  1881 M     John     8769 0.0810
## # … with 818 more rows
# Plot the popularity of a single name
babynames %>% 
  filter(sex == "M" & name == "Oliver") %>% 
  group_by(year) %>% 
  ggplot(mapping = aes(x = year, y = prop)) + 
  geom_line(size = 1.2) + 
  scale_y_continuous(labels = scales::percent) +
   scale_x_continuous(breaks = seq(1880, 2015, by = 10)) +
   labs(y = "Percent of All Names", x = "Year", title = "Oliver")
# Top name for boys and girls each year 
babynames %>% 
  group_by(sex, year) %>% 
  slice_max(n = 1, order_by = prop)
## # A tibble: 276 × 5
## # Groups:   sex, year [276]
##     year sex   name      n   prop
##    <dbl> <chr> <chr> <int>  <dbl>
##  1  1880 F     Mary   7065 0.0724
##  2  1881 F     Mary   6919 0.0700
##  3  1882 F     Mary   8148 0.0704
##  4  1883 F     Mary   8012 0.0667
##  5  1884 F     Mary   9217 0.0670
##  6  1885 F     Mary   9128 0.0643
##  7  1886 F     Mary   9889 0.0643
##  8  1887 F     Mary   9888 0.0636
##  9  1888 F     Mary  11754 0.0620
## 10  1889 F     Mary  11648 0.0616
## # … with 266 more rows
# Plot of Mary
babynames %>% 
  filter(sex == "F") %>% 
  group_by(year) %>% 
  mutate(popularity = -rank(prop)) %>% 
  filter(name == "Mary") %>% 
  ggplot(mapping = aes(x = year, y = popularity)) + 
  geom_point()
babynames %>% 
  group_by(year, sex) %>%
  slice_max(n = 1, order_by = prop) %>% 
  ggplot(mapping = aes(x = year, y = prop, color = sex)) + 
  geom_line(size = 1) + scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(breaks = seq(1880, 2015, by = 10)) +
   labs(y = "Percent", x = "Year",
        title = "Most Popular Name as a Percent of All Names",
   color = "Sex") + 
  theme(legend.position = "top")

Filter by proportion

babynames %>% 
  group_by(year, sex) %>%
  filter(prop <= 0.001) %>%
  group_by(year, sex) %>% 
  tally() %>%   
  ggplot(mapping = aes(x = year, y = n, color = sex)) + 
  geom_line(size = 1) + scale_y_continuous(labels = scales::comma) +
  scale_x_continuous(breaks = seq(1880, 2015, by = 10)) +
  labs(y = "Count", x = "Year", 
       title = "Names with a frequency of 1 in 1,000 or less",
       color = "Sex") + 
  theme(legend.position = "top")

Write a function to draw a name plot

babynames %>% 
  filter(sex == "M" & name == "Oliver") %>% 
  group_by(year) %>% 
  ggplot(mapping = aes(x = year, y = prop)) + 
  geom_line(size = 1.2) + 
  scale_y_continuous(labels = scales::percent) +
   scale_x_continuous(breaks = seq(1880, 2015, by = 10)) +
   labs(y = "Percent of All Names", x = "Year", title = "Oliver")

We don’t want to have to copy and paste this code every time we’d like to look at a new name. Instead, we’ll write a function to draw the plot for any given name, and map it to a some vector of names.

First let’s write the function.

## A bit more efficient
make_nameplot <- function(the_name, the_sex) {
  
  babynames %>% 
    filter(sex == the_sex) %>% 
    filter(name == the_name) %>% 
    group_by(year) %>% 
    ggplot(mapping = aes(x = year, y = prop)) + 
    geom_line(size = 1.1) + 
    scale_y_continuous(labels = scales::percent) +
    scale_x_continuous(breaks = seq(1880, 2015, by = 10)) +
    labs(y = "Percent of All Names", x = "Year", title = the_name)
  
}

Next, which names? Here are some:

## We'll assign gender on the basis of the most 
## common designation of the _name_ in the babynames 
## data, not my guess about people in particular

example_names <- tibble(
  name = c("Thorin", "Melanie", "Jacob", "Vincent", "Christina", "Livia", "Elyse", "Akash", "Eavan", "Krishna", "Jonah", "Jackson", "Jamael", "Georgiana", "Daniel"),
  gen = c("M", "F", "M", "M", "F", "F", "F", "M", "F", "F", "M", "M", "M", "F", "M")
)

example_names
## # A tibble: 15 × 2
##    name      gen  
##    <chr>     <chr>
##  1 Thorin    M    
##  2 Melanie   F    
##  3 Jacob     M    
##  4 Vincent   M    
##  5 Christina F    
##  6 Livia     F    
##  7 Elyse     F    
##  8 Akash     M    
##  9 Eavan     F    
## 10 Krishna   F    
## 11 Jonah     M    
## 12 Jackson   M    
## 13 Jamael    M    
## 14 Georgiana F    
## 15 Daniel    M

Now we’re going to feed this data one bit at a time to the make_nameplot() function we just wrote. This is called mapping a function. When we only need to pass one piece of information to a function to get it to work properly, we use map(). But our plotting function needs to know two things: the name we want and the sex category we want to look within. So we use map2().

example_names <- example_names %>% 
  mutate(plot = map2(name, gen, make_nameplot))

example_names
## # A tibble: 15 × 3
##    name      gen   plot  
##    <chr>     <chr> <list>
##  1 Thorin    M     <gg>  
##  2 Melanie   F     <gg>  
##  3 Jacob     M     <gg>  
##  4 Vincent   M     <gg>  
##  5 Christina F     <gg>  
##  6 Livia     F     <gg>  
##  7 Elyse     F     <gg>  
##  8 Akash     M     <gg>  
##  9 Eavan     F     <gg>  
## 10 Krishna   F     <gg>  
## 11 Jonah     M     <gg>  
## 12 Jackson   M     <gg>  
## 13 Jamael    M     <gg>  
## 14 Georgiana F     <gg>  
## 15 Daniel    M     <gg>

Isn’t that nice? Using mutate() we’ve added a column to the table of data. Each entry in that column is a ggplot object that draws our plot for that name. We could look at them one by one, but let’s walk the list to print each one of them right here.

walk(example_names$plot, print)

Notice that these plots are not comparable — not just on the y-axis, but on the x-axis as well. Some names don’t show up in the data until later years. Think about how you could make a comparable plot using these names.

Name Heterogeneity

babynames %>% 
  group_by(year, sex) %>%
  slice(which.max(prop)) %>% 
  ggplot(mapping = aes(x = year, y = prop, color = sex)) + 
  geom_line(size = 1) + scale_y_continuous(labels = scales::percent) +
   scale_x_continuous(breaks = seq(1880, 2015, by = 10)) +
   labs(y = "Percent", x = "Year",
        title = "Most Popular Name as a Percent of All Names",
   color = "Gender") + theme(legend.position = "top")

Julia Silge’s example

babynames %>%
    group_by(year, sex) %>%
    mutate(top_100 = row_number() <= 100) %>%
    ungroup() %>%
    # Silge uses count() with several variables and a weight. 
    # This is equivalent to group_by(year, sex, top100) %>% count(n)
    count(year, sex, top_100, wt = n) %>%
    mutate(top_100 = if_else(top_100, "Top 100 names", "All other names"),
           sex = if_else(sex == "F", "Baby girls", "Baby boys")) %>%
    group_by(year, sex) %>%
    mutate(prop = n / sum(n)) %>%
    ungroup() %>%
    ggplot(mapping = aes(x = year, y = prop, fill = top_100)) +
    geom_area(alpha = 0.8) +
    facet_wrap(vars(sex)) +
    scale_fill_brewer(palette = "Paired") +
    scale_y_continuous(labels = scales::percent) +
    labs(x = NULL, y = NULL, fill = NULL,
         title = "What proportion of babies are given the most common names?",
         subtitle = "Proportion of babies given one of the top 100 names has been dropping, especially since about 1990") +
    theme(legend.position = "bottom") +
    guides(fill = guide_legend(reverse = TRUE))

Which year was a name at the peak of its popularity?

## One way ... (will work generally)
babynames %>% 
  filter(sex == "F", 
         name %in% c("Linda", "Jennifer"))  %>%
    group_by(name) %>%
    mutate(max_yr = year[prop == max(prop)],
           year_c = year - max_yr)
## # A tibble: 238 × 7
## # Groups:   name [2]
##     year sex   name      n     prop max_yr year_c
##    <dbl> <chr> <chr> <int>    <dbl>  <dbl>  <dbl>
##  1  1880 F     Linda    27 0.000277   1948    -68
##  2  1881 F     Linda    38 0.000384   1948    -67
##  3  1882 F     Linda    36 0.000311   1948    -66
##  4  1883 F     Linda    49 0.000408   1948    -65
##  5  1884 F     Linda    33 0.000240   1948    -64
##  6  1885 F     Linda    60 0.000423   1948    -63
##  7  1886 F     Linda    49 0.000319   1948    -62
##  8  1887 F     Linda    50 0.000322   1948    -61
##  9  1888 F     Linda    77 0.000406   1948    -60
## 10  1889 F     Linda    74 0.000391   1948    -59
## # … with 228 more rows
## Alternatively (assuming the data are sorted)
babynames %>% 
  filter(sex == "F", 
         name %in% c("Linda", "Jennifer"))  %>%
    group_by(name) %>%
    arrange(desc(prop)) %>% 
    mutate(year_c = year - first(year)) %>% 
    arrange(year)
## # A tibble: 238 × 6
## # Groups:   name [2]
##     year sex   name      n     prop year_c
##    <dbl> <chr> <chr> <int>    <dbl>  <dbl>
##  1  1880 F     Linda    27 0.000277    -68
##  2  1881 F     Linda    38 0.000384    -67
##  3  1882 F     Linda    36 0.000311    -66
##  4  1883 F     Linda    49 0.000408    -65
##  5  1884 F     Linda    33 0.000240    -64
##  6  1885 F     Linda    60 0.000423    -63
##  7  1886 F     Linda    49 0.000319    -62
##  8  1887 F     Linda    50 0.000322    -61
##  9  1888 F     Linda    77 0.000406    -60
## 10  1889 F     Linda    74 0.000391    -59
## # … with 228 more rows

Heterogeneity: Blau and Shannon Indices

## These functions work on data that we've already got a frequency count for

get_blau <- function(feat_n){
    1 - sum((feat_n / sum(feat_n))^2)
  }
  
  
get_shannon <- function(feat_n){
    prop <- feat_n / sum(feat_n)
    -sum(prop * log(prop))
  }
  
babynames %>%
    group_by(year, sex) %>%
    summarize(blau = get_blau(n),
              shannon = get_shannon(n)) %>%
    ggplot(aes(x = year, y = shannon, color = sex)) +
    geom_line(size = 1.2) +
    scale_x_continuous(breaks = seq(1880, 2015, by = 10)) +
    labs(y = "Shannon Index",
         x = "Year",
         color = "Gender") +
    theme(legend.position = "top")
## `summarise()` has grouped output by 'year'. You can override using the `.groups`
## argument.

Skew and Kurtosis

#install.packages(e1071)
library(e1071)

babynames %>% 
  filter(sex == "F", 
         name %in% c("Linda", "Jennifer"))  %>%
  group_by(name) %>% 
  summarize(kurt = kurtosis(prop, na.rm = TRUE), 
            skew = skewness(prop, na.rm = TRUE))
## # A tibble: 2 × 3
##   name      kurt  skew
##   <chr>    <dbl> <dbl>
## 1 Jennifer 0.886  1.52
## 2 Linda    5.44   2.41
babynames %>% 
  filter(sex == "F", 
         name %in% c("Linda", "Jennifer"))  %>%
  ggplot(aes(x = year, y = prop, color = name)) + 
  geom_line()

Last letter of boys’ names

babynames %>%
    filter(sex == "M") %>%
    # A trick with str_sub() to get the last character: use -1
    mutate(endletter = str_sub(name, -1)) %>%
    group_by(year, endletter) %>%
    summarize(letter_count = n()) %>%
    # Calculate the proportion of end letters each year 
    # Then recode them to just focus on N, E, with all 
    # other letters in a third category 
    mutate(letter_prop = letter_count / sum(letter_count), 
           letter_rc = case_when(
             endletter == "n" ~ "N",
             endletter == "e" ~ "E", 
             TRUE ~ "Other letters"
           ), 
           letter_rc = factor(letter_rc, 
                              levels = c("E", "N", 
                                         "Other letters"), 
                              ordered = TRUE)) %>% 
  ggplot(mapping = aes(x = year, 
                       y = letter_prop, 
                       group = endletter, 
                       color = letter_rc)) + 
  geom_line(size = 0.5) + 
  scale_y_continuous(labels = scales::percent_format()) + 
  scale_color_brewer(palette = "Set2") + 
  guides(color = guide_legend(nrow = 1)) +
  labs(x = "Year", 
       y = "Percent of newborn boys' names ending in this letter", 
       color = "Letter:",
       title = "ONE OF N",
       subtitle = "End-letters of boys' names in the U.S. since 1880", 
       caption = "Data: United States Social Secturity Administration. Graph: @kjhealy") + 
  theme(legend.position = "top")
## `summarise()` has grouped output by 'year'. You can override using the `.groups`
## argument.