Example 06: Expanding your Vocabulary

Setup

Code
library(here)      # manage file paths
here() starts at /Users/kjhealy/Documents/courses/vsd
Code
library(socviz)    # data and some useful functions
library(tidyverse) # your friend and mine
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.2     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Code
## New packages we'll use
library(ggrepel) #<<
library(scales) #<<

Attaching package: 'scales'

The following object is masked from 'package:purrr':

    discard

The following object is masked from 'package:readr':

    col_factor

Text in Plots

Code
elections_historic
# A tibble: 49 × 19
   election  year winner      win_party ec_pct popular_pct popular_margin  votes
      <int> <int> <chr>       <chr>      <dbl>       <dbl>          <dbl>  <int>
 1       10  1824 John Quinc… D.-R.      0.322       0.309        -0.104  1.13e5
 2       11  1828 Andrew Jac… Dem.       0.682       0.559         0.122  6.43e5
 3       12  1832 Andrew Jac… Dem.       0.766       0.547         0.178  7.03e5
 4       13  1836 Martin Van… Dem.       0.578       0.508         0.142  7.63e5
 5       14  1840 William He… Whig       0.796       0.529         0.0605 1.28e6
 6       15  1844 James Polk  Dem.       0.618       0.495         0.0145 1.34e6
 7       16  1848 Zachary Ta… Whig       0.562       0.473         0.0479 1.36e6
 8       17  1852 Franklin P… Dem.       0.858       0.508         0.0695 1.61e6
 9       18  1856 James Buch… Dem.       0.588       0.453         0.122  1.84e6
10       19  1860 Abraham Li… Rep.       0.594       0.396         0.101  1.86e6
# ℹ 39 more rows
# ℹ 11 more variables: margin <int>, runner_up <chr>, ru_part <chr>,
#   turnout_pct <dbl>, winner_lname <chr>, winner_label <chr>, ru_lname <chr>,
#   ru_label <chr>, two_term <lgl>, ec_votes <dbl>, ec_denom <dbl>
Code
## Set up long strings as objects, for neatness.
p_title <- "Presidential Elections: Popular & Electoral College Margins"
p_subtitle <- "1824-2016"
p_caption <- "Data for 2016 are provisional."
x_label <- "Winner's share of Popular Vote"
y_label <- "Winner's share of Electoral College Votes"
Code
p <- ggplot(data = elections_historic, 
            mapping = aes(x = popular_pct, 
                          y = ec_pct,
                          label = winner_label))

p + geom_hline(yintercept = 0.5, 
               size = 1.4, 
               color = "gray80") +
    geom_vline(xintercept = 0.5, 
               size = 1.4, 
               color = "gray80") +
    geom_point()
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

Code
p <- ggplot(data = elections_historic, 
            mapping = aes(x = popular_pct, 
                          y = ec_pct,
                          label = winner_label))

p + geom_hline(yintercept = 0.5, 
               linewidth = 1.4, color = "gray80") +
  geom_vline(xintercept = 0.5, 
             linewidth = 1.4, color = "gray80") +
  geom_point() + 
  geom_text_repel()
Warning: ggrepel: 19 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

Resize it with fig.width and fig.height in the chunk options:

Code
p <- ggplot(data = elections_historic, 
            mapping  = aes(x = popular_pct, 
                           y = ec_pct,
                           label = winner_label))
p_out <- p + geom_hline(yintercept = 0.5, 
                        size = 1.4, 
                        color = "gray80") +
  geom_vline(xintercept = 0.5, 
             linewidth = 1.4, 
             color = "gray80") +
  geom_point() +
  geom_text_repel() +#<<
  scale_x_continuous(labels = label_percent()) +
  scale_y_continuous(labels = label_percent()) +
  labs(x = x_label, y = y_label,  #<<
       title = p_title, 
       subtitle = p_subtitle,
       caption = p_caption)   

p_out

Subsetting for text

Code
by_country <- organdata |> 
    group_by(consent_law, country) |>
      summarize(across(where(is.numeric),
                       list(mean = ~ mean(.x, na.rm = TRUE), 
                            sd = ~ sd(.x, na.rm = TRUE))))
`summarise()` has grouped output by 'consent_law'. You can override using the
`.groups` argument.
Code
by_country
# A tibble: 17 × 28
# Groups:   consent_law [2]
   consent_law country       donors_mean donors_sd pop_mean pop_sd pop_dens_mean
   <chr>       <chr>               <dbl>     <dbl>    <dbl>  <dbl>         <dbl>
 1 Informed    Australia            10.6     1.14    18318. 8.31e2         0.237
 2 Informed    Canada               14.0     0.751   29608. 1.19e3         0.297
 3 Informed    Denmark              13.1     1.47     5257. 8.06e1        12.2  
 4 Informed    Germany              13.0     0.611   80255. 5.16e3        22.5  
 5 Informed    Ireland              19.8     2.48     3674. 1.32e2         5.23 
 6 Informed    Netherlands          13.7     1.55    15548. 3.73e2        37.4  
 7 Informed    United Kingd…        13.5     0.775   58187. 6.26e2        24.0  
 8 Informed    United States        20.0     1.33   269330. 1.25e4         2.80 
 9 Presumed    Austria              23.5     2.42     7927. 1.09e2         9.45 
10 Presumed    Belgium              21.9     1.94    10153. 1.09e2        30.7  
11 Presumed    Finland              18.4     1.53     5112. 6.86e1         1.51 
12 Presumed    France               16.8     1.60    58056. 8.51e2        10.5  
13 Presumed    Italy                11.1     4.28    57360. 4.25e2        19.0  
14 Presumed    Norway               15.4     1.11     4386. 9.73e1         1.35 
15 Presumed    Spain                28.1     4.96    39666. 9.51e2         7.84 
16 Presumed    Sweden               13.1     1.75     8789. 1.14e2         1.95 
17 Presumed    Switzerland          14.2     1.71     7037. 1.70e2        17.0  
# ℹ 21 more variables: pop_dens_sd <dbl>, gdp_mean <dbl>, gdp_sd <dbl>,
#   gdp_lag_mean <dbl>, gdp_lag_sd <dbl>, health_mean <dbl>, health_sd <dbl>,
#   health_lag_mean <dbl>, health_lag_sd <dbl>, pubhealth_mean <dbl>,
#   pubhealth_sd <dbl>, roads_mean <dbl>, roads_sd <dbl>, cerebvas_mean <dbl>,
#   cerebvas_sd <dbl>, assault_mean <dbl>, assault_sd <dbl>,
#   external_mean <dbl>, external_sd <dbl>, txp_pop_mean <dbl>,
#   txp_pop_sd <dbl>

Using subset()

Code
by_country |> 
  ggplot(mapping = aes(x = gdp_mean,
                       y = health_mean)) +
  geom_point() + 
  geom_text_repel(data = subset(by_country, gdp_mean > 25000), 
                  mapping = aes(label = country))

Code
by_country |> 
  ggplot(mapping = aes(x = gdp_mean,
                       y = health_mean)) +
  geom_point() + 
  geom_text_repel(data = subset(by_country, 
                                gdp_mean > 25000 |
                                  health_mean < 1500 |
                                  country %in% "Belgium"), 
                  mapping = aes(label = country))

Pre-filtering

Code
df_hl <- by_country |> 
  filter(gdp_mean > 25000 | 
           health_mean < 1500 | 
           country %in% "Belgium")

df_hl
# A tibble: 6 × 28
# Groups:   consent_law [2]
  consent_law country       donors_mean donors_sd pop_mean  pop_sd pop_dens_mean
  <chr>       <chr>               <dbl>     <dbl>    <dbl>   <dbl>         <dbl>
1 Informed    Ireland              19.8      2.48    3674.   132.           5.23
2 Informed    United States        20.0      1.33  269330. 12545.           2.80
3 Presumed    Belgium              21.9      1.94   10153.   109.          30.7 
4 Presumed    Norway               15.4      1.11    4386.    97.3          1.35
5 Presumed    Spain                28.1      4.96   39666.   951.           7.84
6 Presumed    Switzerland          14.2      1.71    7037.   170.          17.0 
# ℹ 21 more variables: pop_dens_sd <dbl>, gdp_mean <dbl>, gdp_sd <dbl>,
#   gdp_lag_mean <dbl>, gdp_lag_sd <dbl>, health_mean <dbl>, health_sd <dbl>,
#   health_lag_mean <dbl>, health_lag_sd <dbl>, pubhealth_mean <dbl>,
#   pubhealth_sd <dbl>, roads_mean <dbl>, roads_sd <dbl>, cerebvas_mean <dbl>,
#   cerebvas_sd <dbl>, assault_mean <dbl>, assault_sd <dbl>,
#   external_mean <dbl>, external_sd <dbl>, txp_pop_mean <dbl>,
#   txp_pop_sd <dbl>
Code
by_country |> 
  ggplot(mapping = aes(x = gdp_mean,
                       y = health_mean)) +
  geom_point() + 
  geom_text_repel(data = df_hl, 
                  mapping = aes(label = country))

Using annotate()

Code
organdata |> 
  ggplot(mapping = aes(x = roads, 
                       y = donors)) + 
  geom_point() + 
  annotate(geom = "text", 
           x = 157, 
           y = 33,
           label = "A surprisingly high \n recovery rate.",
           hjust = 0)
Warning: Removed 34 rows containing missing values (`geom_point()`).

You can layer annotations too:

Code
organdata |> 
  ggplot(mapping = aes(x = roads, 
                       y = donors)) + 
  geom_point() +
  annotate(geom = "rect", 
           xmin = 125, xmax = 155,
           ymin = 30, ymax = 35,
           fill = "red", 
           alpha = 0.2) + 
  annotate(geom = "text", 
           x = 157, y = 33,
           label = "A surprisingly high \n recovery rate.", 
           hjust = 0)
Warning: Removed 34 rows containing missing values (`geom_point()`).

Scale customization

Code
organdata |> 
  ggplot(mapping = aes(x = roads,
                       y = donors,
                       color = world)) + 
  geom_point() +
  scale_y_continuous(breaks = c(5, 15, 25),
                     labels = c("Five", 
                                "Fifteen", 
                                "Twenty Five"))
Warning: Removed 34 rows containing missing values (`geom_point()`).

Code
organdata |> 
  ggplot(mapping = aes(x = roads,
                       y = donors,
                       color = world)) + 
  geom_point() +
  scale_color_discrete(labels =
                         c("Corporatist", 
                           "Liberal",
                           "Social Democratic", 
                           "Unclassified")) +
  labs(x = "Road Deaths",
       y = "Donor Procurement",
       color = "Welfare State")
Warning: Removed 34 rows containing missing values (`geom_point()`).

Code
organdata |> 
  ggplot(mapping = aes(x = roads,
                       y = donors,
                       color = consent_law)) + 
  geom_point() +
  facet_wrap(~ consent_law, ncol = 1) +
  guides(color = "none") + 
  labs(x = "Road Deaths",
       y = "Donor Procurement")
Warning: Removed 34 rows containing missing values (`geom_point()`).

Code
## Using the "classic" ggplot theme here
organdata |> 
  ggplot(mapping = aes(x = roads,
                       y = donors,
                       color = consent_law)) + 
  geom_point() +
  labs(title = "By Consent Law",
    x = "Road Deaths",
    y = "Donor Procurement", 
    color = "Legal Regime:") + 
  theme(legend.position = "bottom", 
        plot.title = element_text(color = "darkred",
                                  face = "bold"))
Warning: Removed 34 rows containing missing values (`geom_point()`).