Example 06: Expanding your Vocabulary

Setup

Code
library(here)      # manage file paths
here() starts at /Users/kjhealy/Documents/courses/vsd
Code
library(socviz)    # data and some useful functions
library(tidyverse) # your friend and mine
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.0     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.1     ✔ tibble    3.2.0
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
Code
## New packages we'll use
library(ggrepel) #<<
library(scales) #<<

Attaching package: 'scales'

The following object is masked from 'package:purrr':

    discard

The following object is masked from 'package:readr':

    col_factor

Text in Plots

Code
elections_historic
# A tibble: 49 × 19
   election  year winner    win_p…¹ ec_pct popul…² popul…³  votes margin runne…⁴
      <int> <int> <chr>     <chr>    <dbl>   <dbl>   <dbl>  <int>  <int> <chr>  
 1       10  1824 John Qui… D.-R.    0.322   0.309 -0.104  1.13e5 -38221 Andrew…
 2       11  1828 Andrew J… Dem.     0.682   0.559  0.122  6.43e5 140839 John Q…
 3       12  1832 Andrew J… Dem.     0.766   0.547  0.178  7.03e5 228628 Henry …
 4       13  1836 Martin V… Dem.     0.578   0.508  0.142  7.63e5 213384 Willia…
 5       14  1840 William … Whig     0.796   0.529  0.0605 1.28e6 145938 Martin…
 6       15  1844 James Po… Dem.     0.618   0.495  0.0145 1.34e6  39413 Henry …
 7       16  1848 Zachary … Whig     0.562   0.473  0.0479 1.36e6 137882 Lewis …
 8       17  1852 Franklin… Dem.     0.858   0.508  0.0695 1.61e6 219525 Winfie…
 9       18  1856 James Bu… Dem.     0.588   0.453  0.122  1.84e6 494472 John F…
10       19  1860 Abraham … Rep.     0.594   0.396  0.101  1.86e6 474049 John B…
# … with 39 more rows, 9 more variables: ru_part <chr>, turnout_pct <dbl>,
#   winner_lname <chr>, winner_label <chr>, ru_lname <chr>, ru_label <chr>,
#   two_term <lgl>, ec_votes <dbl>, ec_denom <dbl>, and abbreviated variable
#   names ¹​win_party, ²​popular_pct, ³​popular_margin, ⁴​runner_up
Code
## Set up long strings as objects, for neatness.
p_title <- "Presidential Elections: Popular & Electoral College Margins"
p_subtitle <- "1824-2016"
p_caption <- "Data for 2016 are provisional."
x_label <- "Winner's share of Popular Vote"
y_label <- "Winner's share of Electoral College Votes"
Code
p <- ggplot(data = elections_historic, 
            mapping = aes(x = popular_pct, 
                          y = ec_pct,
                          label = winner_label))

p + geom_hline(yintercept = 0.5, 
               size = 1.4, 
               color = "gray80") +
    geom_vline(xintercept = 0.5, 
               size = 1.4, 
               color = "gray80") +
    geom_point()
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

Code
p <- ggplot(data = elections_historic, 
            mapping = aes(x = popular_pct, 
                          y = ec_pct,
                          label = winner_label))

p + geom_hline(yintercept = 0.5, 
               linewidth = 1.4, color = "gray80") +
  geom_vline(xintercept = 0.5, 
             linewidth = 1.4, color = "gray80") +
  geom_point() + 
  geom_text_repel()
Warning: ggrepel: 19 unlabeled data points (too many overlaps). Consider
increasing max.overlaps

Resize it with fig.width and fig.height in the chunk options:

Code
p <- ggplot(data = elections_historic, 
            mapping  = aes(x = popular_pct, 
                           y = ec_pct,
                           label = winner_label))
p_out <- p + geom_hline(yintercept = 0.5, 
                        size = 1.4, 
                        color = "gray80") +
  geom_vline(xintercept = 0.5, 
             linewidth = 1.4, 
             color = "gray80") +
  geom_point() +
  geom_text_repel() +#<<
  scale_x_continuous(labels = label_percent()) +
  scale_y_continuous(labels = label_percent()) +
  labs(x = x_label, y = y_label,  #<<
       title = p_title, 
       subtitle = p_subtitle,
       caption = p_caption)   

p_out

Subsetting for text

Code
by_country <- organdata |> 
    group_by(consent_law, country) |>
      summarize(across(where(is.numeric),
                       list(mean = ~ mean(.x, na.rm = TRUE), 
                            sd = ~ sd(.x, na.rm = TRUE))))
`summarise()` has grouped output by 'consent_law'. You can override using the
`.groups` argument.
Code
by_country
# A tibble: 17 × 28
# Groups:   consent_law [2]
   conse…¹ country donor…² donor…³ pop_m…⁴ pop_sd pop_d…⁵ pop_d…⁶ gdp_m…⁷ gdp_sd
   <chr>   <chr>     <dbl>   <dbl>   <dbl>  <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
 1 Inform… Austra…    10.6   1.14   18318. 8.31e2   0.237  0.0107  22179.  3959.
 2 Inform… Canada     14.0   0.751  29608. 1.19e3   0.297  0.0120  23711.  3966.
 3 Inform… Denmark    13.1   1.47    5257. 8.06e1  12.2    0.187   23722.  3896.
 4 Inform… Germany    13.0   0.611  80255. 5.16e3  22.5    1.44    22163.  2501.
 5 Inform… Ireland    19.8   2.48    3674. 1.32e2   5.23   0.187   20824.  6670.
 6 Inform… Nether…    13.7   1.55   15548. 3.73e2  37.4    0.898   23013.  3770.
 7 Inform… United…    13.5   0.775  58187. 6.26e2  24.0    0.258   21359.  3929.
 8 Inform… United…    20.0   1.33  269330. 1.25e4   2.80   0.130   29212.  4571.
 9 Presum… Austria    23.5   2.42    7927. 1.09e2   9.45   0.130   23876.  3343.
10 Presum… Belgium    21.9   1.94   10153. 1.09e2  30.7    0.330   22500.  3171.
11 Presum… Finland    18.4   1.53    5112. 6.86e1   1.51   0.0203  21019.  3668.
12 Presum… France     16.8   1.60   58056. 8.51e2  10.5    0.154   22603.  3260.
13 Presum… Italy      11.1   4.28   57360. 4.25e2  19.0    0.141   21554.  2781.
14 Presum… Norway     15.4   1.11    4386. 9.73e1   1.35   0.0300  26448.  6492.
15 Presum… Spain      28.1   4.96   39666. 9.51e2   7.84   0.188   16933   2888.
16 Presum… Sweden     13.1   1.75    8789. 1.14e2   1.95   0.0253  22415.  3213.
17 Presum… Switze…    14.2   1.71    7037. 1.70e2  17.0    0.411   27233   2153.
# … with 18 more variables: gdp_lag_mean <dbl>, gdp_lag_sd <dbl>,
#   health_mean <dbl>, health_sd <dbl>, health_lag_mean <dbl>,
#   health_lag_sd <dbl>, pubhealth_mean <dbl>, pubhealth_sd <dbl>,
#   roads_mean <dbl>, roads_sd <dbl>, cerebvas_mean <dbl>, cerebvas_sd <dbl>,
#   assault_mean <dbl>, assault_sd <dbl>, external_mean <dbl>,
#   external_sd <dbl>, txp_pop_mean <dbl>, txp_pop_sd <dbl>, and abbreviated
#   variable names ¹​consent_law, ²​donors_mean, ³​donors_sd, ⁴​pop_mean, …

Using subset()

Code
by_country |> 
  ggplot(mapping = aes(x = gdp_mean,
                       y = health_mean)) +
  geom_point() + 
  geom_text_repel(data = subset(by_country, gdp_mean > 25000), 
                  mapping = aes(label = country))

Code
by_country |> 
  ggplot(mapping = aes(x = gdp_mean,
                       y = health_mean)) +
  geom_point() + 
  geom_text_repel(data = subset(by_country, 
                                gdp_mean > 25000 |
                                  health_mean < 1500 |
                                  country %in% "Belgium"), 
                  mapping = aes(label = country))

Pre-filtering

Code
df_hl <- by_country |> 
  filter(gdp_mean > 25000 | 
           health_mean < 1500 | 
           country %in% "Belgium")

df_hl
# A tibble: 6 × 28
# Groups:   consent_law [2]
  consen…¹ country donor…² donor…³ pop_m…⁴ pop_sd pop_d…⁵ pop_d…⁶ gdp_m…⁷ gdp_sd
  <chr>    <chr>     <dbl>   <dbl>   <dbl>  <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
1 Informed Ireland    19.8    2.48   3674. 1.32e2    5.23  0.187   20824.  6670.
2 Informed United…    20.0    1.33 269330. 1.25e4    2.80  0.130   29212.  4571.
3 Presumed Belgium    21.9    1.94  10153. 1.09e2   30.7   0.330   22500.  3171.
4 Presumed Norway     15.4    1.11   4386. 9.73e1    1.35  0.0300  26448.  6492.
5 Presumed Spain      28.1    4.96  39666. 9.51e2    7.84  0.188   16933   2888.
6 Presumed Switze…    14.2    1.71   7037. 1.70e2   17.0   0.411   27233   2153.
# … with 18 more variables: gdp_lag_mean <dbl>, gdp_lag_sd <dbl>,
#   health_mean <dbl>, health_sd <dbl>, health_lag_mean <dbl>,
#   health_lag_sd <dbl>, pubhealth_mean <dbl>, pubhealth_sd <dbl>,
#   roads_mean <dbl>, roads_sd <dbl>, cerebvas_mean <dbl>, cerebvas_sd <dbl>,
#   assault_mean <dbl>, assault_sd <dbl>, external_mean <dbl>,
#   external_sd <dbl>, txp_pop_mean <dbl>, txp_pop_sd <dbl>, and abbreviated
#   variable names ¹​consent_law, ²​donors_mean, ³​donors_sd, ⁴​pop_mean, …
Code
by_country |> 
  ggplot(mapping = aes(x = gdp_mean,
                       y = health_mean)) +
  geom_point() + 
  geom_text_repel(data = df_hl, 
                  mapping = aes(label = country))

Using annotate()

Code
organdata |> 
  ggplot(mapping = aes(x = roads, 
                       y = donors)) + 
  geom_point() + 
  annotate(geom = "text", 
           x = 157, 
           y = 33,
           label = "A surprisingly high \n recovery rate.",
           hjust = 0)
Warning: Removed 34 rows containing missing values (`geom_point()`).

You can layer annotations too:

Code
organdata |> 
  ggplot(mapping = aes(x = roads, 
                       y = donors)) + 
  geom_point() +
  annotate(geom = "rect", 
           xmin = 125, xmax = 155,
           ymin = 30, ymax = 35,
           fill = "red", 
           alpha = 0.2) + 
  annotate(geom = "text", 
           x = 157, y = 33,
           label = "A surprisingly high \n recovery rate.", 
           hjust = 0)
Warning: Removed 34 rows containing missing values (`geom_point()`).

Scale customization

Code
organdata |> 
  ggplot(mapping = aes(x = roads,
                       y = donors,
                       color = world)) + 
  geom_point() +
  scale_y_continuous(breaks = c(5, 15, 25),
                     labels = c("Five", 
                                "Fifteen", 
                                "Twenty Five"))
Warning: Removed 34 rows containing missing values (`geom_point()`).

Code
organdata |> 
  ggplot(mapping = aes(x = roads,
                       y = donors,
                       color = world)) + 
  geom_point() +
  scale_color_discrete(labels =
                         c("Corporatist", 
                           "Liberal",
                           "Social Democratic", 
                           "Unclassified")) +
  labs(x = "Road Deaths",
       y = "Donor Procurement",
       color = "Welfare State")
Warning: Removed 34 rows containing missing values (`geom_point()`).

Code
organdata |> 
  ggplot(mapping = aes(x = roads,
                       y = donors,
                       color = consent_law)) + 
  geom_point() +
  facet_wrap(~ consent_law, ncol = 1) +
  guides(color = "none") + 
  labs(x = "Road Deaths",
       y = "Donor Procurement")
Warning: Removed 34 rows containing missing values (`geom_point()`).

Code
## Using the "classic" ggplot theme here
organdata |> 
  ggplot(mapping = aes(x = roads,
                       y = donors,
                       color = consent_law)) + 
  geom_point() +
  labs(title = "By Consent Law",
    x = "Road Deaths",
    y = "Donor Procurement", 
    color = "Legal Regime:") + 
  theme(legend.position = "bottom", 
        plot.title = element_text(color = "darkred",
                                  face = "bold"))
Warning: Removed 34 rows containing missing values (`geom_point()`).