Visualizing Social Data
Toggle Dark/Light/Auto mode

Code

library(tidyverse)
library(gapminder)
library(socviz)

Geoms can transform data

# Base plot shared by the three bar charts below: `bigregion` on the x axis.
p <- ggplot(data = gss_sm,
            mapping = aes(x = bigregion))

# Default geom_bar(): no y is mapped, so the geom's stat counts the rows in
# each category and uses that count as the bar height.
p + geom_bar()

# First attempt at proportions. `prop` is computed within each group, and by
# default every x category is its own group, so every bar comes out with
# height 1. after_stat(prop) replaces the deprecated `..prop..` notation.
p + geom_bar(mapping = aes(y = after_stat(prop)))

# Corrected version: `group = 1` puts all rows into a single group, so each
# bar is that category's share of the whole.
p + geom_bar(mapping = aes(y = after_stat(prop), group = 1))

Histograms and Density Plots

# Histogram of `area` using the default binning.
p <- ggplot(midwest, aes(x = area))
p + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same base plot, this time with the number of bins chosen explicitly.
p + geom_histogram(bins = 10)

# Overlaid, semi-transparent histograms of `percollege` for two states only.
oh_wi <- c("OH", "WI")

p <- ggplot(filter(midwest, state %in% oh_wi),
            aes(x = percollege, fill = state))
p + geom_histogram(alpha = 0.4, bins = 20)

# Kernel density estimate of `area` for the full data set.
p <- ggplot(midwest, aes(x = area))
p + geom_density()

# One density curve per state; fill and outline both follow `state`.
p <- ggplot(midwest, aes(x = area, fill = state, color = state))
p + geom_density(alpha = 0.3)

Avoid transformations when necessary with geom_col()

# `titanic` is already summarized: one row per fate/sex combination, with the
# count (`n`) and its percentage (`percent`).
titanic
##       fate    sex    n percent
## 1 perished   male 1364    62.0
## 2 perished female  126     5.7
## 3 survived   male  367    16.7
## 4 survived female  344    15.6

# The heights are already in the table, so plot them as given instead of
# letting the geom count rows. geom_col() is the identity-stat form of
# geom_bar().
p <- ggplot(titanic, aes(x = fate, y = percent, fill = sex))
p +
  geom_col(position = "dodge") +
  theme(legend.position = "top")
# `oecd_sum` has one row per year; the plot below uses `year`, `diff` and
# `hi_lo`.
oecd_sum
## # A tibble: 57 × 5
## # Groups:   year [57]
##     year other   usa  diff hi_lo
##    <int> <dbl> <dbl> <dbl> <chr>
##  1  1960  68.6  69.9 1.30  Below
##  2  1961  69.2  70.4 1.20  Below
##  3  1962  68.9  70.2 1.30  Below
##  4  1963  69.1  70   0.900 Below
##  5  1964  69.5  70.3 0.800 Below
##  6  1965  69.6  70.3 0.700 Below
##  7  1966  69.9  70.3 0.400 Below
##  8  1967  70.1  70.7 0.600 Below
##  9  1968  70.1  70.4 0.300 Below
## 10  1969  70.1  70.6 0.5   Below
## # … with 47 more rows
p <- ggplot(data = oecd_sum,
            mapping = aes(x = year, y = diff, fill = hi_lo))
# One column per year, filled by `hi_lo`. The fill legend is suppressed with
# guides(fill = "none"); the older `fill = FALSE` spelling is deprecated and
# emits a warning.
p + geom_col() + guides(fill = "none") +
  labs(x = NULL, y = "Difference in Years",
       title = "The US Life Expectancy Gap",
       subtitle = "Difference between US and OECD average life expectancies, 1960-2015",
       caption = "Data: OECD. After a chart by Christopher Ingraham,
                  Washington Post, December 27th 2017.")

## Warning: Removed 1 rows containing missing values (position_stack).

Frequency Plots the Awkward Way: Don’t do this any more than absolutely necessary

# Raw counts per religion category, for comparison with the bar charts.
with(gss_sm, table(religion))
## religion
## Protestant   Catholic     Jewish       None      Other 
##       1371        649         51        619        159

# Mapping religion to `color` affects only the outline of each bar.
p <- ggplot(data = gss_sm,
            mapping = aes(x = religion, color = religion))
p + geom_bar()

# Mapping religion to `fill` colors the bar interiors. The legend would only
# repeat the x axis, so it is switched off. guides(fill = "none") is the
# supported spelling; `fill = FALSE` is deprecated and emits a warning.
p <- ggplot(data = gss_sm,
            mapping = aes(x = religion, fill = religion))
p + geom_bar() + guides(fill = "none")

# Cross-classification: regions on the x axis, religion as the fill. The same
# base plot is reused for the four variants below.
p <- ggplot(data = gss_sm,
            mapping = aes(x = bigregion,
                          fill = religion))

# Default position: counts stacked within each region.
p + geom_bar()

# position = "fill": every region's stack is rescaled to the same height, so
# the segments show shares within each region.
p + geom_bar(position = "fill")

# Side-by-side bars of after_stat(prop) without an explicit group: each
# region/religion bar forms its own group, so every proportion is 1.
# after_stat(prop) replaces the deprecated `..prop..` notation.
p + geom_bar(position = "dodge",
             mapping = aes(y = after_stat(prop)))

# Grouping by religion makes `prop` a share within each religion, spread
# across the regions, which is not the same as shares within each region.
p + geom_bar(position = "dodge",
             mapping = aes(y = after_stat(prop),
                           group = religion))

This gets awkward and a bit confusing quite fast. In general it’s better not to try to make ggplot do too much right at the plotting stage. Instead, pre-compute the numbers you want to show. We’ll see how to do that next.

Dplyr pipelines: Much better

# Count respondents in every region/religion cell, then turn the counts into
# shares. After tally() the table is still grouped by `bigregion`, so sum(n)
# is the regional total and `freq` is a within-region share; `pct` is that
# share as a percentage rounded to one decimal place.
rel_by_region <- gss_sm %>%
  group_by(bigregion, religion) %>%
  tally() %>%
  mutate(freq = n / sum(n)) %>%
  mutate(pct = round(100 * freq, 1))

# Print the summary table.
rel_by_region
## # A tibble: 24 × 5
## # Groups:   bigregion [4]
##    bigregion religion       n    freq   pct
##    <fct>     <fct>      <int>   <dbl> <dbl>
##  1 Northeast Protestant   158 0.324    32.4
##  2 Northeast Catholic     162 0.332    33.2
##  3 Northeast Jewish        27 0.0553    5.5
##  4 Northeast None         112 0.230    23  
##  5 Northeast Other         28 0.0574    5.7
##  6 Northeast <NA>           1 0.00205   0.2
##  7 Midwest   Protestant   325 0.468    46.8
##  8 Midwest   Catholic     172 0.247    24.7
##  9 Midwest   Jewish         3 0.00432   0.4
## 10 Midwest   None         157 0.226    22.6
## # … with 14 more rows

Avoiding legends

# Horizontal columns of the pre-computed percentages, one panel per region.
p <- ggplot(data = rel_by_region,
            mapping = aes(x = pct, y = religion, fill = religion))
# The y axis already names each religion, so the fill legend is dropped.
# guides(fill = "none") is the supported spelling; `fill = FALSE` is
# deprecated and emits a warning.
p + geom_col() +
    labs(x = "Percent", y = NULL) +
    guides(fill = "none") +
    facet_wrap(~ bigregion, nrow = 1)

n(), tally(), and count()

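Compare and contrast. All three produce the same counts, but the grouping of the result differs: the `summarize()` and `tally()` results are still grouped by `bigregion`, while the `count()` result is ungrouped.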

# Variant 1: group explicitly, then count the rows of each group with n().
# summarize() reports that its output is still grouped by `bigregion`.
gss_sm %>%
    group_by(bigregion, religion) %>%
    summarize(n = n()) 
## `summarise()` has grouped output by 'bigregion'. You can override using the
## `.groups` argument.

## # A tibble: 24 × 3
## # Groups:   bigregion [4]
##    bigregion religion       n
##    <fct>     <fct>      <int>
##  1 Northeast Protestant   158
##  2 Northeast Catholic     162
##  3 Northeast Jewish        27
##  4 Northeast None         112
##  5 Northeast Other         28
##  6 Northeast <NA>           1
##  7 Midwest   Protestant   325
##  8 Midwest   Catholic     172
##  9 Midwest   Jewish         3
## 10 Midwest   None         157
## # … with 14 more rows
# Variant 2: tally() counts the rows of each existing group, so no explicit
# n() is needed. The result is still grouped by `bigregion`.
gss_sm %>%
    group_by(bigregion, religion) %>%
    tally()
## # A tibble: 24 × 3
## # Groups:   bigregion [4]
##    bigregion religion       n
##    <fct>     <fct>      <int>
##  1 Northeast Protestant   158
##  2 Northeast Catholic     162
##  3 Northeast Jewish        27
##  4 Northeast None         112
##  5 Northeast Other         28
##  6 Northeast <NA>           1
##  7 Midwest   Protestant   325
##  8 Midwest   Catholic     172
##  9 Midwest   Jewish         3
## 10 Midwest   None         157
## # … with 14 more rows
# Variant 3: count() takes the grouping variables directly, with no separate
# group_by() step. The printed result has no grouping.
gss_sm %>%
    count(bigregion, religion)
## # A tibble: 24 × 3
##    bigregion religion       n
##    <fct>     <fct>      <int>
##  1 Northeast Protestant   158
##  2 Northeast Catholic     162
##  3 Northeast Jewish        27
##  4 Northeast None         112
##  5 Northeast Other         28
##  6 Northeast <NA>           1
##  7 Midwest   Protestant   325
##  8 Midwest   Catholic     172
##  9 Midwest   Jewish         3
## 10 Midwest   None         157
## # … with 14 more rows

Kinds of facet

# Base plot shared by the three faceting examples: `childs` against `age`.
p <- ggplot(gss_sm, aes(x = age, y = childs))

# facet_wrap() on a single variable: one panel per value of `race`.
p +
  geom_point(alpha = 0.2) +
  geom_smooth() +
  facet_wrap(~ race)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## Warning: Removed 18 rows containing non-finite values (stat_smooth).

## Warning: Removed 18 rows containing missing values (geom_point).

# facet_wrap() on two variables: one panel per sex/race combination, all
# laid out in a single row.
p +
  geom_point(alpha = 0.2) +
  geom_smooth() +
  facet_wrap(~ sex + race, nrow = 1)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## Warning: Removed 18 rows containing non-finite values (stat_smooth).

## Warning: Removed 18 rows containing missing values (geom_point).

# facet_grid(): regions down the rows, sex/race combinations across the
# columns, with a linear fit in each cell.
p +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm") +
  facet_grid(bigregion ~ sex + race)
## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 18 rows containing non-finite values (stat_smooth).

## Warning: Removed 18 rows containing missing values (geom_point).

The Organ Donation Data

# Print the organ donation data: one row per country and year.
organdata
## # A tibble: 238 × 21
##    country   year       donors   pop pop_dens   gdp gdp_lag health health_lag
##    <chr>     <date>      <dbl> <int>    <dbl> <int>   <int>  <dbl>      <dbl>
##  1 Australia NA          NA    17065    0.220 16774   16591   1300       1224
##  2 Australia 1991-01-01  12.1  17284    0.223 17171   16774   1379       1300
##  3 Australia 1992-01-01  12.4  17495    0.226 17914   17171   1455       1379
##  4 Australia 1993-01-01  12.5  17667    0.228 18883   17914   1540       1455
##  5 Australia 1994-01-01  10.2  17855    0.231 19849   18883   1626       1540
##  6 Australia 1995-01-01  10.2  18072    0.233 21079   19849   1737       1626
##  7 Australia 1996-01-01  10.6  18311    0.237 21923   21079   1846       1737
##  8 Australia 1997-01-01  10.3  18518    0.239 22961   21923   1948       1846
##  9 Australia 1998-01-01  10.5  18711    0.242 24148   22961   2077       1948
## 10 Australia 1999-01-01   8.67 18926    0.244 25445   24148   2231       2077
## # … with 228 more rows, and 12 more variables: pubhealth <dbl>, roads <dbl>,
## #   cerebvas <int>, assault <int>, external <int>, txp_pop <dbl>, world <chr>,
## #   opt <chr>, consent_law <chr>, consent_practice <chr>, consistent <chr>,
## #   ccode <chr>

# Donor rate over time, one panel per country. Panels are ordered from the
# highest to the lowest mean donor rate (hence the minus sign), ignoring
# missing values.
ggplot(organdata, aes(x = year, y = donors, group = country)) +
  geom_line() +
  facet_wrap(~ reorder(country, -donors, FUN = mean, na.rm = TRUE))
## Warning: Removed 34 row(s) containing missing values (geom_path).

# Distribution of donor rates by country, Spain excluded. Countries are
# ordered by mean donor rate and stacked in one panel per `world` value;
# scales = "free_y" lets each panel list only its own countries.
ggplot(filter(organdata, country != "Spain"),
       aes(x = donors,
           y = reorder(country, donors, FUN = mean, na.rm = TRUE))) +
  geom_boxplot() +
  facet_wrap(~ world, scales = "free_y", ncol = 1) +
  labs(x = "Donors", y = NULL)
## Warning: Removed 32 rows containing non-finite values (stat_boxplot).