Visualizing Social Data
Toggle Dark/Light/Auto mode Toggle Dark/Light/Auto mode Toggle Dark/Light/Auto mode

Code

Animation

library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

## ✔ ggplot2 3.3.5     ✔ purrr   0.3.4
## ✔ tibble  3.1.6     ✔ dplyr   1.0.8
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1

## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::edition_get()   masks testthat::edition_get()
## ✖ dplyr::filter()        masks stats::filter()
## ✖ purrr::is_null()       masks testthat::is_null()
## ✖ dplyr::lag()           masks stats::lag()
## ✖ readr::local_edition() masks testthat::local_edition()
## ✖ dplyr::matches()       masks tidyr::matches(), testthat::matches()
library(socviz)
## 
## Attaching package: 'socviz'

## The following object is masked from 'package:kjhutils':
## 
##     %nin%
library(gganimate)

library(gapminder)
library(uscenpops)
library(babynames)
library(congress)

Gapminder and doc examples

p <- ggplot(data = gapminder, 
            mapping = aes(x = gdpPercap, y=lifeExp, 
                          size = pop, color = country)) +
  geom_point(show.legend = FALSE, alpha = 0.7) +
  scale_color_viridis_d() +
  scale_size(range = c(2, 12)) +
  scale_x_log10() +
  labs(x = "GDP per capita", y = "Life expectancy")
p
g1 <- p + transition_time(year) +
  labs(title = "Year: {frame_time}")


anim_save("code/g1.gif", g1)
knitr::include_graphics("g1.gif")

g2 <- p + facet_wrap(~continent) +
  transition_time(year) +
  labs(title = "Year: {frame_time}")

anim_save("code/g2.gif", g2)
knitr::include_graphics("g2.gif")

g3 <- p + transition_time(year) +
  labs(title = "Year: {frame_time}") +
  view_follow(fixed_y = TRUE)

anim_save("code/g3.gif", g3)
knitr::include_graphics("g3.gif")

g4 <- p + transition_time(year) +
  labs(title = "Year: {frame_time}") +
  shadow_wake(wake_length = 0.1, alpha = FALSE)

anim_save("code/g4.gif", g4)
knitr::include_graphics("g4.gif")

Census examples

library(tidycensus)

la_age_hisp <- get_estimates(geography = "county",
                             product = "characteristics",
                             breakdown = c("SEX", "AGEGROUP", "HISP"),
                             breakdown_labels = TRUE,
                             state = "CA",
                             county = "Los Angeles")

la_age_hisp
## # A tibble: 210 × 6
##    GEOID NAME                              value SEX        AGEGROUP       HISP 
##    <chr> <chr>                             <dbl> <chr>      <fct>          <chr>
##  1 06037 Los Angeles County, California 10039107 Both sexes All ages       Both…
##  2 06037 Los Angeles County, California  5157137 Both sexes All ages       Non-…
##  3 06037 Los Angeles County, California  4881970 Both sexes All ages       Hisp…
##  4 06037 Los Angeles County, California   579856 Both sexes Age 0 to 4 ye… Both…
##  5 06037 Los Angeles County, California   236459 Both sexes Age 0 to 4 ye… Non-…
##  6 06037 Los Angeles County, California   343397 Both sexes Age 0 to 4 ye… Hisp…
##  7 06037 Los Angeles County, California   600191 Both sexes Age 5 to 9 ye… Both…
##  8 06037 Los Angeles County, California   229438 Both sexes Age 5 to 9 ye… Non-…
##  9 06037 Los Angeles County, California   370753 Both sexes Age 5 to 9 ye… Hisp…
## 10 06037 Los Angeles County, California   601542 Both sexes Age 10 to 14 … Both…
## # … with 200 more rows

Static population pyramid, LA County

compare <- la_age_hisp %>% 
  filter(str_detect(AGEGROUP, "^Age"),
         HISP != "Both Hispanic Origins",
         SEX != "Both sexes") %>%
  mutate(value = ifelse(SEX == "Male", -value, value))


compare
## # A tibble: 72 × 6
##    GEOID NAME                             value SEX   AGEGROUP           HISP   
##    <chr> <chr>                            <dbl> <chr> <fct>              <chr>  
##  1 06037 Los Angeles County, California -121579 Male  Age 0 to 4 years   Non-Hi…
##  2 06037 Los Angeles County, California -175409 Male  Age 0 to 4 years   Hispan…
##  3 06037 Los Angeles County, California -117983 Male  Age 5 to 9 years   Non-Hi…
##  4 06037 Los Angeles County, California -189418 Male  Age 5 to 9 years   Hispan…
##  5 06037 Los Angeles County, California -115019 Male  Age 10 to 14 years Non-Hi…
##  6 06037 Los Angeles County, California -193045 Male  Age 10 to 14 years Hispan…
##  7 06037 Los Angeles County, California -123649 Male  Age 15 to 19 years Non-Hi…
##  8 06037 Los Angeles County, California -189936 Male  Age 15 to 19 years Hispan…
##  9 06037 Los Angeles County, California -142239 Male  Age 20 to 24 years Non-Hi…
## 10 06037 Los Angeles County, California -198868 Male  Age 20 to 24 years Hispan…
## # … with 62 more rows
compare %>% 
  ggplot(mapping = aes(x = AGEGROUP, y = value, fill = SEX)) +
  geom_col(width = 1) +
  scale_y_continuous(labels = function(y) paste0(abs(y / 1000), "k")) +
  scale_x_discrete(labels = function(x) str_replace(x, "Age | years", "")) +
  scale_fill_manual(values = c("darkred", "navy")) +
  coord_flip() +
  facet_wrap(~ HISP) +
  labs(x = NULL,
       y = "Census Bureau population estimate",
       title = "Population structure by Hispanic origin",
       subtitle = "Los Angeles County, California",
       fill = NULL)

Dynamic population pyramid for the US

## drat::addRepo("kjhealy")
## install.packages("uscenpops")
  
library(uscenpops)

uscenpops
## # A tibble: 10,520 × 5
##     year   age     pop   male female
##    <int> <dbl>   <dbl>  <dbl>  <dbl>
##  1  1900     0 1811000 919000 892000
##  2  1900     1 1835000 928000 907000
##  3  1900     2 1846000 932000 914000
##  4  1900     3 1848000 932000 916000
##  5  1900     4 1841000 928000 913000
##  6  1900     5 1827000 921000 906000
##  7  1900     6 1806000 911000 895000
##  8  1900     7 1780000 899000 881000
##  9  1900     8 1750000 884000 866000
## 10  1900     9 1717000 868000 849000
## # … with 10,510 more rows
pop_pyr <- uscenpops %>% 
  select(year, age, male, female) %>%
  pivot_longer(male:female, names_to = "group", values_to = "count") %>%
  group_by(year, group) %>%
  mutate(total = sum(count), pct = (count/total)*100, base = 0) 

pop_pyr
## # A tibble: 21,040 × 7
## # Groups:   year, group [240]
##     year   age group   count    total   pct  base
##    <int> <dbl> <chr>   <dbl>    <dbl> <dbl> <dbl>
##  1  1900     0 male   919000 38867000  2.36     0
##  2  1900     0 female 892000 37227000  2.40     0
##  3  1900     1 male   928000 38867000  2.39     0
##  4  1900     1 female 907000 37227000  2.44     0
##  5  1900     2 male   932000 38867000  2.40     0
##  6  1900     2 female 914000 37227000  2.46     0
##  7  1900     3 male   932000 38867000  2.40     0
##  8  1900     3 female 916000 37227000  2.46     0
##  9  1900     4 male   928000 38867000  2.39     0
## 10  1900     4 female 913000 37227000  2.45     0
## # … with 21,030 more rows
uscenpops %>%
  group_by(year) %>%
  summarize(max_age = max(age)) %>%
  group_by(max_age) %>%
  summarize(minyr = min(year), 
            maxyr = max(year))
## # A tibble: 3 × 3
##   max_age minyr maxyr
##     <dbl> <int> <int>
## 1      75  1900  1939
## 2      85  1940  1979
## 3     100  1980  2019
## Make all the Male ages negative
pop_pyr$count[pop_pyr$group == "male"] <- -pop_pyr$count[pop_pyr$group == "male"]

pop_pyr
## # A tibble: 21,040 × 7
## # Groups:   year, group [240]
##     year   age group    count    total   pct  base
##    <int> <dbl> <chr>    <dbl>    <dbl> <dbl> <dbl>
##  1  1900     0 male   -919000 38867000  2.36     0
##  2  1900     0 female  892000 37227000  2.40     0
##  3  1900     1 male   -928000 38867000  2.39     0
##  4  1900     1 female  907000 37227000  2.44     0
##  5  1900     2 male   -932000 38867000  2.40     0
##  6  1900     2 female  914000 37227000  2.46     0
##  7  1900     3 male   -932000 38867000  2.40     0
##  8  1900     3 female  916000 37227000  2.46     0
##  9  1900     4 male   -928000 38867000  2.39     0
## 10  1900     4 female  913000 37227000  2.45     0
## # … with 21,030 more rows
mbreaks <- c("1M", "2M", "3M")
p <- ggplot(data = pop_pyr,
            mapping = aes(xmin = base,
                          xmax = count, 
                          y = age,
                          fill = group))
p + geom_ribbon(alpha = 0.5) +
    scale_x_continuous(labels = c(rev(mbreaks), "0", mbreaks), 
                       breaks = seq(-3e6, 3e6, 1e6), 
                       limits = c(-3e6, 3e6)) + 
    scale_y_continuous(breaks = seq(10, 100, 10)) +
    scale_fill_manual(values = my.colors("bly"), 
                      labels = c("Female", "Male")) + 
    guides(fill = guide_legend(reverse = TRUE))
p + geom_ribbon(alpha = 0.5) +
    scale_x_continuous(labels = c(rev(mbreaks), "0", mbreaks), 
                       breaks = seq(-3e6, 3e6, 1e6), 
                       limits = c(-3e6, 3e6)) + 
    scale_y_continuous(breaks = seq(10, 100, 10)) +
    scale_fill_manual(values = my.colors("bly"), 
                      labels = c("Female", "Male")) + 
    guides(fill = guide_legend(reverse = TRUE)) + 
  labs(x = "Age", y = "Number of People",
         title = "{frame_time}. Absolute Age/Sex Distribution of the U.S. Population",
         subtitle = "Age is top-coded at 75 until 1939, at 85 until 1979, and at 100 since 1980.",
         caption = "Kieran Healy / kieranhealy.org / Data: US Census Bureau.",
         fill = NULL) +
    theme(legend.position = "bottom",
          plot.title = element_text(size = rel(2), face = "bold"),
          plot.subtitle = element_text(size = rel(2)),
          plot.caption = element_text(size = rel(2)),
          axis.text.y = element_text(size = rel(3)),
          axis.text.x = element_text(size = rel(3)),
          axis.title.x = element_text(size = rel(3)),
          axis.title.y = element_text(size = rel(3)),
          legend.text = element_text(size = rel(3)))
p_pyr_count <- p + geom_ribbon(alpha = 0.5) +
    scale_x_continuous(labels = c(rev(mbreaks), "0", mbreaks), 
                       breaks = seq(-3e6, 3e6, 1e6), 
                       limits = c(-3e6, 3e6)) + 
    scale_y_continuous(breaks = seq(10, 100, 10)) +
    scale_fill_manual(values = my.colors("bly"), 
                      labels = c("Female", "Male")) + 
    guides(fill = guide_legend(reverse = TRUE)) +
    labs(x = "Age", y = "Number of People",
         title = "{frame_time}. Absolute Age/Sex Distribution of the U.S. Population",
         subtitle = "Age is top-coded at 75 until 1939, at 85 until 1979, and at 100 since 1980.",
         caption = "Kieran Healy / kieranhealy.org / Data: US Census Bureau.",
         fill = NULL) +
    theme(legend.position = "bottom",
          plot.title = element_text(size = rel(2), face = "bold"),
          plot.subtitle = element_text(size = rel(2)),
          plot.caption = element_text(size = rel(2)),
          axis.text.y = element_text(size = rel(3)),
          axis.text.x = element_text(size = rel(3)),
          axis.title.x = element_text(size = rel(3)),
          axis.title.y = element_text(size = rel(3)),
          legend.text = element_text(size = rel(3)))
p_pyr_anim <- p_pyr_count + 
  transition_time(as.integer(year)) + 
  ease_aes("cubic-in-out")

anim_save("code/pyramid.gif", p_pyr_anim, width = 1024, height = 1024)
knitr::include_graphics("pyramid.gif")