Visualizing Social Data
Toggle Dark/Light/Auto mode Toggle Dark/Light/Auto mode Toggle Dark/Light/Auto mode

Code

Polishing and Case Studies

library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

## ✔ ggplot2 3.3.5     ✔ purrr   0.3.4
## ✔ tibble  3.1.6     ✔ dplyr   1.0.8
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1

## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::edition_get()   masks testthat::edition_get()
## ✖ dplyr::filter()        masks stats::filter()
## ✖ purrr::is_null()       masks testthat::is_null()
## ✖ dplyr::lag()           masks stats::lag()
## ✖ readr::local_edition() masks testthat::local_edition()
## ✖ dplyr::matches()       masks tidyr::matches(), testthat::matches()
library(socviz)
## 
## Attaching package: 'socviz'

## The following object is masked from 'package:kjhutils':
## 
##     %nin%
library(ggrepel)
library(colorspace)

County election data

head(county_data)
##      id           name state census_region      pop_dens   pop_dens4
## 1     0           <NA>  <NA>          <NA> [   50,  100) [ 45,  118)
## 2 01000              1    AL         South [   50,  100) [ 45,  118)
## 3 01001 Autauga County    AL         South [   50,  100) [ 45,  118)
## 4 01003 Baldwin County    AL         South [  100,  500) [118,71672]
## 5 01005 Barbour County    AL         South [   10,   50) [ 17,   45)
## 6 01007    Bibb County    AL         South [   10,   50) [ 17,   45)
##     pop_dens6   pct_black       pop female white black travel_time  land_area
## 1 [ 82,  215) [10.0,15.0) 318857056   50.8  77.7  13.2        25.5 3531905.43
## 2 [ 82,  215) [25.0,50.0)   4849377   51.5  69.8  26.6        24.2   50645.33
## 3 [ 82,  215) [15.0,25.0)     55395   51.5  78.1  18.4        26.2     594.44
## 4 [ 82,  215) [ 5.0,10.0)    200111   51.2  87.3   9.5        25.9    1589.78
## 5 [ 25,   45) [25.0,50.0)     26887   46.5  50.2  47.6        24.6     884.88
## 6 [ 25,   45) [15.0,25.0)     22506   46.0  76.3  22.1        27.6     622.58
##   hh_income su_gun4 su_gun6 fips votes_dem_2016 votes_gop_2016 total_votes_2016
## 1     53046    <NA>    <NA>    0             NA             NA               NA
## 2     43253    <NA>    <NA> 1000             NA             NA               NA
## 3     53682 [11,54] [10,12) 1001           5908          18110            24661
## 4     50221 [11,54] [10,12) 1003          18409          72780            94090
## 5     32911 [ 5, 8) [ 7, 8) 1005           4848           5431            10390
## 6     36447 [11,54] [10,12) 1007           1874           6733             8748
##   per_dem_2016 per_gop_2016 diff_2016 per_dem_2012 per_gop_2012 diff_2012
## 1           NA           NA        NA           NA           NA        NA
## 2           NA           NA        NA           NA           NA        NA
## 3    0.2395685    0.7343579     12202    0.2657577    0.7263374     11012
## 4    0.1956531    0.7735147     54371    0.2156657    0.7738975     47443
## 5    0.4666025    0.5227141       583    0.5125229    0.4833755       334
## 6    0.2142204    0.7696616      4859    0.2621857    0.7306638      3931
##   winner partywinner16 winner12 partywinner12 flipped
## 1   <NA>          <NA>     <NA>          <NA>    <NA>
## 2   <NA>          <NA>     <NA>          <NA>    <NA>
## 3  Trump    Republican   Romney    Republican      No
## 4  Trump    Republican   Romney    Republican      No
## 5  Trump    Republican    Obama      Democrat     Yes
## 6  Trump    Republican   Romney    Republican      No
# Democrat Blue and Republican Red
party_colors <- c("#2E74C0", "#CB454A")

p0 <- ggplot(data = subset(county_data,
                           flipped == "No"),
             mapping = aes(x = pop,
                           y = black/100))

p1 <- p0 + geom_point(alpha = 0.15, color = "gray20") +
    scale_x_log10(labels=scales::comma) 

p1
p2 <- p1 + geom_point(data = subset(county_data,
                                    flipped == "Yes"),
                      mapping = aes(x = pop, y = black/100,
                                    color = partywinner16)) +
    scale_color_manual(values = party_colors)

p2
p3 <- p2 + scale_y_continuous(labels=scales::percent) +
    labs(color = "County flipped to ... ",
         x = "County Population (log scale)",
         y = "Percent Black Population",
         title = "Flipped counties, 2016",
         caption = "Counties in gray did not flip.")

p3
p4 <- p3 + geom_text_repel(data = subset(county_data,
                                      flipped == "Yes" &
                                      black  > 25),
                           mapping = aes(x = pop,
                                   y = black/100,
                                   label = state), size = 2)

p4 + theme_gray() +
    theme(legend.position="top")

Debt plots

studebt
## # A tibble: 16 × 4
##    Debt      type        pct Debtrc   
##    <ord>     <fct>     <int> <ord>    
##  1 Under $5  Borrowers    20 Under $5 
##  2 $5-$10    Borrowers    17 $5-$10   
##  3 $10-$25   Borrowers    28 $10-$25  
##  4 $25-$50   Borrowers    19 $25-$50  
##  5 $50-$75   Borrowers     8 $50-$75  
##  6 $75-$100  Borrowers     3 $75-$100 
##  7 $100-$200 Borrowers     4 $100-$200
##  8 Over $200 Borrowers     1 Over $200
##  9 Under $5  Balances      2 Under $5 
## 10 $5-$10    Balances      4 $5-$10   
## 11 $10-$25   Balances     15 $10-$25  
## 12 $25-$50   Balances     23 $25-$50  
## 13 $50-$75   Balances     16 $50-$75  
## 14 $75-$100  Balances     10 $75-$100 
## 15 $100-$200 Balances     19 $100-$200
## 16 Over $200 Balances     11 Over $200
p_ylab <- "Amount Owed, in thousands of Dollars"
p_title <- "Outstanding Student Loans"
p_subtitle <- "44 million borrowers owe a total of $1.3 trillion"
p_caption <- "Source: FRB NY"

f_labs <- c(Borrowers = "Percent of\nall Borrowers",
            Balances = "Percent of\nall Balances")

p <- ggplot(data = studebt,
            mapping = aes(x = pct/100, 
                          y = Debt,
                          fill = type))
p + geom_col() +
    scale_fill_brewer(type = "qual", palette = "Dark2") +
    scale_x_continuous(labels = scales::percent) +
    guides(fill = "none") +
    theme(strip.text.x = element_text(face = "bold")) +
    labs(x = NULL, y = p_ylab,
      caption = p_caption,
      title = p_title,
      subtitle = p_subtitle) +
    facet_grid(~ type, labeller = as_labeller(f_labs)) 
p <- ggplot(data = studebt, 
            mapping = aes(x = pct/100, y = type, fill = Debtrc))
p + geom_col(color = "gray80") +
  scale_x_continuous(labels = scales::percent) +
  scale_y_discrete(labels = as_labeller(f_labs)) +
  scale_fill_viridis_d() +
  guides(fill = guide_legend(reverse = TRUE,
                             title.position = "top",
                             label.position = "bottom",
                             keywidth = 3,
                             nrow = 1)) +
  labs(x = NULL, y = NULL,
       fill = "Amount Owed, in thousands of dollars",
       caption = p_caption,
       title = p_title,
       subtitle = p_subtitle) +
  theme(legend.position = "bottom",
        axis.text.x = element_text(face = "bold", hjust = 1, size = 12),
        axis.ticks.length = unit(0, "cm"),
        panel.grid.major.x = element_blank()) 

okboomer

# drat::addRepo("kjhealy")
# install.packages("demog")
library(demog)
okboomer
## # A tibble: 1,644 × 12
##     year month n_days births total_pop births_pct births_pct_day date      
##    <dbl> <dbl>  <dbl>  <dbl>     <dbl>      <dbl>          <dbl> <date>    
##  1  1938     1     31  51820  41215000    0.00126           40.6 1938-01-01
##  2  1938     2     28  47421  41215000    0.00115           41.1 1938-02-01
##  3  1938     3     31  54887  41215000    0.00133           43.0 1938-03-01
##  4  1938     4     30  54623  41215000    0.00133           44.2 1938-04-01
##  5  1938     5     31  56853  41215000    0.00138           44.5 1938-05-01
##  6  1938     6     30  53145  41215000    0.00129           43.0 1938-06-01
##  7  1938     7     31  53214  41215000    0.00129           41.6 1938-07-01
##  8  1938     8     31  50444  41215000    0.00122           39.5 1938-08-01
##  9  1938     9     30  50545  41215000    0.00123           40.9 1938-09-01
## 10  1938    10     31  50079  41215000    0.00122           39.2 1938-10-01
## # … with 1,634 more rows, and 4 more variables: seasonal <dbl>, trend <dbl>,
## #   remainder <dbl>, country <chr>
okboomer %>%
    filter(country == "United States") %>%
    ggplot(aes(x = date, y = births_pct_day)) +
    geom_line(size = 0.5) +
    labs(x = "Year",
         y = "Average daily births per million") 
okboomer_p <- okboomer %>%
    mutate(year_fct = factor(year,  
                             levels = unique(year), 
                             ordered = TRUE),
           month_fct = factor(month,
                              levels = rev(c(1:12)),
                              labels = rev(c("Jan", "Feb", "Mar", "Apr",
                                    "May", "Jun", "Jul", "Aug",
                                    "Sep", "Oct", "Nov", "Dec")),
                              ordered = TRUE)) %>%
    select(year, month, 
           year_fct, month_fct, everything())

okboomer_p
## # A tibble: 1,644 × 14
##     year month year_fct month_fct n_days births total_pop births_pct
##    <dbl> <dbl> <ord>    <ord>      <dbl>  <dbl>     <dbl>      <dbl>
##  1  1938     1 1938     Jan           31  51820  41215000    0.00126
##  2  1938     2 1938     Feb           28  47421  41215000    0.00115
##  3  1938     3 1938     Mar           31  54887  41215000    0.00133
##  4  1938     4 1938     Apr           30  54623  41215000    0.00133
##  5  1938     5 1938     May           31  56853  41215000    0.00138
##  6  1938     6 1938     Jun           30  53145  41215000    0.00129
##  7  1938     7 1938     Jul           31  53214  41215000    0.00129
##  8  1938     8 1938     Aug           31  50444  41215000    0.00122
##  9  1938     9 1938     Sep           30  50545  41215000    0.00123
## 10  1938    10 1938     Oct           31  50079  41215000    0.00122
## # … with 1,634 more rows, and 6 more variables: births_pct_day <dbl>,
## #   date <date>, seasonal <dbl>, trend <dbl>, remainder <dbl>, country <chr>
okboomer_p %>%
    filter(country == "United States") %>%
    ggplot(aes(x = year_fct, y = month_fct)) +
    geom_tile(mapping = aes(fill = births_pct_day), 
              color = "white") + 
   scale_x_discrete(breaks = seq(1940, 2010, 5)) +    
   scale_y_discrete(position = "top") +              
   scale_fill_viridis_c(option = "B") + 
    labs(x = NULL, y = NULL, fill = NULL, title = "Monthly Birth Rates",
         subtitle = "Average births per million people per day.",
         caption = "Data: US Census Bureau.")
## Warning: Position guide is perpendicular to the intended axis. Did you mean to
## specify a different guide `position`?

Beeplot

# drat::addRepo("kjhealy")
# install.packages("cavax")
library(cavax)

# install.packages("ggbeeswarm")
library(ggbeeswarm)
library(RColorBrewer)
cavax
## # A tibble: 7,032 × 13
##      code county name  type  district city  enrollment pbe_pct exempt med_exempt
##     <dbl> <chr>  <chr> <chr> <chr>    <chr>      <dbl>   <dbl>  <dbl>      <dbl>
##  1 1.10e5 ALAME… FAME… PUBL… ALAMEDA… NEWA…        109      13  12.8        0   
##  2 6.00e6 ALAME… COX … PUBL… ALAMEDA… OAKL…        115       1   0.87       0.87
##  3 6.00e6 ALAME… LAZE… PUBL… ALAMEDA… OAKL…         40       0   0          0   
##  4 1.24e5 ALAME… YU M… PUBL… ALAMEDA… OAKL…         52      10   9.62       0   
##  5 6.10e6 ALAME… AMEL… PUBL… ALAMEDA… ALAM…        128       2   1.56       0   
##  6 6.11e6 ALAME… BAY … PUBL… ALAMEDA… ALAM…         70       1   1.43       0   
##  7 6.09e6 ALAME… DONA… PUBL… ALAMEDA… ALAM…        100       3   3          0   
##  8 6.09e6 ALAME… EDIS… PUBL… ALAMEDA… ALAM…         70       1   1.43       0   
##  9 6.09e6 ALAME… FRAN… PUBL… ALAMEDA… ALAM…         95       1   1.05       1.05
## 10 6.09e6 ALAME… FRAN… PUBL… ALAMEDA… ALAM…         50       2   2          0   
## # … with 7,022 more rows, and 3 more variables: rel_exempt <dbl>, mwc <fct>,
## #   kind <fct>
aux_info <- cavax %>%  group_by(mwc) %>% 
  summarize(Schools=n(), Students=sum(enrollment, na.rm=TRUE)) %>% 
  na.omit()

aux_info$Summary <- paste(aux_info$Schools, " Schools enrolling\n", 
                          aux_info$Students, " Kindergarteners", sep="")

## Format the numbers with commas
aux_info$Summary2 <- paste(formatC(aux_info$Schools, format="d", 
                                   big.mark = ","),
                           " Schools\n",
                           formatC(aux_info$Students, format="d", 
                                   big.mark=","),
                           " Kindergarteners", sep="")

aux_info$School.labs <- c("Public", "Charter", "Private\nNon-Specific",
                          "Private\nChristian", "Private\nCatholic", 
                          "Private\nMontessori", "Private\nWaldorf", 
                          "Charter\nMontessori", "Public\nMontessori", 
                          "Private\nChristian\nMontessori",
                          "Private Jewish\nor Islamic")

aux_info
## # A tibble: 11 × 6
##    mwc                          Schools Students Summary    Summary2 School.labs
##    <fct>                          <int>    <dbl> <chr>      <chr>    <chr>      
##  1 Public                          5314   472802 "5314 Sch… "5,314 … "Public"   
##  2 Charter                          314    19863 "314 Scho… "314 Sc… "Charter"  
##  3 Private Non-Specific             591    16697 "591 Scho… "591 Sc… "Private\n…
##  4 Private Christian                336     8836 "336 Scho… "336 Sc… "Private\n…
##  5 Private Catholic                 334     9869 "334 Scho… "334 Sc… "Private\n…
##  6 Private Montessori                99     2112 "99 Schoo… "99 Sch… "Private\n…
##  7 Private Waldorf                   16      513 "16 Schoo… "16 Sch… "Private\n…
##  8 Charter Montessori                 5      227 "5 School… "5 Scho… "Charter\n…
##  9 Public Montessori                 11      706 "11 Schoo… "11 Sch… "Public\nM…
## 10 Private Christian Montessori       4       78 "4 School… "4 Scho… "Private\n…
## 11 Private Jewish/Islamic             8      237 "8 School… "8 Scho… "Private J…
## Force newlines for top annotation
addline_format <- function(x,...){
    gsub('\\s','\n',x)
}
make_bee_plot <- function(dat = cavax,
                          balpha = 0.3,
                          bwidth = 0.9,
                          varwidth = FALSE,
                          method = "quasirandom",
                          title = "Vaccination Exemption Rates in California Kindergartens",
                          subtitle = "Percent of Kindergarteners with a Personal Belief Exemption, by Type and Size of School."){
    theme <- theme_set(theme_minimal())
    theme <- theme_update(panel.grid.major.x=element_blank())

    colorCount <- length(levels(dat$mwc))
    getPalette <- colorRampPalette(brewer.pal(8, "Set2"))

    p <- ggplot(data = dat, mapping = aes(y = pbe_pct, 
                                          x = mwc, 
                                          size = enrollment, 
                                          fill = mwc))

    p1 <- p + geom_quasirandom(shape=21, alpha = balpha,
                               color="gray30",
                               method = method,
                               varwidth = varwidth,
                               bandwidth = bwidth,
                               position = position)

    p2 <- p1 + xlab("") + 
      ggtitle(title, subtitle = subtitle) + 
      guides(color = "none",
             shape= "none",
             fill= "none",
             size = guide_legend(override.aes =
                                   list(fill = "black"))) +
        scale_size(breaks=c(20, 40, 75, 100, 300),
                   range=c(1,10)) +
        scale_color_manual(values = getPalette(colorCount)) +
        labs(size="Number of Kindergarteners in each School") +
            ylab("Percent") +
        theme(legend.position = "bottom",
              axis.title.x = element_blank(),
              axis.text.x = element_blank())
 p2
}

# pdf(file="figures/pbe-by-school-type-bee.pdf", height=8, width=12, pointsize = 12)

aux_info_sub <- subset(aux_info, mwc %nin% c("Private Jewish/Islamic", "Private Christian Montessori"))
auxlen <- nrow(aux_info_sub)

p <- make_bee_plot(dat = subset(cavax, mwc %nin% c("Private Jewish/Islamic", "Private Christian Montessori")),
                   bwidth = 0.7,
                   method = "quasirandom")
p1 <- p + annotate("text", x = seq(1, auxlen, 1), y=-4,
                   label= aux_info_sub$Summary2, size=2)
p2 <- p1 + annotate("text", x=seq(1, auxlen, 1), y=-10, size=3,
                    fontface="bold",
                    label=addline_format(aux_info_sub$mwc))

p2
# dev.off()