Visualizing Social Data
Toggle Dark/Light/Auto mode Toggle Dark/Light/Auto mode Toggle Dark/Light/Auto mode

Code

Collider Bias

library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

## ✔ ggplot2 3.3.5     ✔ purrr   0.3.4
## ✔ tibble  3.1.6     ✔ dplyr   1.0.8
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1

## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::edition_get()   masks testthat::edition_get()
## ✖ dplyr::filter()        masks stats::filter()
## ✖ purrr::is_null()       masks testthat::is_null()
## ✖ dplyr::lag()           masks stats::lag()
## ✖ readr::local_edition() masks testthat::local_edition()
## ✖ dplyr::matches()       masks tidyr::matches(), testthat::matches()
library(socviz)
## 
## Attaching package: 'socviz'

## The following object is masked from 'package:kjhutils':
## 
##     %nin%
## Use of `quantile()` to create 99th percentile cutoff
df <- tibble(looks = rnorm(10000),
             talent = rnorm(10000), 
             total = looks + talent, 
             hollywood = total > quantile(total, 0.99))

df %>% 
  ggplot(mapping = aes(x = talent,
                       y = looks)) + 
  geom_point(alpha = 0.1) + 
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
df %>% 
  ggplot(mapping = aes(x = talent, 
                       y = looks, 
                       color = hollywood)) + 
  geom_point(alpha = 0.1) + 
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

Simpson’s Paradox

library(palmerpenguins)


ggplot(data = penguins,
       aes(x = bill_length_mm,
           y = bill_depth_mm)) +
  geom_point(size = 3,
             alpha = 0.8) +
  geom_smooth(method = "lm", se = FALSE) +
  theme_minimal()
## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 2 rows containing missing values (geom_point).
ggplot(data = penguins,
       aes(x = bill_length_mm,
           y = bill_depth_mm,
           group = species)) +
  geom_point(aes(color = species,
                 shape = species),
             size = 3,
             alpha = 0.8) +
  geom_smooth(method = "lm", se = FALSE, aes(color = species)) +
  theme_minimal()
## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Removed 2 rows containing missing values (geom_point).
ggplot(data = penguins,
       aes(x = bill_length_mm,
           y = bill_depth_mm)) +
  geom_point(size = 3,
             alpha = 0.8) +
  geom_smooth(method = "lm", se = FALSE) +
  theme_minimal()
## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Removed 2 rows containing missing values (geom_point).

Naniar

library(naniar)
library(visdat)

organdata
## # A tibble: 238 × 21
##    country   year       donors   pop pop_dens   gdp gdp_lag health health_lag
##    <chr>     <date>      <dbl> <int>    <dbl> <int>   <int>  <dbl>      <dbl>
##  1 Australia NA          NA    17065    0.220 16774   16591   1300       1224
##  2 Australia 1991-01-01  12.1  17284    0.223 17171   16774   1379       1300
##  3 Australia 1992-01-01  12.4  17495    0.226 17914   17171   1455       1379
##  4 Australia 1993-01-01  12.5  17667    0.228 18883   17914   1540       1455
##  5 Australia 1994-01-01  10.2  17855    0.231 19849   18883   1626       1540
##  6 Australia 1995-01-01  10.2  18072    0.233 21079   19849   1737       1626
##  7 Australia 1996-01-01  10.6  18311    0.237 21923   21079   1846       1737
##  8 Australia 1997-01-01  10.3  18518    0.239 22961   21923   1948       1846
##  9 Australia 1998-01-01  10.5  18711    0.242 24148   22961   2077       1948
## 10 Australia 1999-01-01   8.67 18926    0.244 25445   24148   2231       2077
## # … with 228 more rows, and 12 more variables: pubhealth <dbl>, roads <dbl>,
## #   cerebvas <int>, assault <int>, external <int>, txp_pop <dbl>, world <chr>,
## #   opt <chr>, consent_law <chr>, consent_practice <chr>, consistent <chr>,
## #   ccode <chr>
gg_miss_var(organdata)
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
vis_dat(organdata)
## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## Please use `gather()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
miss_var_summary(organdata)
## # A tibble: 21 × 3
##    variable  n_miss pct_miss
##    <chr>      <int>    <dbl>
##  1 year          34    14.3 
##  2 donors        34    14.3 
##  3 opt           28    11.8 
##  4 pubhealth     21     8.82
##  5 pop           17     7.14
##  6 pop_dens      17     7.14
##  7 gdp           17     7.14
##  8 roads         17     7.14
##  9 cerebvas      17     7.14
## 10 assault       17     7.14
## # … with 11 more rows
miss_case_summary(organdata)
## # A tibble: 238 × 3
##     case n_miss pct_miss
##    <int>  <int>    <dbl>
##  1    84     12     57.1
##  2   182     12     57.1
##  3   210     12     57.1
##  4    14     11     52.4
##  5    28     11     52.4
##  6    42     11     52.4
##  7    56     11     52.4
##  8    70     11     52.4
##  9    98     11     52.4
## 10   112     11     52.4
## # … with 228 more rows
organdata %>%
  select(consent_law, year, pubhealth, roads) %>%
  group_by(consent_law) %>%
  miss_var_summary()
## # A tibble: 6 × 4
## # Groups:   consent_law [2]
##   consent_law variable  n_miss pct_miss
##   <chr>       <chr>      <int>    <dbl>
## 1 Informed    year          16    14.3 
## 2 Informed    pubhealth      8     7.14
## 3 Informed    roads          8     7.14
## 4 Presumed    year          18    14.3 
## 5 Presumed    pubhealth     13    10.3 
## 6 Presumed    roads          9     7.14
organdata %>%
  select(consent_law, year, pubhealth, roads) %>%
  group_by(consent_law) %>%
  miss_var_summary()
## # A tibble: 6 × 4
## # Groups:   consent_law [2]
##   consent_law variable  n_miss pct_miss
##   <chr>       <chr>      <int>    <dbl>
## 1 Informed    year          16    14.3 
## 2 Informed    pubhealth      8     7.14
## 3 Informed    roads          8     7.14
## 4 Presumed    year          18    14.3 
## 5 Presumed    pubhealth     13    10.3 
## 6 Presumed    roads          9     7.14

Upset plot

gg_miss_upset(organdata)

Missing data in the Congressional dataset

# install_github("kjhealy/congress")
library(congress)

gg_miss_var(congress)
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
gg_miss_upset(congress)