Visualizing Social Data
Toggle Dark/Light/Auto mode Toggle Dark/Light/Auto mode Toggle Dark/Light/Auto mode

Code

Super Bowl Stuff

This material goes beyond what we’ve covered in class, although a lot of it is, at its core, the same process of taking data and then selecting, filtering, grouping, and summarizing pieces of it until it’s handed off to a ggplot call of greater or lesser complexity. Many of the examples here are borrowed from Thomas Mock, who does excellent work on this and related R stuff.

Get the data

options(repos = c(
  nflverse = 'https://nflverse.r-universe.dev',
  CRAN = 'https://cloud.r-project.org'))

# Install some packages as needed
#install.packages("nflfastR")
#install.packages("ggtext") # for element_markdown below
#install.packages("hexbin") # for the QB plot
#install.packages("colorspace")

library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

## ✔ ggplot2 3.3.5     ✔ purrr   0.3.4
## ✔ tibble  3.1.6     ✔ dplyr   1.0.8
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1

## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::edition_get()   masks testthat::edition_get()
## ✖ dplyr::filter()        masks stats::filter()
## ✖ purrr::is_null()       masks testthat::is_null()
## ✖ dplyr::lag()           masks stats::lag()
## ✖ readr::local_edition() masks testthat::local_edition()
## ✖ dplyr::matches()       masks tidyr::matches(), testthat::matches()
library(ggtext)
library(colorspace)

library(nflfastR)
## Warning: package 'nflfastR' was built under R version 4.1.2
## 2021 Play-by-plays
df <- load_pbp(2021)

Filter, group, and plot

df %>%
  filter(wp > .20 & wp < .80 & down <= 2 & qtr <= 2 & 
           half_seconds_remaining > 120) %>%
  group_by(posteam) %>%
  summarize(mean_pass = mean(pass), 
            plays = n()) %>%
  arrange(mean_pass) %>% 
  ggplot(mapping = aes(x = reorder(posteam,-mean_pass), y = mean_pass)) +
  geom_text(aes(label = posteam)) + 
  theme_minimal()

Chase vs Kupp

df %>%
  filter(receiver %in% c("J.Chase", "C.Kupp")) %>%
  group_by(week, receiver) %>%
  summarize(mean_epa = mean(epa, na.rm = TRUE)) %>% 
  ggplot(mapping = aes(x = week, y = mean_epa, color = receiver)) +
  geom_line(size = 1) + 
  geom_hline(yintercept = 0, size = 1, color = "black") +
  labs(
    x = "\nGame Date",
    y = "EPA (Average)",
    title = "Quick comparison of Chase vs Kupp across the 2021 season",
    caption = "Data: @nflfastR"
  ) + 
  theme_minimal()
## `summarise()` has grouped output by 'week'. You can override using the `.groups`
## argument.

Polish that plot

tm_colors <- nflfastR::teams_colors_logos %>% 
  filter(team_abbr %in% c("CIN", "LA")) 

cin_color <- tm_colors %>%
  filter(team_abbr == "CIN") %>%
  pull(team_color) 

la_color <- tm_colors %>%
  filter(team_abbr == "LA") %>%
  pull(team_color)


df %>%
  filter(receiver %in% c("J.Chase", "C.Kupp")) %>%
  group_by(week, receiver) %>%
  summarize(mean_epa = mean(epa, na.rm = TRUE)) %>% 
  ggplot(mapping = aes(x = week, y = mean_epa, 
                       color = if_else(receiver == "J.Chase", 
                                       cin_color, la_color))) +
  geom_line(size = 1) + 
  geom_hline(yintercept = 0, size = 1, color = "black") +
  scale_color_identity() +
  scale_x_continuous(breaks = seq(1, 21, 1)) +
  scale_y_continuous(breaks = seq(-1, 2, 0.5)) +
  labs(
    x = NULL,
    y = "EPA (Average)",
    title = glue::glue("Quick comparison of <span style='color:{cin_color}'>**Chase**</span> vs <span style='color:{la_color}'>**Kupp**</span> across the 2021 season"),
    caption = "Data: @nflfastR"
  ) + 
  theme_minimal() + 
  theme(plot.title = element_markdown())
## `summarise()` has grouped output by 'week'. You can override using the `.groups`
## argument.

EPA Play

df %>%
  filter(pass == 1, !is.na(posteam)) %>% 
  group_by(posteam) %>% 
  summarize(
    n = n(),
    epa_per_db = sum(epa, na.rm = TRUE) / n,
    success_rate = sum(epa, na.rm = TRUE) / n
  ) %>% 
  ggplot(mapping = aes(x = posteam, y = epa_per_db)) +
  geom_col()
df %>%
  filter(pass == 1, !is.na(posteam)) %>% 
  group_by(posteam) %>% 
  summarize(
    n = n(),
    epa_per_db = sum(epa, na.rm = TRUE) / n,
    success_rate = sum(epa, na.rm = TRUE) / n
  ) %>% 
  ggplot(mapping = aes(x = epa_per_db, y = reorder(posteam, epa_per_db))) +
  geom_col(aes(fill = if_else(epa_per_db >= 0, "#2c7bb6", "#d7181c"))) +
  scale_fill_identity() +
  geom_hline(yintercept = 0) +
  scale_x_continuous(breaks = seq(-0.2, 0.3, 0.1)) +
  labs(
    x = "",
    y = "EPA per Dropback",
    title = "The majority of teams had positive EPA/dropback",
    caption = "Data: @nflfastR"
  ) + 
  theme_minimal() + 
  theme(panel.grid.major.y = element_blank()) 
df %>%
  filter(play_type == "pass", !is.na(air_yards)) %>%
  filter(posteam %in% c("CIN", "LA")) %>%
  ggplot(aes(x = air_yards, fill = posteam)) +
  geom_histogram(binwidth = 2, alpha = 0.9) +
  scale_fill_manual(values = c(cin_color, la_color)) +
  geom_hline(yintercept = 0, size = 1) +
  guides(
    fill = guide_legend(
      label = TRUE, title = "", label.position = "left",
      direction = "vertical",
      label.theme = element_text(size = 20)
      )
    ) +
  scale_x_continuous(breaks = seq(-10, 60, 10)) +
  labs(
    x = "\nAir Yards",
    y = "Count",
    title = "Throwing Passes",
    caption = "Data: @nflfastR"
  ) + 
  theme_minimal() + 
  theme(legend.position = c(0.5, 0.9)) 
df %>%
  filter(play_type == "pass", !is.na(air_yards)) %>%
  filter(posteam %in% c("CIN", "LA")) %>%
  ggplot(aes(x = air_yards, y = ..scaled.., fill = posteam)) +
  geom_density(alpha = 0.4) +
  scale_fill_manual(values = c(cin_color, la_color)) +
  geom_hline(yintercept = 0, size = 1) +
  guides(
    fill = guide_legend(
      label = TRUE, title = "", label.position = "left",
      direction = "vertical",
      label.theme = element_text(size = 20)
      )
    ) +
  scale_x_continuous(breaks = seq(-10, 60, 10)) +
  labs(
    x = "\nAir Yards",
    y = "Count",
    title = "Throwing Passes",
    caption = "Data: @nflfastR"
  ) + 
  theme_minimal() + 
  theme(legend.position = c(0.5, 0.9)) 

Quarterback comparison

Data

dat_url <- "https://raw.githubusercontent.com/ArrowheadAnalytics/next-gen-scrapy-2.0/master/pass_and_game_data.csv"

Functions to build the field

pm_df <- read_csv(dat_url) %>%
  na.omit() 
## Rows: 68331 Columns: 42
## ── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (17): posteam, name, pass_type, game_id, game_type, weekday, away_team,...
## dbl  (23): week, x, y, season, away_score, home_score, result, total, overti...
## date  (1): gameday
## time  (1): gametime
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
not_div_5 <- function(x) {
  # select only elements of the vector not divisible by 5
  x[x %% 5 != 0]
}

center_df <- tibble(
  x =  c(rep(-3.1, 60), rep(3.1, 60)),
  y =  seq(-14, 59, 1) %>% rep(2) %>% not_div_5(),
  text = "--"
)

# line labels
annotate_df <- tibble(
  x = c(12.88, -12.88) %>% rep(each = 5),
  y =  seq(10, 50, 10) %>% rep(2),
  text = seq(10, 50, 10) %>% rep(2) %>% str_replace("(.)(.)", "\\1 \\2"),
  rotation = c(90, 270) %>% rep(each = 5)
)

# yardlines
yardline_df <- tibble(
  y =  seq(-15, 60, 5),
  yend = seq(-15, 60, 5),
  x =  rep(-56 / 2, 16),
  xend = rep(56 / 2, 16)
)

# sidelines
sideline_df <- tibble(
  y =  c(-15.15, -15.15),
  yend = c(60.15, 60.15),
  x =  c(-56 / 2, 56 / 2),
  xend = c(-56 / 2, 56 / 2)
)

## Amalgamate into a single call
add_field <- function() {
  list(
    coord_cartesian(
      xlim = c(-53.333 / 2, 53.333 / 2),
      ylim = c(-15, 60)
    ),
    geom_text(
      data = annotate_df, aes(x = x, y = y, label = text, angle = rotation),
      color = front_col, size = 8
    ),
    geom_segment(
      data = yardline_df, color = front_col, size = 1,
      aes(x = x, y = y, xend = xend, yend = yend)
    ),
    geom_segment(
      x = -56 / 2, y = 0, xend = 56 / 2, yend = 0,
      color = "blue", size = 1, alpha = 0.5
    ),
    geom_segment(
      data = sideline_df, color = front_col, size = 2,
      aes(x = x, y = y, xend = xend, yend = yend)
    ),
    geom_text(
      data = center_df,
      aes(label = text), color = front_col, vjust = 0.32
    ),
    theme_void(),
    theme(
      strip.text = element_text(size = 20, color = front_col),
      plot.background = element_rect(fill = back_col, color = NA),
      legend.position = "none",
      plot.margin = unit(c(2, 1, 0.5, 1), unit = "cm"),
      plot.caption = element_text(color = front_col),
      plot.title = element_text(color = front_col),
      plot.subtitle = element_text(color = front_col),
      panel.background = element_rect(fill = back_col, color = NA),
      panel.border = element_blank()
    )
  )
}

Draw the plot

back_col <- "#1B4434"
front_col <- "white"


my_colors <- sequential_hcl(10, palette = "Oranges")

pm_df %>% 
  filter(str_detect(name, c("Mahomes|Derek Carr"))) %>%
  ggplot(aes(x = x, y = y)) +
  add_field() +
  geom_density_2d_filled(
    aes(fill = ..level.., 
        color = ..level.., 
        alpha = 0.7),
    contour_var = "ndensity", # normalize to each QBs total passes
    breaks = seq(0.1, 1.0, length.out = 10) # drop the lowest passes
  ) +
  scale_fill_manual(values = my_colors) + 
  scale_color_manual(values = my_colors) +
  facet_wrap(~ name)