Example 12: Network Data

Setup

Code
library(here)       # manage file paths
here() starts at /Users/kjhealy/Documents/courses/vsd
Code
library(socviz)     # data and some useful functions
library(tidyverse)  # your friend and mine
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Code
library(tidygraph) # tidy management of relational data

Attaching package: 'tidygraph'

The following object is masked from 'package:stats':

    filter
Code
library(ggraph)   # geoms for drawing graphs

#remotes::install_github("kjhealy/kjhnet")
library(kjhnet)   # some network datasets

Sometimes, we have data that doesn’t look particularly relational at first glance, but we can make it so.

Code
set_graph_style(family = "Myriad Pro SemiCondensed")

library(ggraph)
library(tidygraph)

set_graph_style(family = "Myriad Pro SemiCondensed")

Warren’s socjobs data

Code
socjobs
# A tibble: 343 × 7
   sex   job_yr job_dept   phd_yr phd_dept  top25phd   gap
   <chr>  <dbl> <chr>       <dbl> <chr>     <chr>    <dbl>
 1 F       2017 NYU          2017 Duke      Yes          0
 2 F       2017 Columbia     2016 Princeton Yes          1
 3 F       2008 Minnesota    2006 Sussex    No           2
 4 M       2013 Arizona      2012 UCSF      No           1
 5 M       2015 Penn State   2012 Arizona   Yes          3
 6 M       1997 Indiana      1997 UNC       Yes          0
 7 F       1991 Harvard      1990 Stanford  Yes          1
 8 F       2010 Yale         2008 UCLA      Yes          2
 9 M       2013 Minnesota    2013 Irvine    No           0
10 M       2013 Cornell      2011 Wisconsin Yes          2
# ℹ 333 more rows

Let’s clean it up a little:

Code
clean_dept_names <- function(x) {
    x <- str_replace(x, "California-", "")
    x <- str_replace(x, "SUNY-", "")
    x <- str_replace(x, "SUNY-", "")
    x <- str_replace(x, "Illinois-Chicago", "UIC")
    x <- str_replace(x, "U of Illinois", "UIUC")
    x <- str_replace(x, "U of Pennsylvania", "U Penn")
    x <- str_replace(x, "San Francisco", "UCSF")
    x <- str_replace(x, "North Carolina", "UNC")
    x <- str_replace(x, "N.C. State", "NC State")
    x
}

jobnet <- socjobs |> 
  # Texas isn't in the data properly
  filter(top25phd == "Yes", phd_dept != "Texas", job_dept != "Texas") |>
  select(phd_dept, job_dept) |>
  mutate(phd_dept = clean_dept_names(phd_dept),
         job_dept = clean_dept_names(job_dept)) |>
  group_by(phd_dept, job_dept) |>
  tally() 

At this point we have tallied jobs by PhD department and First Job Department:

Code
jobnet
# A tibble: 182 × 3
# Groups:   phd_dept [22]
   phd_dept job_dept         n
   <chr>    <chr>        <int>
 1 Arizona  Cornell          2
 2 Arizona  Duke             1
 3 Arizona  Indiana          1
 4 Arizona  Northwestern     1
 5 Arizona  Ohio State       2
 6 Arizona  Penn State       1
 7 Arizona  Stanford         1
 8 Berkeley Arizona          3
 9 Berkeley Chicago          2
10 Berkeley Columbia         3
# ℹ 172 more rows

Next we rename n to weight and make a scaled weight (in case we need it), and convert it to a network format:

Code
jobnet <- jobnet |> 
  select(from = phd_dept, to = job_dept, weight = n) |>
  mutate(scale_weight = scale(weight, center = FALSE)) |> 
  filter(weight > 0) |> 
  as_tbl_graph() 

jobnet
# A tbl_graph: 22 nodes and 182 edges
#
# A directed multigraph with 1 component
#
# Node Data: 22 × 1 (active)
   name     
   <chr>    
 1 Arizona  
 2 Berkeley 
 3 Chicago  
 4 Columbia 
 5 Cornell  
 6 Duke     
 7 Harvard  
 8 Indiana  
 9 Michigan 
10 Minnesota
# ℹ 12 more rows
#
# Edge Data: 182 × 4
   from    to weight scale_weight
  <int> <int>  <int>        <dbl>
1     1     5      2        1.36 
2     1     6      1        0.679
3     1     8      1        0.679
# ℹ 179 more rows

Now we can calculate centrality scores and draw a graph:

Code
jobnet |> 
  mutate(centrality = centrality_eigen(weights = weight)) |>
  ggraph(layout = "graphopt") + 
  geom_edge_fan(aes(alpha = weight),
                arrow = arrow(length = unit(3, "mm"), type = "closed"), 
                start_cap = circle(2, "mm"),
                end_cap = circle(8, "mm")) + 
  geom_node_label(aes(label = name)) + 
  scale_edge_alpha_continuous(name = "N Hires") + 
  labs(title = "New Assistant Professor Exchanges within the Top 25 Sociology Departments", 
       subtitle = "New Ph.D. hires, 1990-2017. Data show absolute number of within-network hires only.\nHires to and from outside the Top 25 are not shown. No adjustments are made for departmental or cohort sizes.", 
       caption = "Data: Warren (2019). Data exclude UT Austin.") + 
  theme_graph(base_family = "Myriad Pro SemiCondensed") + 
    theme(legend.position = "top") 
Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.