Example 05: Working with dplyr

Setup

Code
library(here)      # manage file paths
here() starts at /Users/kjhealy/Documents/courses/vsd
Code
library(socviz)    # data and some useful functions
library(tidyverse) # your friend and mine
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.0     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.1     ✔ tibble    3.2.0
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Core dplyr verbs

Code
gss_sm  
# A tibble: 2,867 × 32
    year    id ballot   age childs sibs  degree race  sex   region incom…¹ relig
   <dbl> <dbl> <labe> <dbl>  <dbl> <lab> <fct>  <fct> <fct> <fct>  <fct>   <fct>
 1  2016     1 1         47      3 2     Bache… White Male  New E… $17000… None 
 2  2016     2 2         61      0 3     High … White Male  New E… $50000… None 
 3  2016     3 3         72      2 3     Bache… White Male  New E… $75000… Cath…
 4  2016     4 1         43      4 3     High … White Fema… New E… $17000… Cath…
 5  2016     5 3         55      2 2     Gradu… White Fema… New E… $17000… None 
 6  2016     6 2         53      2 2     Junio… White Fema… New E… $60000… None 
 7  2016     7 1         50      2 2     High … White Male  New E… $17000… None 
 8  2016     8 3         23      3 6     High … Other Fema… Middl… $30000… Cath…
 9  2016     9 1         45      3 5     High … Black Male  Middl… $60000… Prot…
10  2016    10 3         71      4 1     Junio… White Male  Middl… $60000… None 
# … with 2,857 more rows, 20 more variables: marital <fct>, padeg <fct>,
#   madeg <fct>, partyid <fct>, polviews <fct>, happy <fct>, partners <fct>,
#   grass <fct>, zodiac <fct>, pres12 <labelled>, wtssall <dbl>,
#   income_rc <fct>, agegrp <fct>, ageq <fct>, siblings <fct>, kids <fct>,
#   religion <fct>, bigregion <fct>, partners_rc <fct>, obama <dbl>, and
#   abbreviated variable name ¹​income16

Select columns

Code
gss_sm |> 
  select(age, degree, bigregion, religion)
# A tibble: 2,867 × 4
     age degree         bigregion religion  
   <dbl> <fct>          <fct>     <fct>     
 1    47 Bachelor       Northeast None      
 2    61 High School    Northeast None      
 3    72 Bachelor       Northeast Catholic  
 4    43 High School    Northeast Catholic  
 5    55 Graduate       Northeast None      
 6    53 Junior College Northeast None      
 7    50 High School    Northeast None      
 8    23 High School    Northeast Catholic  
 9    45 High School    Northeast Protestant
10    71 Junior College Northeast None      
# … with 2,857 more rows

Filter rows

Code
gss_sm |> 
  filter(age > 45)
# A tibble: 1,612 × 32
    year    id ballot   age childs sibs  degree race  sex   region incom…¹ relig
   <dbl> <dbl> <labe> <dbl>  <dbl> <lab> <fct>  <fct> <fct> <fct>  <fct>   <fct>
 1  2016     1 1         47      3 2     Bache… White Male  New E… $17000… None 
 2  2016     2 2         61      0 3     High … White Male  New E… $50000… None 
 3  2016     3 3         72      2 3     Bache… White Male  New E… $75000… Cath…
 4  2016     5 3         55      2 2     Gradu… White Fema… New E… $17000… None 
 5  2016     6 2         53      2 2     Junio… White Fema… New E… $60000… None 
 6  2016     7 1         50      2 2     High … White Male  New E… $17000… None 
 7  2016    10 3         71      4 1     Junio… White Male  Middl… $60000… None 
 8  2016    12 1         86      4 4     High … White Fema… Middl… under … Prot…
 9  2016    14 3         60      5 6     High … Black Fema… Middl… $12500… Prot…
10  2016    15 2         76      7 0     Lt Hi… White Male  New E… $40000… Cath…
# … with 1,602 more rows, 20 more variables: marital <fct>, padeg <fct>,
#   madeg <fct>, partyid <fct>, polviews <fct>, happy <fct>, partners <fct>,
#   grass <fct>, zodiac <fct>, pres12 <labelled>, wtssall <dbl>,
#   income_rc <fct>, agegrp <fct>, ageq <fct>, siblings <fct>, kids <fct>,
#   religion <fct>, bigregion <fct>, partners_rc <fct>, obama <dbl>, and
#   abbreviated variable name ¹​income16
Code
gss_sm |> 
  filter(childs > 4 & race == "White")
# A tibble: 110 × 32
    year    id ballot   age childs sibs  degree race  sex   region incom…¹ relig
   <dbl> <dbl> <labe> <dbl>  <dbl> <lab> <fct>  <fct> <fct> <fct>  <fct>   <fct>
 1  2016    15 2         76      7 0     Lt Hi… White Male  New E… $40000… Cath…
 2  2016    17 3         56      6 3     High … White Male  New E… $50000… Prot…
 3  2016    26 2         76      8 7     Lt Hi… White Fema… Middl… $5 000… Prot…
 4  2016   142 3         65      5 2     Junio… White Fema… New E… <NA>    Prot…
 5  2016   177 1         56      5 3     Bache… White Male  Pacif… $13000… Cath…
 6  2016   190 2         51      7 9     Lt Hi… White Fema… Pacif… $15000… Cath…
 7  2016   216 3         77      8 9     High … White Male  Pacif… $60000… Prot…
 8  2016   351 3         52      5 4     High … White Fema… E. No… $35000… Cath…
 9  2016   365 1         51      5 5     Gradu… White Male  South… $17000… Cath…
10  2016   379 3         NA      7 2     High … White Male  South… $17000… Jewi…
# … with 100 more rows, 20 more variables: marital <fct>, padeg <fct>,
#   madeg <fct>, partyid <fct>, polviews <fct>, happy <fct>, partners <fct>,
#   grass <fct>, zodiac <fct>, pres12 <labelled>, wtssall <dbl>,
#   income_rc <fct>, agegrp <fct>, ageq <fct>, siblings <fct>, kids <fct>,
#   religion <fct>, bigregion <fct>, partners_rc <fct>, obama <dbl>, and
#   abbreviated variable name ¹​income16

Logically Group with group_by()

Code
gss_sm |> 
  group_by(bigregion)
# A tibble: 2,867 × 32
# Groups:   bigregion [4]
    year    id ballot   age childs sibs  degree race  sex   region incom…¹ relig
   <dbl> <dbl> <labe> <dbl>  <dbl> <lab> <fct>  <fct> <fct> <fct>  <fct>   <fct>
 1  2016     1 1         47      3 2     Bache… White Male  New E… $17000… None 
 2  2016     2 2         61      0 3     High … White Male  New E… $50000… None 
 3  2016     3 3         72      2 3     Bache… White Male  New E… $75000… Cath…
 4  2016     4 1         43      4 3     High … White Fema… New E… $17000… Cath…
 5  2016     5 3         55      2 2     Gradu… White Fema… New E… $17000… None 
 6  2016     6 2         53      2 2     Junio… White Fema… New E… $60000… None 
 7  2016     7 1         50      2 2     High … White Male  New E… $17000… None 
 8  2016     8 3         23      3 6     High … Other Fema… Middl… $30000… Cath…
 9  2016     9 1         45      3 5     High … Black Male  Middl… $60000… Prot…
10  2016    10 3         71      4 1     Junio… White Male  Middl… $60000… None 
# … with 2,857 more rows, 20 more variables: marital <fct>, padeg <fct>,
#   madeg <fct>, partyid <fct>, polviews <fct>, happy <fct>, partners <fct>,
#   grass <fct>, zodiac <fct>, pres12 <labelled>, wtssall <dbl>,
#   income_rc <fct>, agegrp <fct>, ageq <fct>, siblings <fct>, kids <fct>,
#   religion <fct>, bigregion <fct>, partners_rc <fct>, obama <dbl>, and
#   abbreviated variable name ¹​income16

Summarize groups with summarize()

Code
gss_sm |> 
  group_by(bigregion) |>  #<<
  summarize(total = n()) 
# A tibble: 4 × 2
  bigregion total
  <fct>     <int>
1 Northeast   488
2 Midwest     695
3 South      1052
4 West        632

Multi-way groupings

Code
gss_sm |>  
  group_by(bigregion, religion) |> 
  summarize(total = n()) 
`summarise()` has grouped output by 'bigregion'. You can override using the
`.groups` argument.
# A tibble: 24 × 3
# Groups:   bigregion [4]
   bigregion religion   total
   <fct>     <fct>      <int>
 1 Northeast Protestant   158
 2 Northeast Catholic     162
 3 Northeast Jewish        27
 4 Northeast None         112
 5 Northeast Other         28
 6 Northeast <NA>           1
 7 Midwest   Protestant   325
 8 Midwest   Catholic     172
 9 Midwest   Jewish         3
10 Midwest   None         157
# … with 14 more rows

Add columns with mutate()

Code
# Count respondents in each region/religion cell, then convert the counts
# to shares. summarize() peels off the last grouping level, so the table is
# still grouped by bigregion when mutate() runs and sum(total) is the
# regional total.
gss_sm |>
  group_by(bigregion, religion) |>
  summarize(total = n()) |>
  mutate(
    freq = total / sum(total),
    pct = round(freq * 100, 1)
  )
`summarise()` has grouped output by 'bigregion'. You can override using the
`.groups` argument.
# A tibble: 24 × 5
# Groups:   bigregion [4]
   bigregion religion   total    freq   pct
   <fct>     <fct>      <int>   <dbl> <dbl>
 1 Northeast Protestant   158 0.324    32.4
 2 Northeast Catholic     162 0.332    33.2
 3 Northeast Jewish        27 0.0553    5.5
 4 Northeast None         112 0.230    23  
 5 Northeast Other         28 0.0574    5.7
 6 Northeast <NA>           1 0.00205   0.2
 7 Midwest   Protestant   325 0.468    46.8
 8 Midwest   Catholic     172 0.247    24.7
 9 Midwest   Jewish         3 0.00432   0.4
10 Midwest   None         157 0.226    22.6
# … with 14 more rows

Tally and Count

  • Do it yourself:
Code
gss_sm |> 
  group_by(bigregion, religion) |> #<<
  summarize(n = n()) #<<
`summarise()` has grouped output by 'bigregion'. You can override using the
`.groups` argument.
# A tibble: 24 × 3
# Groups:   bigregion [4]
   bigregion religion       n
   <fct>     <fct>      <int>
 1 Northeast Protestant   158
 2 Northeast Catholic     162
 3 Northeast Jewish        27
 4 Northeast None         112
 5 Northeast Other         28
 6 Northeast <NA>           1
 7 Midwest   Protestant   325
 8 Midwest   Catholic     172
 9 Midwest   Jewish         3
10 Midwest   None         157
# … with 14 more rows
  • Use tally():
Code
gss_sm |> 
  group_by(bigregion, religion) |> 
  tally() #<<
# A tibble: 24 × 3
# Groups:   bigregion [4]
   bigregion religion       n
   <fct>     <fct>      <int>
 1 Northeast Protestant   158
 2 Northeast Catholic     162
 3 Northeast Jewish        27
 4 Northeast None         112
 5 Northeast Other         28
 6 Northeast <NA>           1
 7 Midwest   Protestant   325
 8 Midwest   Catholic     172
 9 Midwest   Jewish         3
10 Midwest   None         157
# … with 14 more rows
  • Use count():
Code
gss_sm |> 
  count(bigregion, religion) #<<
# A tibble: 24 × 3
   bigregion religion       n
   <fct>     <fct>      <int>
 1 Northeast Protestant   158
 2 Northeast Catholic     162
 3 Northeast Jewish        27
 4 Northeast None         112
 5 Northeast Other         28
 6 Northeast <NA>           1
 7 Midwest   Protestant   325
 8 Midwest   Catholic     172
 9 Midwest   Jewish         3
10 Midwest   None         157
# … with 14 more rows

Pay attention to how grouping works in these summaries.

Check your work

Code
rel_by_region <- gss_sm |> 
  count(bigregion, religion) |> 
  mutate(pct = round((n/sum(n))*100, 1)) 

rel_by_region
# A tibble: 24 × 4
   bigregion religion       n   pct
   <fct>     <fct>      <int> <dbl>
 1 Northeast Protestant   158   5.5
 2 Northeast Catholic     162   5.7
 3 Northeast Jewish        27   0.9
 4 Northeast None         112   3.9
 5 Northeast Other         28   1  
 6 Northeast <NA>           1   0  
 7 Midwest   Protestant   325  11.3
 8 Midwest   Catholic     172   6  
 9 Midwest   Jewish         3   0.1
10 Midwest   None         157   5.5
# … with 14 more rows
  • Each region should sum to ~100
Code
rel_by_region |> 
  group_by(bigregion) |> 
  summarize(total = sum(pct)) 
# A tibble: 4 × 2
  bigregion total
  <fct>     <dbl>
1 Northeast  17  
2 Midwest    24.3
3 South      36.7
4 West       22  
  • Grouping has caught us out. Try again.
Code
# count() hands back an ungrouped table, so in the first attempt sum(n) was
# the grand total and pct was a share of the whole sample.
# group_by() + tally() leaves the result grouped by bigregion, which makes
# sum(n) the regional total and pct a within-region percentage.
rel_by_region <- gss_sm |>
  group_by(bigregion, religion) |> #<<
  tally() |> #<<
  mutate(pct = round((n / sum(n)) * 100, 1))

rel_by_region
# A tibble: 24 × 4
# Groups:   bigregion [4]
   bigregion religion       n   pct
   <fct>     <fct>      <int> <dbl>
 1 Northeast Protestant   158  32.4
 2 Northeast Catholic     162  33.2
 3 Northeast Jewish        27   5.5
 4 Northeast None         112  23  
 5 Northeast Other         28   5.7
 6 Northeast <NA>           1   0.2
 7 Midwest   Protestant   325  46.8
 8 Midwest   Catholic     172  24.7
 9 Midwest   Jewish         3   0.4
10 Midwest   None         157  22.6
# … with 14 more rows

Summarize (usually) returns one tibble row per group

Code
gss_sm |> 
  group_by(bigregion) |> 
  tally()
# A tibble: 4 × 2
  bigregion     n
  <fct>     <int>
1 Northeast   488
2 Midwest     695
3 South      1052
4 West        632

When you have two-way or n-way groupings, the calculation is done from the inside out, on the innermost group.

Code
# 4 regions, 6 religion = 24 groups
gss_sm |> 
  group_by(bigregion, religion) |> 
  tally()
# A tibble: 24 × 3
# Groups:   bigregion [4]
   bigregion religion       n
   <fct>     <fct>      <int>
 1 Northeast Protestant   158
 2 Northeast Catholic     162
 3 Northeast Jewish        27
 4 Northeast None         112
 5 Northeast Other         28
 6 Northeast <NA>           1
 7 Midwest   Protestant   325
 8 Midwest   Catholic     172
 9 Midwest   Jewish         3
10 Midwest   None         157
# … with 14 more rows

Summarize many variables

The inefficient way:

Code
# Spell out every summary by hand: one mean() or sd() call per output
# column, each ignoring missing values.
organdata |>
  group_by(consent_law, country) |>
  summarize(
    donors_mean = mean(donors, na.rm = TRUE),
    donors_sd = sd(donors, na.rm = TRUE),
    gdp_mean = mean(gdp, na.rm = TRUE),
    gdp_sd = sd(gdp, na.rm = TRUE),
    health_mean = mean(health, na.rm = TRUE),
    roads_mean = mean(roads, na.rm = TRUE),
    cerebvas_mean = mean(cerebvas, na.rm = TRUE)
  )
`summarise()` has grouped output by 'consent_law'. You can override using the
`.groups` argument.
# A tibble: 17 × 9
# Groups:   consent_law [2]
   consent_law country    donor…¹ donor…² gdp_m…³ gdp_sd healt…⁴ roads…⁵ cereb…⁶
   <chr>       <chr>        <dbl>   <dbl>   <dbl>  <dbl>   <dbl>   <dbl>   <dbl>
 1 Informed    Australia     10.6   1.14   22179.  3959.   1958.   105.     558.
 2 Informed    Canada        14.0   0.751  23711.  3966.   2272.   109.     422.
 3 Informed    Denmark       13.1   1.47   23722.  3896.   2054.   102.     641.
 4 Informed    Germany       13.0   0.611  22163.  2501.   2349.   113.     707.
 5 Informed    Ireland       19.8   2.48   20824.  6670.   1480.   118.     705.
 6 Informed    Netherlan…    13.7   1.55   23013.  3770.   1993.    76.1    585.
 7 Informed    United Ki…    13.5   0.775  21359.  3929.   1561.    67.9    708.
 8 Informed    United St…    20.0   1.33   29212.  4571.   3988.   155.     444.
 9 Presumed    Austria       23.5   2.42   23876.  3343.   1875.   150.     769.
10 Presumed    Belgium       21.9   1.94   22500.  3171.   1958.   155.     594.
11 Presumed    Finland       18.4   1.53   21019.  3668.   1615.    93.6    771.
12 Presumed    France        16.8   1.60   22603.  3260.   2160.   156.     433.
13 Presumed    Italy         11.1   4.28   21554.  2781.   1757    122.     712.
14 Presumed    Norway        15.4   1.11   26448.  6492.   2217.    70.0    662.
15 Presumed    Spain         28.1   4.96   16933   2888.   1289.   161.     655.
16 Presumed    Sweden        13.1   1.75   22415.  3213.   1951.    72.3    595.
17 Presumed    Switzerla…    14.2   1.71   27233   2153.   2776.    96.4    424.
# … with abbreviated variable names ¹​donors_mean, ²​donors_sd, ³​gdp_mean,
#   ⁴​health_mean, ⁵​roads_mean, ⁶​cerebvas_mean

Use across() and where() instead

Better:

Code
organdata |> 
    group_by(consent_law, country) |>
      summarize(across(where(is.numeric),
                       list(mean = ~ mean(.x, na.rm = TRUE), 
                            sd = ~ sd(.x, na.rm = TRUE))))
`summarise()` has grouped output by 'consent_law'. You can override using the
`.groups` argument.
# A tibble: 17 × 28
# Groups:   consent_law [2]
   conse…¹ country donor…² donor…³ pop_m…⁴ pop_sd pop_d…⁵ pop_d…⁶ gdp_m…⁷ gdp_sd
   <chr>   <chr>     <dbl>   <dbl>   <dbl>  <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
 1 Inform… Austra…    10.6   1.14   18318. 8.31e2   0.237  0.0107  22179.  3959.
 2 Inform… Canada     14.0   0.751  29608. 1.19e3   0.297  0.0120  23711.  3966.
 3 Inform… Denmark    13.1   1.47    5257. 8.06e1  12.2    0.187   23722.  3896.
 4 Inform… Germany    13.0   0.611  80255. 5.16e3  22.5    1.44    22163.  2501.
 5 Inform… Ireland    19.8   2.48    3674. 1.32e2   5.23   0.187   20824.  6670.
 6 Inform… Nether…    13.7   1.55   15548. 3.73e2  37.4    0.898   23013.  3770.
 7 Inform… United…    13.5   0.775  58187. 6.26e2  24.0    0.258   21359.  3929.
 8 Inform… United…    20.0   1.33  269330. 1.25e4   2.80   0.130   29212.  4571.
 9 Presum… Austria    23.5   2.42    7927. 1.09e2   9.45   0.130   23876.  3343.
10 Presum… Belgium    21.9   1.94   10153. 1.09e2  30.7    0.330   22500.  3171.
11 Presum… Finland    18.4   1.53    5112. 6.86e1   1.51   0.0203  21019.  3668.
12 Presum… France     16.8   1.60   58056. 8.51e2  10.5    0.154   22603.  3260.
13 Presum… Italy      11.1   4.28   57360. 4.25e2  19.0    0.141   21554.  2781.
14 Presum… Norway     15.4   1.11    4386. 9.73e1   1.35   0.0300  26448.  6492.
15 Presum… Spain      28.1   4.96   39666. 9.51e2   7.84   0.188   16933   2888.
16 Presum… Sweden     13.1   1.75    8789. 1.14e2   1.95   0.0253  22415.  3213.
17 Presum… Switze…    14.2   1.71    7037. 1.70e2  17.0    0.411   27233   2153.
# … with 18 more variables: gdp_lag_mean <dbl>, gdp_lag_sd <dbl>,
#   health_mean <dbl>, health_sd <dbl>, health_lag_mean <dbl>,
#   health_lag_sd <dbl>, pubhealth_mean <dbl>, pubhealth_sd <dbl>,
#   roads_mean <dbl>, roads_sd <dbl>, cerebvas_mean <dbl>, cerebvas_sd <dbl>,
#   assault_mean <dbl>, assault_sd <dbl>, external_mean <dbl>,
#   external_sd <dbl>, txp_pop_mean <dbl>, txp_pop_sd <dbl>, and abbreviated
#   variable names ¹​consent_law, ²​donors_mean, ³​donors_sd, ⁴​pop_mean, …

Think of .x as a pronoun. It means “the thing” or “the thing we’re doing something to right now”.

Optionally drop any remaining groups:

Code
organdata |> 
    group_by(consent_law, country) |>
      summarize(across(where(is.numeric),
                       list(mean = ~ mean(.x, na.rm = TRUE), 
                            sd = ~ sd(.x, na.rm = TRUE))),
                .groups = "drop")
# A tibble: 17 × 28
   conse…¹ country donor…² donor…³ pop_m…⁴ pop_sd pop_d…⁵ pop_d…⁶ gdp_m…⁷ gdp_sd
   <chr>   <chr>     <dbl>   <dbl>   <dbl>  <dbl>   <dbl>   <dbl>   <dbl>  <dbl>
 1 Inform… Austra…    10.6   1.14   18318. 8.31e2   0.237  0.0107  22179.  3959.
 2 Inform… Canada     14.0   0.751  29608. 1.19e3   0.297  0.0120  23711.  3966.
 3 Inform… Denmark    13.1   1.47    5257. 8.06e1  12.2    0.187   23722.  3896.
 4 Inform… Germany    13.0   0.611  80255. 5.16e3  22.5    1.44    22163.  2501.
 5 Inform… Ireland    19.8   2.48    3674. 1.32e2   5.23   0.187   20824.  6670.
 6 Inform… Nether…    13.7   1.55   15548. 3.73e2  37.4    0.898   23013.  3770.
 7 Inform… United…    13.5   0.775  58187. 6.26e2  24.0    0.258   21359.  3929.
 8 Inform… United…    20.0   1.33  269330. 1.25e4   2.80   0.130   29212.  4571.
 9 Presum… Austria    23.5   2.42    7927. 1.09e2   9.45   0.130   23876.  3343.
10 Presum… Belgium    21.9   1.94   10153. 1.09e2  30.7    0.330   22500.  3171.
11 Presum… Finland    18.4   1.53    5112. 6.86e1   1.51   0.0203  21019.  3668.
12 Presum… France     16.8   1.60   58056. 8.51e2  10.5    0.154   22603.  3260.
13 Presum… Italy      11.1   4.28   57360. 4.25e2  19.0    0.141   21554.  2781.
14 Presum… Norway     15.4   1.11    4386. 9.73e1   1.35   0.0300  26448.  6492.
15 Presum… Spain      28.1   4.96   39666. 9.51e2   7.84   0.188   16933   2888.
16 Presum… Sweden     13.1   1.75    8789. 1.14e2   1.95   0.0253  22415.  3213.
17 Presum… Switze…    14.2   1.71    7037. 1.70e2  17.0    0.411   27233   2153.
# … with 18 more variables: gdp_lag_mean <dbl>, gdp_lag_sd <dbl>,
#   health_mean <dbl>, health_sd <dbl>, health_lag_mean <dbl>,
#   health_lag_sd <dbl>, pubhealth_mean <dbl>, pubhealth_sd <dbl>,
#   roads_mean <dbl>, roads_sd <dbl>, cerebvas_mean <dbl>, cerebvas_sd <dbl>,
#   assault_mean <dbl>, assault_sd <dbl>, external_mean <dbl>,
#   external_sd <dbl>, txp_pop_mean <dbl>, txp_pop_sd <dbl>, and abbreviated
#   variable names ¹​consent_law, ²​donors_mean, ³​donors_sd, ⁴​pop_mean, …
  • Tidyverse functions use this dot prefix in names like .x and .groups for internal function arguments that should not be confused with trying to directly name or talk about columns in the data.

  • The across() function is used inside summarize() and mutate() to do something across some subset of columns.

  • Inside across(), use where() to choose columns, and then apply a function to each of them.

Code
organdata |>
  mutate(across(where(is.numeric), 
         round))
# A tibble: 238 × 21
   country  year       donors   pop pop_d…¹   gdp gdp_lag health healt…² pubhe…³
   <chr>    <date>      <dbl> <dbl>   <dbl> <dbl>   <dbl>  <dbl>   <dbl>   <dbl>
 1 Austral… NA             NA 17065       0 16774   16591   1300    1224       5
 2 Austral… 1991-01-01     12 17284       0 17171   16774   1379    1300       5
 3 Austral… 1992-01-01     12 17495       0 17914   17171   1455    1379       5
 4 Austral… 1993-01-01     13 17667       0 18883   17914   1540    1455       5
 5 Austral… 1994-01-01     10 17855       0 19849   18883   1626    1540       5
 6 Austral… 1995-01-01     10 18072       0 21079   19849   1737    1626       6
 7 Austral… 1996-01-01     11 18311       0 21923   21079   1846    1737       6
 8 Austral… 1997-01-01     10 18518       0 22961   21923   1948    1846       6
 9 Austral… 1998-01-01     10 18711       0 24148   22961   2077    1948       6
10 Austral… 1999-01-01      9 18926       0 25445   24148   2231    2077       6
# … with 228 more rows, 11 more variables: roads <dbl>, cerebvas <dbl>,
#   assault <dbl>, external <dbl>, txp_pop <dbl>, world <chr>, opt <chr>,
#   consent_law <chr>, consent_practice <chr>, consistent <chr>, ccode <chr>,
#   and abbreviated variable names ¹​pop_dens, ²​health_lag, ³​pubhealth

You can also use various “tidy selectors”, like this:

Code
organdata |>
  mutate(across(starts_with("pop"), 
         round))
# A tibble: 238 × 21
   country  year       donors   pop pop_d…¹   gdp gdp_lag health healt…² pubhe…³
   <chr>    <date>      <dbl> <dbl>   <dbl> <int>   <int>  <dbl>   <dbl>   <dbl>
 1 Austral… NA          NA    17065       0 16774   16591   1300    1224     4.8
 2 Austral… 1991-01-01  12.1  17284       0 17171   16774   1379    1300     5.4
 3 Austral… 1992-01-01  12.4  17495       0 17914   17171   1455    1379     5.4
 4 Austral… 1993-01-01  12.5  17667       0 18883   17914   1540    1455     5.4
 5 Austral… 1994-01-01  10.2  17855       0 19849   18883   1626    1540     5.4
 6 Austral… 1995-01-01  10.2  18072       0 21079   19849   1737    1626     5.5
 7 Austral… 1996-01-01  10.6  18311       0 21923   21079   1846    1737     5.6
 8 Austral… 1997-01-01  10.3  18518       0 22961   21923   1948    1846     5.7
 9 Austral… 1998-01-01  10.5  18711       0 24148   22961   2077    1948     5.9
10 Austral… 1999-01-01   8.67 18926       0 25445   24148   2231    2077     6.1
# … with 228 more rows, 11 more variables: roads <dbl>, cerebvas <int>,
#   assault <int>, external <int>, txp_pop <dbl>, world <chr>, opt <chr>,
#   consent_law <chr>, consent_practice <chr>, consistent <chr>, ccode <chr>,
#   and abbreviated variable names ¹​pop_dens, ²​health_lag, ³​pubhealth

The function can be a named one, or you can write something yourself:

Code
organdata |>
  mutate(across(starts_with("pop"), 
         ~ .x / 100))
# A tibble: 238 × 21
   country  year       donors   pop pop_d…¹   gdp gdp_lag health healt…² pubhe…³
   <chr>    <date>      <dbl> <dbl>   <dbl> <int>   <int>  <dbl>   <dbl>   <dbl>
 1 Austral… NA          NA     171. 0.00220 16774   16591   1300    1224     4.8
 2 Austral… 1991-01-01  12.1   173. 0.00223 17171   16774   1379    1300     5.4
 3 Austral… 1992-01-01  12.4   175. 0.00226 17914   17171   1455    1379     5.4
 4 Austral… 1993-01-01  12.5   177. 0.00228 18883   17914   1540    1455     5.4
 5 Austral… 1994-01-01  10.2   179. 0.00231 19849   18883   1626    1540     5.4
 6 Austral… 1995-01-01  10.2   181. 0.00233 21079   19849   1737    1626     5.5
 7 Austral… 1996-01-01  10.6   183. 0.00237 21923   21079   1846    1737     5.6
 8 Austral… 1997-01-01  10.3   185. 0.00239 22961   21923   1948    1846     5.7
 9 Austral… 1998-01-01  10.5   187. 0.00242 24148   22961   2077    1948     5.9
10 Austral… 1999-01-01   8.67  189. 0.00244 25445   24148   2231    2077     6.1
# … with 228 more rows, 11 more variables: roads <dbl>, cerebvas <int>,
#   assault <int>, external <int>, txp_pop <dbl>, world <chr>, opt <chr>,
#   consent_law <chr>, consent_practice <chr>, consistent <chr>, ccode <chr>,
#   and abbreviated variable names ¹​pop_dens, ²​health_lag, ³​pubhealth

You can use where() with select() as well, when you are just subsetting by column but not yet doing anything across the columns:

Code
organdata |>
  select(where(is.character))
# A tibble: 238 × 7
   country   world   opt   consent_law consent_practice consistent ccode
   <chr>     <chr>   <chr> <chr>       <chr>            <chr>      <chr>
 1 Australia Liberal In    Informed    Informed         Yes        Oz   
 2 Australia Liberal In    Informed    Informed         Yes        Oz   
 3 Australia Liberal In    Informed    Informed         Yes        Oz   
 4 Australia Liberal In    Informed    Informed         Yes        Oz   
 5 Australia Liberal In    Informed    Informed         Yes        Oz   
 6 Australia Liberal In    Informed    Informed         Yes        Oz   
 7 Australia Liberal In    Informed    Informed         Yes        Oz   
 8 Australia Liberal In    Informed    Informed         Yes        Oz   
 9 Australia Liberal In    Informed    Informed         Yes        Oz   
10 Australia Liberal In    Informed    Informed         Yes        Oz   
# … with 228 more rows
Code
organdata |>
  select(starts_with("gdp"))
# A tibble: 238 × 2
     gdp gdp_lag
   <int>   <int>
 1 16774   16591
 2 17171   16774
 3 17914   17171
 4 18883   17914
 5 19849   18883
 6 21079   19849
 7 21923   21079
 8 22961   21923
 9 24148   22961
10 25445   24148
# … with 228 more rows
Code
organdata |>
  select(contains("health"))
# A tibble: 238 × 3
   health health_lag pubhealth
    <dbl>      <dbl>     <dbl>
 1   1300       1224       4.8
 2   1379       1300       5.4
 3   1455       1379       5.4
 4   1540       1455       5.4
 5   1626       1540       5.4
 6   1737       1626       5.5
 7   1846       1737       5.6
 8   1948       1846       5.7
 9   2077       1948       5.9
10   2231       2077       6.1
# … with 228 more rows

This applies to mutate() as well

If you use a function like mean() or sd() or n() with mutate() instead of summarize() it will work too. The difference is that a column will be added with the value repeated for all group members. This can be useful when you want e.g. to make a denominator for some calculation later. Remember, mutate() adds or changes columns but never changes the number of rows in the table, whereas summarize() will usually output a table with fewer rows than the one you give it.

Code
## Country-year data, 238 rows altogether,
## with yearly data for 17 countries.
organdata |>
  select(country, donors)
# A tibble: 238 × 2
   country   donors
   <chr>      <dbl>
 1 Australia  NA   
 2 Australia  12.1 
 3 Australia  12.4 
 4 Australia  12.5 
 5 Australia  10.2 
 6 Australia  10.2 
 7 Australia  10.6 
 8 Australia  10.3 
 9 Australia  10.5 
10 Australia   8.67
# … with 228 more rows
Code
## Summarize gets you one row per country
organdata |>
  select(country, donors) |> 
  group_by(country) |> 
  summarize(donors_mean = mean(donors, na.rm = TRUE))
# A tibble: 17 × 2
   country        donors_mean
   <chr>                <dbl>
 1 Australia             10.6
 2 Austria               23.5
 3 Belgium               21.9
 4 Canada                14.0
 5 Denmark               13.1
 6 Finland               18.4
 7 France                16.8
 8 Germany               13.0
 9 Ireland               19.8
10 Italy                 11.1
11 Netherlands           13.7
12 Norway                15.4
13 Spain                 28.1
14 Sweden                13.1
15 Switzerland           14.2
16 United Kingdom        13.5
17 United States         20.0
Code
## With mutate() the grouped mean is attached to every row,
## so all 238 observations are kept and each one carries
## its own country's donor mean.
tmp <- organdata |>
  select(country, donors) |>
  group_by(country) |>
  mutate(donors_mean = mean(donors, na.rm = TRUE))

# Peek at the first rows of the 238
head(tmp)
# A tibble: 6 × 3
# Groups:   country [1]
  country   donors donors_mean
  <chr>      <dbl>       <dbl>
1 Australia   NA          10.6
2 Australia   12.1        10.6
3 Australia   12.4        10.6
4 Australia   12.5        10.6
5 Australia   10.2        10.6
6 Australia   10.2        10.6
Code
# Last few rows of 238
tail(tmp)
# A tibble: 6 × 3
# Groups:   country [1]
  country       donors donors_mean
  <chr>          <dbl>       <dbl>
1 United States   21          20.0
2 United States   20.9        20.0
3 United States   21.2        20.0
4 United States   21.3        20.0
5 United States   21.5        20.0
6 United States   NA          20.0

Graph your summarized tables

Code
# Within-region religion percentages (rows with missing values dropped),
# piped straight into a bar chart with one panel per region.
gss_sm |>
  group_by(bigregion, religion) |>
  tally() |>
  mutate(pct = round(n / sum(n) * 100, 1)) |>
  drop_na() |>
  ggplot(
    mapping = aes(
      x = pct,
      y = reorder(religion, -pct),
      fill = religion
    )
  ) + #<<
  geom_col() + #<<
  labs(x = "Percent", y = NULL) +
  guides(fill = "none") +
  facet_wrap(~ bigregion, nrow = 1)

Code
# Within-region religion percentages, stored for the plots that follow.
# tally() keeps the bigregion grouping, so sum(n) is the regional total;
# drop_na() then removes rows containing missing values.
rel_by_region <- gss_sm |>
  group_by(bigregion, religion) |>
  tally() |>
  mutate(pct = round(n / sum(n) * 100, 1)) |>
  drop_na()

head(rel_by_region)
# A tibble: 6 × 4
# Groups:   bigregion [2]
  bigregion religion       n   pct
  <fct>     <fct>      <int> <dbl>
1 Northeast Protestant   158  32.4
2 Northeast Catholic     162  33.2
3 Northeast Jewish        27   5.5
4 Northeast None         112  23  
5 Northeast Other         28   5.7
6 Midwest   Protestant   325  46.8
Code
# Dodged bar chart: regions on the x-axis, one bar per religion within each.
p <- ggplot(
  data = rel_by_region,
  mapping = aes(x = bigregion, y = pct, fill = religion)
)
p_out <- p +
  geom_col(position = "dodge") +
  labs(x = "Region", y = "Percent", fill = "Religion")

p_out

Experiment with facets:

Code
# Faceted alternative: percent on the x-axis, religions on the y-axis,
# and a separate panel per region. The fill legend is suppressed because
# the y-axis already names each religion.
p <- ggplot(
  data = rel_by_region,
  mapping = aes(
    x = pct, #<<
    y = reorder(religion, -pct), #<<
    fill = religion
  )
)
p_out_facet <- p +
  geom_col() +
  guides(fill = "none") +
  facet_wrap(~ bigregion, nrow = 1) +
  labs(x = "Percent", y = NULL)

p_out_facet

Multi-way facets

Code
# childs plotted against age with a smoother, one panel per level of race.
p <- ggplot(
  data = gss_sm,
  mapping = aes(x = age, y = childs)
)

p +
  geom_point(alpha = 0.2) +
  geom_smooth() +
  facet_wrap(~ race)
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 18 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 18 rows containing missing values (`geom_point()`).

Code
# Same scatter and smoother, now with one panel for every
# sex-by-race combination.
p <- ggplot(
  data = gss_sm,
  mapping = aes(x = age, y = childs)
)

p +
  geom_point(alpha = 0.2) +
  geom_smooth() +
  facet_wrap(~ sex + race) #<<
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 18 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 18 rows containing missing values (`geom_point()`).

Code
# Sex-by-race panels again, this time laid out in a single row.
p <- ggplot(
  data = gss_sm,
  mapping = aes(x = age, y = childs)
)

p +
  geom_point(alpha = 0.2) +
  geom_smooth() +
  facet_wrap(~ sex + race, nrow = 1) #<<
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 18 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 18 rows containing missing values (`geom_point()`).

facet_wrap() vs facet_grid()

Code
# facet_grid() lays the panels out as a table: sex in rows, race in columns.
p +
  geom_point(alpha = 0.2) +
  geom_smooth() +
  facet_grid(sex ~ race) #<<
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 18 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 18 rows containing missing values (`geom_point()`).

Code
# Three-way grid: bigregion in rows, race-by-sex combinations in columns.
p_out <- p +
  geom_point(alpha = 0.2) +
  geom_smooth() +
  facet_grid(bigregion ~ race + sex) #<<

p_out
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Warning: Removed 18 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 18 rows containing missing values (`geom_point()`).