11 — Text as Data

Kieran Healy

March 27, 2024

Text as Data

Load the packages, as always

library(here)      # manage file paths
library(socviz)    # data and some useful functions
library(tidyverse) # your friend and mine

Specialty packages

#install.packages("tidytext")
#install.packages("gutenbergr")
#install.packages("janeaustenr")

library(tidytext)    # Tools for analyzing text
library(gutenbergr)  # Get books from Project Gutenberg
library(janeaustenr) # Pre-organized dataset of Jane Austen's novels

Tidy text

original_books <- austen_books() |>
  group_by(book) |>
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, 
                                     regex("^chapter [\\divxlc]",
                                           ignore_case = TRUE)))) |>
  ungroup()

tidy_books <- original_books |>
  unnest_tokens(word, text)

tidy_books
# A tibble: 725,055 × 4
   book                linenumber chapter word       
   <fct>                    <int>   <int> <chr>      
 1 Sense & Sensibility          1       0 sense      
 2 Sense & Sensibility          1       0 and        
 3 Sense & Sensibility          1       0 sensibility
 4 Sense & Sensibility          3       0 by         
 5 Sense & Sensibility          3       0 jane       
 6 Sense & Sensibility          3       0 austen     
 7 Sense & Sensibility          5       0 1811       
 8 Sense & Sensibility         10       1 chapter    
 9 Sense & Sensibility         10       1 1          
10 Sense & Sensibility         13       1 the        
# ℹ 725,045 more rows

“Stopwords”

  • For many purposes (though not always!), very common words like prepositions and articles are not interesting.
data(stop_words)

stop_words
# A tibble: 1,149 × 2
   word        lexicon
   <chr>       <chr>  
 1 a           SMART  
 2 a's         SMART  
 3 able        SMART  
 4 about       SMART  
 5 above       SMART  
 6 according   SMART  
 7 accordingly SMART  
 8 across      SMART  
 9 actually    SMART  
10 after       SMART  
# ℹ 1,139 more rows
tidy_books <- tidy_books |>
  anti_join(stop_words)

Stopwords removed

tidy_books |>
  count(word, sort = TRUE) 
# A tibble: 13,914 × 2
   word       n
   <chr>  <int>
 1 miss    1855
 2 time    1337
 3 fanny    862
 4 dear     822
 5 lady     817
 6 sir      806
 7 day      797
 8 emma     787
 9 sister   727
10 house    699
# ℹ 13,904 more rows

Word frequency

tidy_books |>
  count(word, sort = TRUE) |>
  filter(n > 600) |>
  mutate(word = reorder(word, n)) |>
  ggplot(aes(n, word)) + geom_col() +
  labs(y = NULL)

tf-idf

Stands for “Term Frequency–Inverse Document Frequency”

The idea is to count the frequency of terms in a document, but to decrease the weight of words that are common throughout the corpus and increase the weight of words that appear in relatively few of its documents.
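
Concretely, a term's tf is its share of the words in a single document, its idf is log(number of documents / number of documents containing the term), and tf-idf is the product of the two. A minimal sketch of the weighting, using an invented two-document corpus (the tibble below is made up purely for illustration):

# Invented counts for two tiny documents, "a" and "b"
toy <- tribble(
  ~doc, ~word,   ~n,
  "a",  "apple", 10,
  "a",  "pear",   2,
  "b",  "apple",  8
)

# bind_tf_idf() adds tf, idf, and tf_idf columns
toy |>
  bind_tf_idf(word, doc, n)

Here "apple" appears in both documents, so its idf is log(2/2) = 0 and its tf-idf is zero, while "pear" appears in only one document and so keeps a positive weight.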

For example …

book_words <- austen_books() |>
  unnest_tokens(word, text) |>
  count(book, word, sort = TRUE)

total_words <- book_words |> 
  group_by(book) |> 
  summarize(total = sum(n))

book_words <- left_join(book_words, total_words)

book_words
# A tibble: 40,379 × 4
   book              word      n  total
   <fct>             <chr> <int>  <int>
 1 Mansfield Park    the    6206 160460
 2 Mansfield Park    to     5475 160460
 3 Mansfield Park    and    5438 160460
 4 Emma              to     5239 160996
 5 Emma              the    5201 160996
 6 Emma              and    4896 160996
 7 Mansfield Park    of     4778 160460
 8 Pride & Prejudice the    4331 122204
 9 Emma              of     4291 160996
10 Pride & Prejudice to     4162 122204
# ℹ 40,369 more rows

For example …

ggplot(book_words, aes(n/total, fill = book)) +
  geom_histogram(show.legend = FALSE) +
  xlim(NA, 0.0009) +
  facet_wrap(~book, nrow = 2, scales = "free_y")
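
For reference, the same bind_tf_idf() function used later for bigrams can also be applied directly to the per-word counts in book_words; a minimal sketch (the name book_tf_idf is just illustrative). Because words like "the" occur in every novel, their idf, and hence their tf-idf, is zero.

book_tf_idf <- book_words |>
  bind_tf_idf(word, book, n) |> # adds tf, idf, and tf_idf columns
  arrange(desc(tf_idf))         # most distinctive words per book first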

Zipf’s Law

“The frequency that a word appears is inversely proportional to its rank.”

freq_by_rank <- book_words |> 
  group_by(book) |> 
  mutate(rank = row_number(), 
         `term frequency` = n/total) |>
  ungroup()

freq_by_rank
# A tibble: 40,379 × 6
   book              word      n  total  rank `term frequency`
   <fct>             <chr> <int>  <int> <int>            <dbl>
 1 Mansfield Park    the    6206 160460     1           0.0387
 2 Mansfield Park    to     5475 160460     2           0.0341
 3 Mansfield Park    and    5438 160460     3           0.0339
 4 Emma              to     5239 160996     1           0.0325
 5 Emma              the    5201 160996     2           0.0323
 6 Emma              and    4896 160996     3           0.0304
 7 Mansfield Park    of     4778 160460     4           0.0298
 8 Pride & Prejudice the    4331 122204     1           0.0354
 9 Emma              of     4291 160996     4           0.0267
10 Pride & Prejudice to     4162 122204     2           0.0341
# ℹ 40,369 more rows

Zipf’s Law

freq_by_rank |> 
  ggplot(aes(rank, `term frequency`, color = book)) + 
  geom_line(linewidth = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()
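
The claim quoted above can be checked roughly on this log-log scale: if frequency is inversely proportional to rank, the points should fall on a line with slope close to -1. A minimal sketch of that check, restricted to a middle range of ranks (the cutoffs are arbitrary):

rank_subset <- freq_by_rank |>
  filter(rank < 500,
         rank > 10)

# the slope of the fitted line should be near -1 if Zipf's law holds
lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)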

n-grams

austen_bigrams <- austen_books() |>
  unnest_tokens(bigram, text, token = "ngrams", n = 2) |>
  filter(!is.na(bigram))

austen_bigrams
# A tibble: 662,783 × 2
   book                bigram         
   <fct>               <chr>          
 1 Sense & Sensibility sense and      
 2 Sense & Sensibility and sensibility
 3 Sense & Sensibility by jane        
 4 Sense & Sensibility jane austen    
 5 Sense & Sensibility chapter 1      
 6 Sense & Sensibility the family     
 7 Sense & Sensibility family of      
 8 Sense & Sensibility of dashwood    
 9 Sense & Sensibility dashwood had   
10 Sense & Sensibility had long       
# ℹ 662,773 more rows

n-grams

austen_bigrams |>
  count(bigram, sort = TRUE)
# A tibble: 193,209 × 2
   bigram       n
   <chr>    <int>
 1 of the    2853
 2 to be     2670
 3 in the    2221
 4 it was    1691
 5 i am      1485
 6 she had   1405
 7 of her    1363
 8 to the    1315
 9 she was   1309
10 had been  1206
# ℹ 193,199 more rows

Once again, the most common bigrams are made up largely of stopwords.

n-grams

Split the bigram column into two word columns

bigrams_separated <- austen_bigrams |>
  separate(bigram, c("word1", "word2"), sep = " ")

bigrams_filtered <- bigrams_separated |>
  filter(!word1 %in% stop_words$word) |>
  filter(!word2 %in% stop_words$word)

# new bigram counts:
bigram_counts <- bigrams_filtered |> 
  count(word1, word2, sort = TRUE)

bigram_counts
# A tibble: 28,974 × 3
   word1   word2         n
   <chr>   <chr>     <int>
 1 sir     thomas      266
 2 miss    crawford    196
 3 captain wentworth   143
 4 miss    woodhouse   143
 5 frank   churchill   114
 6 lady    russell     110
 7 sir     walter      108
 8 lady    bertram     101
 9 miss    fairfax      98
10 colonel brandon      96
# ℹ 28,964 more rows

n-grams

bigrams_united <- bigrams_filtered |>
  unite(bigram, word1, word2, sep = " ")

bigrams_united
# A tibble: 38,913 × 2
   book                bigram                  
   <fct>               <chr>                   
 1 Sense & Sensibility jane austen             
 2 Sense & Sensibility chapter 1               
 3 Sense & Sensibility norland park            
 4 Sense & Sensibility surrounding acquaintance
 5 Sense & Sensibility late owner              
 6 Sense & Sensibility advanced age            
 7 Sense & Sensibility constant companion      
 8 Sense & Sensibility happened ten            
 9 Sense & Sensibility henry dashwood          
10 Sense & Sensibility norland estate          
# ℹ 38,903 more rows

Now we have common bigrams without stopwords.

n-gram tf-idf

bigram_tf_idf <- bigrams_united |>
  count(book, bigram) |>
  bind_tf_idf(bigram, book, n) |>
  arrange(desc(tf_idf))

bigram_tf_idf
# A tibble: 31,391 × 6
   book                bigram                n     tf   idf tf_idf
   <fct>               <chr>             <int>  <dbl> <dbl>  <dbl>
 1 Mansfield Park      sir thomas          266 0.0304  1.79 0.0545
 2 Persuasion          captain wentworth   143 0.0290  1.79 0.0519
 3 Mansfield Park      miss crawford       196 0.0224  1.79 0.0402
 4 Persuasion          lady russell        110 0.0223  1.79 0.0399
 5 Persuasion          sir walter          108 0.0219  1.79 0.0392
 6 Emma                miss woodhouse      143 0.0173  1.79 0.0309
 7 Northanger Abbey    miss tilney          74 0.0165  1.79 0.0295
 8 Sense & Sensibility colonel brandon      96 0.0155  1.79 0.0278
 9 Sense & Sensibility sir john             94 0.0152  1.79 0.0273
10 Pride & Prejudice   lady catherine       87 0.0139  1.79 0.0248
# ℹ 31,381 more rows

Plot them

out <- bigram_tf_idf |>
  arrange(desc(tf_idf)) |>
  group_by(book) |>
  slice_max(tf_idf, n = 10) |>
  ungroup() |>
  mutate(bigram = reorder(bigram, tf_idf)) |>
  ggplot(aes(tf_idf, bigram, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ book, nrow = 2, scales = "free") +
  labs(x = "tf-idf of bigram", y = NULL)
print(out)

Sentiment Analysis

ulysses <- "http://aleph.gutenberg.org/4/3/0/4300/4300-0.txt"
raw_text <- tibble(text = readr::read_lines(ulysses))

raw_text
# A tibble: 33,216 × 1
   text                                                                      
   <chr>                                                                     
 1 "The Project Gutenberg eBook of Ulysses, by James Joyce"                  
 2 ""                                                                        
 3 "This eBook is for the use of anyone anywhere in the United States and"   
 4 "most other parts of the world at no cost and with almost no restrictions"
 5 "whatsoever. You may copy it, give it away or re-use it under the terms"  
 6 "of the Project Gutenberg License included with this eBook or online at"  
 7 "www.gutenberg.org. If you are not located in the United States, you"     
 8 "will have to check the laws of the country where you are located before" 
 9 "using this eBook."                                                       
10 ""                                                                        
# ℹ 33,206 more rows

Sentiment Analysis

raw_text[74,]
# A tibble: 1 × 1
  text                                                                   
  <chr>                                                                  
1 Stately, plump Buck Mulligan came from the stairhead, bearing a bowl of
raw_text[nrow(raw_text) - 360,]
# A tibble: 1 × 1
  text                      
  <chr>                     
1 yes I said yes I will Yes.

Sentiment Analysis

full_text <- raw_text |> 
  mutate(line = row_number()) |>
  slice(-((n() - 359):n())) |> # drop the Project Gutenberg boilerplate at the end
  slice(-(1:73)) |>            # drop the front matter before the text begins
  unnest_tokens(word, text)

Sentiment Analysis

full_text[1:31,] |> 
  print(n = Inf)
# A tibble: 31 × 2
    line word        
   <int> <chr>       
 1    74 stately     
 2    74 plump       
 3    74 buck        
 4    74 mulligan    
 5    74 came        
 6    74 from        
 7    74 the         
 8    74 stairhead   
 9    74 bearing     
10    74 a           
11    74 bowl        
12    74 of          
13    75 lather      
14    75 on          
15    75 which       
16    75 a           
17    75 mirror      
18    75 and         
19    75 a           
20    75 razor       
21    75 lay         
22    75 crossed     
23    75 a           
24    75 yellow      
25    76 dressinggown
26    76 ungirdled   
27    76 was         
28    76 sustained   
29    76 gently      
30    76 behind      
31    76 him         

Sentiment Analysis

tail(full_text, n = 15)
# A tibble: 15 × 2
    line word 
   <int> <chr>
 1 32855 and  
 2 32855 his  
 3 32855 heart
 4 32855 was  
 5 32855 going
 6 32855 like 
 7 32855 mad  
 8 32855 and  
 9 32856 yes  
10 32856 i    
11 32856 said 
12 32856 yes  
13 32856 i    
14 32856 will 
15 32856 yes  

Sentiment Analysis

full_text |>
  anti_join(stop_words) |>
  filter(! str_detect(word, "'")) |> # drop tokens with straight apostrophes
  filter(! str_detect(word, "’")) |> # ... and with curly ones
  count(word, sort = TRUE) |>
  top_n(20) |>
  mutate(word = reorder(word, n))
# A tibble: 20 × 2
   word        n
   <fct>   <int>
 1 bloom     933
 2 stephen   503
 3 time      376
 4 eyes      329
 5 hand      304
 6 street    293
 7 father    277
 8 day       250
 9 round     239
10 night     232
11 head      222
12 sir       217
13 god       215
14 john      195
15 life      192
16 woman     186
17 poor      185
18 voice     177
19 dedalus   174
20 house     171

Sentiment Analysis

sent <- full_text |> 
  inner_join(get_sentiments("bing"), relationship = "many-to-many") |>
  count(sentiment, word, sort = TRUE) |>
  group_by(sentiment) |>
  top_n(10) |>
  ungroup() |>
  mutate(word=reorder(word,n)) 


sent
# A tibble: 20 × 3
   sentiment word       n
   <chr>     <fct>  <int>
 1 positive  bloom    933
 2 positive  like     731
 3 positive  good     321
 4 positive  well     274
 5 positive  right    238
 6 negative  poor     185
 7 positive  love     161
 8 negative  miss     134
 9 positive  better   132
10 negative  dark     127
11 positive  best     127
12 positive  great    127
13 negative  dead     123
14 negative  bloody   104
15 positive  wonder    97
16 negative  bad       90
17 negative  lost      87
18 negative  death     82
19 negative  slowly    71
20 negative  hard      64

Sentiment Analysis

out <- sent  |> 
  ggplot(mapping = aes(x = n, 
                       y = word, 
                       fill=sentiment)) +
  geom_col() +
  guides(fill = "none") + 
  facet_wrap(~ sentiment, 
             ncol = 1,
             scales = "free_y")
print(out)

Sentiment Analysis

out <- full_text |> 
  inner_join(get_sentiments("bing"), relationship = "many-to-many") |>
  count(index = line %/% 2000, sentiment) |> # bin the text into 2,000-line chunks
  pivot_wider(names_from = sentiment, 
              values_from = n, values_fill = 0) |> 
  mutate(sentiment = positive - negative) # net sentiment per chunk

out
# A tibble: 17 × 4
   index negative positive sentiment
   <dbl>    <int>    <int>     <int>
 1     0      454      314      -140
 2     1      605      546       -59
 3     2      496      475       -21
 4     3      373      422        49
 5     4      480      497        17
 6     5      357      381        24
 7     6      490      469       -21
 8     7      457      531        74
 9     8      524      745       221
10     9      772      778         6
11    10      543      522       -21
12    11      572      577         5
13    12      464      468         4
14    13      553      710       157
15    14      340      373        33
16    15      607      653        46
17    16      321      449       128

Sentiment Analysis

out |> 
  ggplot(mapping = aes(factor(index), sentiment)) +
  geom_col(show.legend = FALSE)

Pronouns

pronouns <- raw_text |>
  unnest_tokens(bigram, text, token = "ngrams", n = 2) |>
  separate(bigram, c("word1", "word2"), sep = " ") |>
  filter(word1 %in% c("he", "she", "they")) |> # keep bigrams beginning with a subject pronoun
  filter(!word2 %in% stop_words$word, !str_detect(word2, "'")) |>
  count(word1, word2, sort = TRUE)

pronouns
# A tibble: 1,411 × 3
   word1 word2      n
   <chr> <chr>  <int>
 1 he    looked    29
 2 he    walked    25
 3 he    heard     21
 4 they  call      21
 5 he    stood     20
 6 he    passed    18
 7 he    told      17
 8 he    held      15
 9 he    drew      13
10 he    read      12
# ℹ 1,401 more rows

Pronouns

out <- pronouns |> 
  group_by(word1) |> 
  top_n(15) |> 
  ggplot(mapping = aes(x=n, 
                       y=reorder(word2, n), 
                     fill=word1)) +
  geom_col() +
  guides(fill = "none") + 
  facet_wrap(~ word1, scales="free")
print(out)