library(here) # manage file paths
library(socviz) # data and some useful functions
library(tidyverse) # your friend and mine
March 27, 2024
original_books <- austen_books() |>
group_by(book) |>
mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text,
regex("^chapter [\\divxlc]",
ignore_case = TRUE)))) |>
ungroup()
tidy_books <- original_books |>
unnest_tokens(word, text)
tidy_books
# A tibble: 725,055 × 4
book linenumber chapter word
<fct> <int> <int> <chr>
1 Sense & Sensibility 1 0 sense
2 Sense & Sensibility 1 0 and
3 Sense & Sensibility 1 0 sensibility
4 Sense & Sensibility 3 0 by
5 Sense & Sensibility 3 0 jane
6 Sense & Sensibility 3 0 austen
7 Sense & Sensibility 5 0 1811
8 Sense & Sensibility 10 1 chapter
9 Sense & Sensibility 10 1 1
10 Sense & Sensibility 13 1 the
# ℹ 725,045 more rows
tf-idf
Stands for “Term Frequency–Inverse Document Frequency”
The idea is to count the frequency of terms in a document, but decrease the weight of commonly used words and increase the weight for words that are not used very much in a corpus.
book_words <- austen_books() |>
unnest_tokens(word, text) |>
count(book, word, sort = TRUE)
total_words <- book_words |>
group_by(book) |>
summarize(total = sum(n))
book_words <- left_join(book_words, total_words)
book_words
# A tibble: 40,379 × 4
book word n total
<fct> <chr> <int> <int>
1 Mansfield Park the 6206 160460
2 Mansfield Park to 5475 160460
3 Mansfield Park and 5438 160460
4 Emma to 5239 160996
5 Emma the 5201 160996
6 Emma and 4896 160996
7 Mansfield Park of 4778 160460
8 Pride & Prejudice the 4331 122204
9 Emma of 4291 160996
10 Pride & Prejudice to 4162 122204
# ℹ 40,369 more rows
“The frequency that a word appears is inversely proportional to its rank.”
freq_by_rank <- book_words |>
group_by(book) |>
mutate(rank = row_number(),
`term frequency` = n/total) |>
ungroup()
freq_by_rank
# A tibble: 40,379 × 6
book word n total rank `term frequency`
<fct> <chr> <int> <int> <int> <dbl>
1 Mansfield Park the 6206 160460 1 0.0387
2 Mansfield Park to 5475 160460 2 0.0341
3 Mansfield Park and 5438 160460 3 0.0339
4 Emma to 5239 160996 1 0.0325
5 Emma the 5201 160996 2 0.0323
6 Emma and 4896 160996 3 0.0304
7 Mansfield Park of 4778 160460 4 0.0298
8 Pride & Prejudice the 4331 122204 1 0.0354
9 Emma of 4291 160996 4 0.0267
10 Pride & Prejudice to 4162 122204 2 0.0341
# ℹ 40,369 more rows
austen_bigrams <- austen_books() |>
unnest_tokens(bigram, text, token = "ngrams", n = 2) |>
filter(!is.na(bigram))
austen_bigrams
# A tibble: 662,783 × 2
book bigram
<fct> <chr>
1 Sense & Sensibility sense and
2 Sense & Sensibility and sensibility
3 Sense & Sensibility by jane
4 Sense & Sensibility jane austen
5 Sense & Sensibility chapter 1
6 Sense & Sensibility the family
7 Sense & Sensibility family of
8 Sense & Sensibility of dashwood
9 Sense & Sensibility dashwood had
10 Sense & Sensibility had long
# ℹ 662,773 more rows
# A tibble: 193,209 × 2
bigram n
<chr> <int>
1 of the 2853
2 to be 2670
3 in the 2221
4 it was 1691
5 i am 1485
6 she had 1405
7 of her 1363
8 to the 1315
9 she was 1309
10 had been 1206
# ℹ 193,199 more rows
Stopwords again.
Split the columns
bigrams_separated <- austen_bigrams |>
separate(bigram, c("word1", "word2"), sep = " ")
bigrams_filtered <- bigrams_separated |>
filter(!word1 %in% stop_words$word) |>
filter(!word2 %in% stop_words$word)
# new bigram counts:
bigram_counts <- bigrams_filtered |>
count(word1, word2, sort = TRUE)
bigram_counts
# A tibble: 28,974 × 3
word1 word2 n
<chr> <chr> <int>
1 sir thomas 266
2 miss crawford 196
3 captain wentworth 143
4 miss woodhouse 143
5 frank churchill 114
6 lady russell 110
7 sir walter 108
8 lady bertram 101
9 miss fairfax 98
10 colonel brandon 96
# ℹ 28,964 more rows
# A tibble: 38,913 × 2
book bigram
<fct> <chr>
1 Sense & Sensibility jane austen
2 Sense & Sensibility chapter 1
3 Sense & Sensibility norland park
4 Sense & Sensibility surrounding acquaintance
5 Sense & Sensibility late owner
6 Sense & Sensibility advanced age
7 Sense & Sensibility constant companion
8 Sense & Sensibility happened ten
9 Sense & Sensibility henry dashwood
10 Sense & Sensibility norland estate
# ℹ 38,903 more rows
Now we have common bigrams without stopwords.
bigram_tf_idf <- bigrams_united |>
count(book, bigram) |>
bind_tf_idf(bigram, book, n) |>
arrange(desc(tf_idf))
bigram_tf_idf
# A tibble: 31,391 × 6
book bigram n tf idf tf_idf
<fct> <chr> <int> <dbl> <dbl> <dbl>
1 Mansfield Park sir thomas 266 0.0304 1.79 0.0545
2 Persuasion captain wentworth 143 0.0290 1.79 0.0519
3 Mansfield Park miss crawford 196 0.0224 1.79 0.0402
4 Persuasion lady russell 110 0.0223 1.79 0.0399
5 Persuasion sir walter 108 0.0219 1.79 0.0392
6 Emma miss woodhouse 143 0.0173 1.79 0.0309
7 Northanger Abbey miss tilney 74 0.0165 1.79 0.0295
8 Sense & Sensibility colonel brandon 96 0.0155 1.79 0.0278
9 Sense & Sensibility sir john 94 0.0152 1.79 0.0273
10 Pride & Prejudice lady catherine 87 0.0139 1.79 0.0248
# ℹ 31,381 more rows
out <- bigram_tf_idf |>
arrange(desc(tf_idf)) |>
group_by(book) |>
slice_max(tf_idf, n = 10) |>
ungroup() |>
mutate(bigram = reorder(bigram, tf_idf)) |>
ggplot(aes(tf_idf, bigram, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ book, nrow = 2, scales = "free") +
labs(x = "tf-idf of bigram", y = NULL)
ulysses <- "http://aleph.gutenberg.org/4/3/0/4300/4300-0.txt"
raw_text <- tibble(text = readr::read_lines(ulysses))
raw_text
# A tibble: 33,216 × 1
text
<chr>
1 "The Project Gutenberg eBook of Ulysses, by James Joyce"
2 ""
3 "This eBook is for the use of anyone anywhere in the United States and"
4 "most other parts of the world at no cost and with almost no restrictions"
5 "whatsoever. You may copy it, give it away or re-use it under the terms"
6 "of the Project Gutenberg License included with this eBook or online at"
7 "www.gutenberg.org. If you are not located in the United States, you"
8 "will have to check the laws of the country where you are located before"
9 "using this eBook."
10 ""
# ℹ 33,206 more rows
# A tibble: 31 × 2
line word
<int> <chr>
1 74 stately
2 74 plump
3 74 buck
4 74 mulligan
5 74 came
6 74 from
7 74 the
8 74 stairhead
9 74 bearing
10 74 a
11 74 bowl
12 74 of
13 75 lather
14 75 on
15 75 which
16 75 a
17 75 mirror
18 75 and
19 75 a
20 75 razor
21 75 lay
22 75 crossed
23 75 a
24 75 yellow
25 76 dressinggown
26 76 ungirdled
27 76 was
28 76 sustained
29 76 gently
30 76 behind
31 76 him
full_text |>
anti_join(stop_words) |>
filter(! str_detect(word, "'")) |>
filter(! str_detect(word, "’")) |>
count(word, sort = TRUE) |>
top_n(20) |>
mutate(word=reorder(word, n))
# A tibble: 20 × 2
word n
<fct> <int>
1 bloom 933
2 stephen 503
3 time 376
4 eyes 329
5 hand 304
6 street 293
7 father 277
8 day 250
9 round 239
10 night 232
11 head 222
12 sir 217
13 god 215
14 john 195
15 life 192
16 woman 186
17 poor 185
18 voice 177
19 dedalus 174
20 house 171
sent <- full_text |>
inner_join(get_sentiments("bing"), relationship = "many-to-many") |>
count(sentiment, word, sort = TRUE) |>
group_by(sentiment) |>
top_n(10) |>
ungroup() |>
mutate(word=reorder(word,n))
sent
# A tibble: 20 × 3
sentiment word n
<chr> <fct> <int>
1 positive bloom 933
2 positive like 731
3 positive good 321
4 positive well 274
5 positive right 238
6 negative poor 185
7 positive love 161
8 negative miss 134
9 positive better 132
10 negative dark 127
11 positive best 127
12 positive great 127
13 negative dead 123
14 negative bloody 104
15 positive wonder 97
16 negative bad 90
17 negative lost 87
18 negative death 82
19 negative slowly 71
20 negative hard 64
out <- full_text |>
inner_join(get_sentiments("bing"), relationship = "many-to-many") |>
count(index = line %/% 2000, sentiment) |>
pivot_wider(names_from = sentiment,
values_from = n, values_fill = 0) |>
mutate(sentiment = positive - negative)
out
# A tibble: 17 × 4
index negative positive sentiment
<dbl> <int> <int> <int>
1 0 454 314 -140
2 1 605 546 -59
3 2 496 475 -21
4 3 373 422 49
5 4 480 497 17
6 5 357 381 24
7 6 490 469 -21
8 7 457 531 74
9 8 524 745 221
10 9 772 778 6
11 10 543 522 -21
12 11 572 577 5
13 12 464 468 4
14 13 553 710 157
15 14 340 373 33
16 15 607 653 46
17 16 321 449 128
pronouns <- raw_text |>
unnest_tokens(bigram, text, token = "ngrams", n = 2) |>
separate(bigram, c("word1", "word2"), sep=" ") |>
filter(word1 %in% c("he", "she", "they")) |>
filter(!word2 %in% stop_words$word, !str_detect(word2, "'")) |>
count(word1, word2, sort=TRUE)
pronouns
# A tibble: 1,411 × 3
word1 word2 n
<chr> <chr> <int>
1 he looked 29
2 he walked 25
3 he heard 21
4 they call 21
5 he stood 20
6 he passed 18
7 he told 17
8 he held 15
9 he drew 13
10 he read 12
# ℹ 1,401 more rows