Visualizing Social Data


Ulysses

Download James Joyce's Ulysses from Project Gutenberg, tokenize it, and then explore word counts, sentiment, and bigrams.

#install.packages("gutenbergr")
#install.packages("tidytext")
#install.packages("textdata")
library(tidyverse)
## ── Attaching packages ──────────────────────────────── tidyverse 1.3.1 ──

## ✔ ggplot2 3.3.5     ✔ purrr   0.3.4
## ✔ tibble  3.1.6     ✔ dplyr   1.0.8
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1

## ── Conflicts ─────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::edition_get()   masks testthat::edition_get()
## ✖ dplyr::filter()        masks stats::filter()
## ✖ purrr::is_null()       masks testthat::is_null()
## ✖ dplyr::lag()           masks stats::lag()
## ✖ readr::local_edition() masks testthat::local_edition()
## ✖ dplyr::matches()       masks tidyr::matches(), testthat::matches()
library(gutenbergr)
library(tidytext)

my_mirror <- "http://mirrors.xmission.com/gutenberg/"

## Get Ulysses
raw_text <- gutenberg_download(4300, mirror = my_mirror) 
full_text <- raw_text %>% 
  mutate(line = row_number()) %>%  # keep the original line number
  unnest_tokens(word, text)        # one row per word

full_text[30:51,] %>% 
  print(n = Inf)
## # A tibble: 22 × 3
##    gutenberg_id  line word     
##           <int> <int> <chr>    
##  1         4300    49 stately  
##  2         4300    49 plump    
##  3         4300    49 buck     
##  4         4300    49 mulligan 
##  5         4300    49 came     
##  6         4300    49 from     
##  7         4300    49 the      
##  8         4300    49 stairhead
##  9         4300    49 bearing  
## 10         4300    49 a        
## 11         4300    49 bowl     
## 12         4300    49 of       
## 13         4300    50 lather   
## 14         4300    50 on       
## 15         4300    50 which    
## 16         4300    50 a        
## 17         4300    50 mirror   
## 18         4300    50 and      
## 19         4300    50 a        
## 20         4300    50 razor    
## 21         4300    50 lay      
## 22         4300    50 crossed
tail(full_text, n = 20)
## # A tibble: 20 × 3
##    gutenberg_id  line word   
##           <int> <int> <chr>  
##  1         4300 32830 and    
##  2         4300 32830 his    
##  3         4300 32830 heart  
##  4         4300 32830 was    
##  5         4300 32830 going  
##  6         4300 32830 like   
##  7         4300 32830 mad    
##  8         4300 32830 and    
##  9         4300 32831 yes    
## 10         4300 32831 i      
## 11         4300 32831 said   
## 12         4300 32831 yes    
## 13         4300 32831 i      
## 14         4300 32831 will   
## 15         4300 32831 yes    
## 16         4300 32833 trieste
## 17         4300 32833 zurich 
## 18         4300 32833 paris  
## 19         4300 32835 1914   
## 20         4300 32835 1921

Word counts

Count every word, then count again with stopwords dropped.

tidy_book_count <- full_text %>% 
  count(word, sort=TRUE)

tidy_book_count_stop <- full_text %>% 
  anti_join(stop_words) %>% 
  count(word, sort=TRUE)
## Joining, by = "word"

Remove stopwords

Drop the stopwords along with any token containing a straight or curly apostrophe, then take the twenty most frequent words that remain.

full_text %>%
  anti_join(stop_words) %>%
  filter(! str_detect(word, "'")) %>%
  filter(! str_detect(word, "’")) %>%
  count(word, sort = TRUE) %>%
  top_n(20) %>%
  mutate(word=reorder(word, n))
## Joining, by = "word"
## Selecting by n

## # A tibble: 20 × 2
##    word        n
##    <fct>   <int>
##  1 bloom     933
##  2 stephen   503
##  3 time      376
##  4 eyes      329
##  5 hand      304
##  6 street    293
##  7 father    277
##  8 day       250
##  9 round     239
## 10 night     232
## 11 head      222
## 12 sir       217
## 13 god       215
## 14 john      195
## 15 life      192
## 16 woman     186
## 17 poor      185
## 18 voice     177
## 19 dedalus   174
## 20 house     171
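
The same top twenty are easier to scan as a picture than as a printed table. A minimal sketch, using the tidy_book_count_stop object created above:

tidy_book_count_stop %>% 
  slice_max(n, n = 20) %>%               # top twenty non-stopword tokens
  mutate(word = reorder(word, n)) %>% 
  ggplot(mapping = aes(x = n, y = word)) +
  geom_col() +
  labs(x = "Count", y = NULL)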

Elementary Sentiment Analysis

Join the tokens to the Bing lexicon, which tags words as positive or negative, and plot the top contributors to each sentiment.

full_text %>% 
  inner_join(get_sentiments("bing")) %>%
  count(sentiment, word, sort = TRUE) %>%
  group_by(sentiment) %>%
  top_n(20) %>%
  ungroup() %>%
  mutate(word=reorder(word,n)) %>% 
  ggplot(mapping = aes(x = n, 
                       y = word, 
                       fill=sentiment))+
  geom_col() +
  guides(fill = "none") + 
  facet_wrap(~ sentiment, 
             scales = "free")
## Joining, by = "word"
## Selecting by n

Use a different sentiment dictionary

The NRC lexicon classifies words into emotion categories (anger, anticipation, disgust, fear, joy, sadness, surprise, trust) as well as positive and negative.

full_text %>% 
  inner_join(get_sentiments("nrc")) %>%
  count(sentiment, word, sort = TRUE) %>%
  group_by(sentiment) %>%
  top_n(20) %>%
  ungroup() %>%
  mutate(word=reorder(word,n)) %>% 
  ggplot(mapping = aes(x = n, 
                       y = word, 
                       fill=sentiment))+
  geom_col() +
  guides(fill = "none") + 
  facet_wrap(~ sentiment, 
             scales = "free", 
             ncol = 3)
## Joining, by = "word"
## Selecting by n
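
A third option is the AFINN lexicon, which scores each word on an integer scale from -5 to 5 rather than assigning categories. This is what the textdata package installed at the top is for; get_sentiments("afinn") will offer to download the lexicon the first time it is called. A sketch of the words contributing most in either direction, weighting frequency by score:

full_text %>% 
  inner_join(get_sentiments("afinn")) %>%
  count(word, value, sort = TRUE) %>%
  mutate(contribution = n * value) %>%        # frequency times AFINN score
  slice_max(abs(contribution), n = 20) %>% 
  mutate(word = reorder(word, contribution)) %>% 
  ggplot(mapping = aes(x = contribution, 
                       y = word, 
                       fill = contribution > 0)) +
  geom_col() +
  guides(fill = "none") + 
  labs(x = "Frequency × AFINN score", y = NULL)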

Bigrams

Tokenize into two-word sequences instead of single words, keeping only bigrams where neither word is a stopword.

tidy_ngram <- raw_text %>% 
  unnest_tokens(bigram, text, 
                token = "ngrams", n = 2) %>% 
  separate(bigram, c("word1", "word2"), sep=" ") %>%
  filter(!word1 %in% stop_words$word, !word2 %in% stop_words$word) %>%
  count(word1, word2, sort = TRUE)
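
tidy_ngram now has one row per distinct bigram. A quick sketch of the most frequent ones, using tidyr's unite() to glue the word pairs back together for labeling:

tidy_ngram %>% 
  slice_max(n, n = 20) %>% 
  unite(bigram, word1, word2, sep = " ") %>%  # rejoin the pair for the axis label
  mutate(bigram = reorder(bigram, n)) %>% 
  ggplot(mapping = aes(x = n, y = bigram)) +
  geom_col() +
  labs(x = "Count", y = NULL)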


he_she <- raw_text %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep=" ") %>%
  filter(word1 %in% c("he", "she", "they")) %>%
  filter(!word2 %in% stop_words$word, !str_detect(word2, "'")) %>%
  count(word1, word2, sort=TRUE)

he_she %>% 
  group_by(word1) %>% 
  top_n(20) %>% 
  ggplot(mapping = aes(x=n, y=word2, 
                     fill=word1)) +
  geom_col() +
  guides(fill = "none") + 
  facet_wrap(~ word1, scales="free")
## Selecting by n
bloom_stephen <- raw_text %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep=" ") %>%
  filter(word1 %in% c("bloom", "stephen", "mulligan", "molly")) %>%
  filter(!word2 %in% stop_words$word, !str_detect(word2, "'")) %>%
  count(word1, word2, sort=TRUE)

bloom_stephen %>% 
  group_by(word1) %>% 
  top_n(10) %>% 
  ggplot(mapping = aes(x=n, y=word2, 
                     fill=word1)) +
  geom_col() +
  guides(fill = "none") + 
  facet_wrap(~ word1, scales="free")
## Selecting by n
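
The faceted panels make it hard to compare "he" and "she" directly. One alternative, sketched here from the he_she object above, is a single log ratio per word; the +1 terms keep words that follow only one of the two pronouns from producing a division by zero:

he_she %>% 
  filter(word1 %in% c("he", "she")) %>% 
  pivot_wider(names_from = word1, 
              values_from = n, values_fill = 0) %>% 
  mutate(log_ratio = log2((she + 1) / (he + 1))) %>%  # > 0 means more "she"
  slice_max(abs(log_ratio), n = 20) %>% 
  mutate(word2 = reorder(word2, log_ratio)) %>% 
  ggplot(mapping = aes(x = log_ratio, y = word2, 
                       fill = log_ratio > 0)) +
  geom_col() +
  guides(fill = "none") + 
  labs(x = "log2 (she / he)", y = NULL)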

Sequence of words

Track net Bing sentiment across the novel by cutting it into chunks of 2,000 lines (line %/% 2000 gives the chunk index) and plotting positive minus negative counts per chunk.

full_text %>% 
  inner_join(get_sentiments("bing")) %>%
  count(index = line %/% 2000, sentiment) %>%
  pivot_wider(names_from = sentiment, 
              values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative) %>% 
  ggplot(mapping = aes(index, sentiment)) +
  geom_col(show.legend = FALSE) 
## Joining, by = "word"
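
The 2,000-line chunk size is arbitrary; smaller chunks trade smoothness for resolution. A sketch with finer chunks and a loess smoother overlaid to show the trend through the noise:

full_text %>% 
  inner_join(get_sentiments("bing")) %>%
  count(index = line %/% 500, sentiment) %>%    # finer, noisier chunks
  pivot_wider(names_from = sentiment, 
              values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative) %>% 
  ggplot(mapping = aes(index, sentiment)) +
  geom_col() +
  geom_smooth(se = FALSE)                       # loess trend line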