# Chapter 7: Text classification
#
# Source: https://juliasilge.com/blog/tidy-text-classification/

library(tidymodels)
library(gutenbergr)

# Titles of the two Project Gutenberg works used in the steps below.
titles <- c("The War of the Worlds", "Pride and Prejudice")

# Look up the metadata for the requested titles, download their text with the
# title carried along as a column, and tag every downloaded row with a
# running `document` id.
books <- gutenberg_download(
  gutenberg_works(title %in% titles),
  meta_fields = "title"
) %>%
  mutate(document = row_number())

# Build a one-row-per-word table from `books`:
#   1. split the `text` column into individual `word` tokens,
#   2. drop every token that appears in `stop_words`,
#   3. keep only words that occur more than 10 times across all rows.
#
# NOTE(review): unnest_tokens() and stop_words are presumably provided by
# tidytext, which is not attached in the library() calls visible in this
# file -- confirm it is loaded before this point.
tidy_books <- books %>%
  unnest_tokens(word, text) %>%
  # Name the join key explicitly instead of relying on a natural join: this
  # avoids the "Joining with `by = ...`" message and keeps the join stable
  # if either table ever gains another column with a shared name.
  anti_join(stop_words, by = "word") %>%
  group_by(word) %>%
  filter(n() > 10) %>%
  ungroup()

# Bar chart of the 20 most frequent remaining words in each title.
#
# NOTE(review): facet_bar() is not defined in the part of the file visible
# here and its source package is not attached above -- confirm where it comes
# from, and which axis ends up showing the counts, so that the "Word count"
# label in labs() sits on the count axis.
tidy_books %>%
  count(title, word, sort = TRUE) %>%
  group_by(title) %>%
  # slice_max() replaces the superseded top_n(); the ranking column `n` is
  # now named explicitly rather than guessed from the last column. Ties are
  # kept, as they were with top_n().
  slice_max(n, n = 20) %>%
  # Drop the grouping so the plotting helper receives a plain tibble.
  ungroup() %>%
  facet_bar(y = word, x = n, by = title) +
  labs(
    x = NULL, y = "Word count",
    title = "Most frequent words after removing stop words",
    subtitle = "Words like 'said' occupy similar ranks but other words are quite different"
  )