# Chapter 7 Text classification
# https://juliasilge.com/blog/tidy-text-classification/
library(tidymodels)
library(gutenbergr)
# Titles of the two books to compare.
# The assignment must wrap the whole c() call; assigning inside it would bind
# only the first string to `titles` and drop the second.
titles <- c(
  "The War of the Worlds",
  "Pride and Prejudice"
)
# Look up the selected titles in the Project Gutenberg metadata, download
# their text, and number every row.
# `meta_fields = "title"` keeps the book title alongside each row so the two
# books can be told apart later; `document` is a running row id.
books <- gutenberg_works() %>%
  filter(title %in% titles) %>%
  gutenberg_download(meta_fields = "title") %>%
  mutate(document = row_number())
# Split the text into one word per row, drop stop words, and keep only words
# that occur more than 10 times in total (counted across both books, because
# the grouping is by word alone).
# NOTE(review): unnest_tokens() and stop_words are not provided by the
# library() calls visible in this part of the file — presumably tidytext is
# loaded elsewhere; confirm.
tidy_books <- books %>%
  unnest_tokens(word, text) %>%
  # Explicit key: avoids the "Joining with `by = ...`" message and guards
  # against accidentally joining on any other shared column.
  anti_join(stop_words, by = "word") %>%
  group_by(word) %>%
  filter(n() > 10) %>%
  ungroup()
# Plot the 20 most frequent remaining words for each title.
# NOTE(review): facet_bar() is neither defined nor loaded in the visible part
# of this file — presumably a plotting helper that returns a ggplot object
# (its result is combined with labs() via `+`); confirm where it comes from.
# NOTE(review): words are passed as `y` and counts as `x`, yet the label
# "Word count" is attached to y and x is left unlabeled — verify the axis
# labels match what facet_bar() actually draws.
tidy_books %>%
  count(title, word, sort = TRUE) %>%
  group_by(title) %>%
  # Name the ranking column explicitly instead of relying on top_n()'s
  # "use the last column" default (and its "Selecting by n" message).
  top_n(20, wt = n) %>%
  # Drop the grouping once the per-title selection is done.
  ungroup() %>%
  facet_bar(y = word, x = n, by = title) +
  labs(
    x = NULL, y = "Word count",
    title = "Most frequent words after removing stop words",
    subtitle = "Words like 'said' occupy similar ranks but other words are quite different"
  )