# 1.1 The unnest_tokens() function ----

library(dplyr)
library(janeaustenr)
library(stringr)
library(tidytext)
# Show the raw corpus: a two-column tibble with one row per line of `text`,
# tagged with the `book` it belongs to (see the printed output below).
austen_books()
#> # A tibble: 73,422 x 2
#>   text                    book               
#> * <chr>                   <fct>              
#> 1 "SENSE AND SENSIBILITY" Sense & Sensibility
#> 2 ""                      Sense & Sensibility
#> 3 "by Jane Austen"        Sense & Sensibility
#> 4 ""                      Sense & Sensibility
#> 5 "(1811)"                Sense & Sensibility
#> 6 ""                      Sense & Sensibility
#> # ... with 7.342e+04 more rows
# Annotate every line of the corpus, separately within each book, with:
#   * linenumber - the line's position inside its book
#   * chapter    - a running count of the chapter headings seen so far
#                  (0 for front matter that precedes the first heading)
# A heading is a line starting with "chapter " followed by a digit or one of
# the roman-numeral letters i, v, x, l, c (matched case-insensitively).
original_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = text %>%
      str_detect(regex("^chapter [\\divxlc]", ignore_case = TRUE)) %>%
      cumsum()
  ) %>%
  ungroup()

original_books
#> # A tibble: 73,422 x 4
#>   text                    book                linenumber chapter
#>   <chr>                   <fct>                    <int>   <int>
#> 1 "SENSE AND SENSIBILITY" Sense & Sensibility          1       0
#> 2 ""                      Sense & Sensibility          2       0
#> 3 "by Jane Austen"        Sense & Sensibility          3       0
#> 4 ""                      Sense & Sensibility          4       0
#> 5 "(1811)"                Sense & Sensibility          5       0
#> 6 ""                      Sense & Sensibility          6       0
#> # ... with 7.342e+04 more rows
# Tokenize the `text` column into a new `word` column: one row per word, with
# the words lower-cased and the remaining columns carried along (see output).
unnest_tokens(tbl = original_books, output = word, input = text)
#> # A tibble: 725,055 x 4
#>   book                linenumber chapter word       
#>   <fct>                    <int>   <int> <chr>      
#> 1 Sense & Sensibility          1       0 sense      
#> 2 Sense & Sensibility          1       0 and        
#> 3 Sense & Sensibility          1       0 sensibility
#> 4 Sense & Sensibility          3       0 by         
#> 5 Sense & Sensibility          3       0 jane       
#> 6 Sense & Sensibility          3       0 austen     
#> # ... with 7.25e+05 more rows
# Inspect the stop-word table: one row per `word`, tagged with the `lexicon`
# it is listed under (see the printed output below).
stop_words 
#> # A tibble: 1,149 x 2
#>   word      lexicon
#>   <chr>     <chr>  
#> 1 a         SMART  
#> 2 a's       SMART  
#> 3 able      SMART  
#> 4 about     SMART  
#> 5 above     SMART  
#> 6 according SMART  
#> # ... with 1,143 more rows

# Build the tidy corpus: tokenize every line into one word per row, then drop
# each row whose word appears in the stop-word table.
tidy_books <- original_books %>%
  unnest_tokens(word, text) %>%
  # Name the join key explicitly. `word` is the only column the two tables
  # share, so the result is unchanged, but the join no longer depends on an
  # inferred key (nor prints a "Joining, by ..." message about it).
  anti_join(stop_words, by = "word")

tidy_books
#> # A tibble: 217,609 x 4
#>   book                linenumber chapter word       
#>   <fct>                    <int>   <int> <chr>      
#> 1 Sense & Sensibility          1       0 sense      
#> 2 Sense & Sensibility          1       0 sensibility
#> 3 Sense & Sensibility          3       0 jane       
#> 4 Sense & Sensibility          3       0 austen     
#> 5 Sense & Sensibility          5       0 1811       
#> 6 Sense & Sensibility         10       1 chapter    
#> # ... with 2.176e+05 more rows