# Preview the raw corpus: Jane Austen's six novels as a tibble with one row
# per line of text and a `book` factor identifying the novel.
austen_books()
#> # A tibble: 73,422 x 2
#>    text                    book
#>  * <chr>                   <fct>
#>  1 "SENSE AND SENSIBILITY" Sense & Sensibility
#>  2 ""                      Sense & Sensibility
#>  3 "by Jane Austen"        Sense & Sensibility
#>  4 ""                      Sense & Sensibility
#>  5 "(1811)"                Sense & Sensibility
#>  6 ""                      Sense & Sensibility
#> # ... with 7.342e+04 more rows
# Annotate every line of text with its position within its book:
#   linenumber — row number within the book (restarts per book via group_by)
#   chapter    — running count of chapter headings seen so far; lines before
#                the first heading (title page, etc.) get chapter 0.
original_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    # A heading is a line starting with "chapter" followed by an arabic
    # digit or a roman numeral character (i, v, x, l, c), case-insensitive.
    chapter = cumsum(str_detect(
      text,
      regex("^chapter [\\divxlc]", ignore_case = TRUE)
    ))
  ) %>%
  ungroup()

original_books
#> # A tibble: 73,422 x 4
#>    text                    book                linenumber chapter
#>    <chr>                   <fct>                    <int>   <int>
#>  1 "SENSE AND SENSIBILITY" Sense & Sensibility          1       0
#>  2 ""                      Sense & Sensibility          2       0
#>  3 "by Jane Austen"        Sense & Sensibility          3       0
#>  4 ""                      Sense & Sensibility          4       0
#>  5 "(1811)"                Sense & Sensibility          5       0
#>  6 ""                      Sense & Sensibility          6       0
#> # ... with 7.342e+04 more rows
# Tokenize into the one-token-per-row tidy format: each row of `text` is
# split into lowercase word tokens, carrying along book/linenumber/chapter.
original_books %>%
  unnest_tokens(word, text)
#> # A tibble: 725,055 x 4
#>    book                linenumber chapter word
#>    <fct>                    <int>   <int> <chr>
#>  1 Sense & Sensibility          1       0 sense
#>  2 Sense & Sensibility          1       0 and
#>  3 Sense & Sensibility          1       0 sensibility
#>  4 Sense & Sensibility          3       0 by
#>  5 Sense & Sensibility          3       0 jane
#>  6 Sense & Sensibility          3       0 austen
#> # ... with 7.25e+05 more rows
# tidytext's bundled stop-word table (word + source lexicon).
stop_words
#> # A tibble: 1,149 x 2
#>    word      lexicon
#>    <chr>     <chr>
#>  1 a         SMART
#>  2 a's       SMART
#>  3 able      SMART
#>  4 about     SMART
#>  5 above     SMART
#>  6 according SMART
#> # ... with 1,143 more rows

# Build the analysis table: tokenize, then drop stop words with an
# anti-join (keeps rows of the tokens table with no match in stop_words).
# NOTE: in the collapsed source this assignment had been fused into the
# comment above and was never executed — restored here as real code.
# `by = "word"` makes the join key explicit instead of relying on the
# natural-join message.
tidy_books <- original_books %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")

tidy_books
#> # A tibble: 217,609 x 4
#>    book                linenumber chapter word
#>    <fct>                    <int>   <int> <chr>
#>  1 Sense & Sensibility          1       0 sense
#>  2 Sense & Sensibility          1       0 sensibility
#>  3 Sense & Sensibility          3       0 jane
#>  4 Sense & Sensibility          3       0 austen
#>  5 Sense & Sensibility          5       0 1811
#>  6 Sense & Sensibility         10       1 chapter
#> # ... with 2.176e+05 more rows