In Class Exercise 5

Author

Guan Jhen Lin

Published

May 11, 2024

Modified

May 15, 2024

# Load required packages: tidyverse (dplyr/tidyr/stringr), readtext for
# reading raw text files, quanteda for corpus objects, tidytext for
# tidy-format tokenisation.
pacman::p_load(tidyverse, readtext,
               quanteda, tidytext)

# Folder containing the article text files.
data_folder <- "data/articles"

# Read every file in the folder into a readtext object (doc_id + text).
# Fix: the folder path now comes from `data_folder` instead of being
# hard-coded a second time. Equivalent alternatives:
#   readtext(paste0(data_folder, "/*"))  or  readtext("data/articles/*")
text_data <- readtext(file.path(data_folder, "*"))

# Build a quanteda corpus and preview summary stats for the first
# 5 documents (types, tokens, sentences per document).
corpus_text <- corpus(text_data)
summary(corpus_text, 5)
Corpus consisting of 338 documents, showing 5 documents:

                                   Text Types Tokens Sentences
 Alvarez PLC__0__0__Haacklee Herald.txt   206    433        18
    Alvarez PLC__0__0__Lomark Daily.txt   102    170        12
   Alvarez PLC__0__0__The News Buoy.txt    90    200         9
 Alvarez PLC__0__1__Haacklee Herald.txt    96    187         8
    Alvarez PLC__0__1__Lomark Daily.txt   241    504        21
# Tokenise each article into one word per row, then keep only tokens that
# end in a lowercase letter or apostrophe (drops bare numbers/symbols) and
# are not in the tidytext stop-word lexicon.
usenet_words <- text_data %>%
  unnest_tokens(word, text) %>%
  filter(str_detect(word, "[a-z']$")) %>%
  filter(!word %in% stop_words$word)

# Word frequencies across the whole collection, most common first.
usenet_words %>%
  count(word, sort = TRUE)
readtext object consisting of 3260 documents and 0 docvars.
# A data frame: 3,260 × 3
  word             n text     
  <chr>        <int> <chr>    
1 fishing       2177 "\"\"..."
2 sustainable   1525 "\"\"..."
3 company       1036 "\"\"..."
4 practices      838 "\"\"..."
5 industry       715 "\"\"..."
6 transactions   696 "\"\"..."
# ℹ 3,254 more rows

You can use the stringr package to slice your text data. Alternatively, tidyr — which is mainly for reshaping and tidying data — provides `separate_wider_delim()` to split a single column into several columns on a delimiter.

# Split each doc_id into two pieces around the literal "__0__" marker,
# right-aligning the pieces when fewer than two are produced.
# NOTE(review): filenames look like "<company>__<i>__<j>__<paper>.txt";
# splitting on the literal "__0__" assumes one of the middle fields is
# always 0 — confirm against all 338 filenames, otherwise split on "__"
# with more target columns instead.
text_data_splitted <- text_data %>%
  separate_wider_delim(
    "doc_id",
    delim = "__0__",
    names = c("X", "Y"),
    too_few = "align_end"
  )