# Packages this script depends on (each listed once).
required_packages <- c("stringi", "WikipediR", "tm", "XML", "httr", "openssl")

# Install only what is not already available, so that re-running the script
# does not download and reinstall every package each time.
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(
    missing_packages,
    repos = "https://cloud.r-project.org/"
    # , lib = "/usr/local/lib/R/site-library"
  )
}
library("XML")
library("WikipediR")
library("tm")
library("stringi")
# Titles of the English Wikipedia pages to download.
# NOTE(review): some titles (e.g. "Lev_Tolstoj") may not be the canonical
# English page names -- confirm they resolve to the intended articles.
titles <- c(
  "Esino_Lario",
  "Riemann_integral",
  "Riemann-Stieltjes_integral",
  "Derivative",
  "Limit_of_a_sequence",
  "Edvard_Munch",
  "Vincent_van_Gogh",
  "Jan_Matejko",
  "Lev_Tolstoj",
  "Franz_Kafka",
  "J._R._R._Tolkien"
)

# Download each page and keep only its rendered HTML as a single string.
# page_content() returns the whole API response as a nested list; the page
# body lives in parse$text[["*"]]. Assigning the raw response into a
# character vector would silently turn `articles` into a list of lists.
# vapply() guarantees that `articles` is a character vector with exactly one
# string per title and fails loudly if a response has no text.
articles <- vapply(
  titles,
  function(title) {
    response <- page_content("en", "wikipedia", page_name = title)
    response$parse$text[["*"]]
  },
  character(1),
  USE.NAMES = FALSE
)
# Build a corpus with one document per downloaded article.
docs <- Corpus(VectorSource(articles))
length(docs)

# Custom text transformations must be wrapped in content_transformer() so
# that tm_map() keeps the corpus/document structure intact. This also makes
# the former `tm_map(corpus, PlainTextDocument)` repair steps unnecessary
# (on a SimpleCorpus those steps would replace the documents altogether).
strip_html_tags <- content_transformer(
  function(x) stri_replace_all_regex(x, "<.+?>", " ")
)
tabs_to_spaces <- content_transformer(
  function(x) stri_replace_all_fixed(x, "\t", " ")
)
drop_single_chars <- content_transformer(
  function(x) stri_replace_all_regex(x, " . ", " ")
)

corpus <- tm_map(docs, strip_html_tags)
corpus <- tm_map(corpus, tabs_to_spaces)
corpus <- tm_map(corpus, stripWhitespace)
# Remove isolated single characters left over between spaces.
corpus <- tm_map(corpus, drop_single_chars)
# Lower-case BEFORE removing stop words: stopwords("english") is all lower
# case, so capitalised occurrences ("The", "And") would otherwise survive.
corpus <- tm_map(corpus, content_transformer(tolower))
# Remove stop words before punctuation so contractions such as "don't"
# still match their stop-word entries.
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removePunctuation)
# NOTE(review): stemDocument() is believed to need the SnowballC package,
# which is not in this script's install list -- confirm it is installed.
corpus <- tm_map(corpus, stemDocument)
# Document-term matrix of the cleaned corpus: rows are documents, columns
# are terms, so column sums give the total frequency of each term.
dtm <- DocumentTermMatrix(corpus)

# Term-document matrix of the raw, uncleaned documents, printed for
# inspection.
# NOTE(review): this uses `docs`, not the cleaned `corpus` -- confirm that
# comparing against the unprocessed text is intended.
tdm <- TermDocumentMatrix(docs)
tdm

# Overall frequency of every term across all documents. (The original
# referenced an undefined object `dtms`, which made this line fail.)
freq <- colSums(as.matrix(dtm))
freq