# Introductory R walkthrough: basic data structures, common helper functions,
# and a small text-mining pipeline (tm corpus -> document-term matrix -> LDA).
# Object names keep the dotted style used throughout this script.

# Help ----
help(c)
?c

# Vector ----
c(2, 3, 5)
c("aa", "bb", "cc", "dd", "ee")
c(TRUE, FALSE, TRUE, FALSE, FALSE)

# Matrix ----
# Values are filled column by column.
m <- matrix(
  data = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
  nrow = 3,
  ncol = 4
)
m

# List ----
# A list can hold elements of different types and lengths.
numbers <- c(2, 3, 5)
strings <- c("aa", "bb", "cc", "dd", "ee")
my.list <- list(numbers, strings, 3)
my.list

# Data frame ----
teams <- c("PHI", "NYM", "FLA", "ATL", "WSN")
wins <- c(92, 89, 94, 72, 59)
losses <- c(70, 73, 77, 90, 102)
table.data <- data.frame(teams, wins, losses)
table.data

# Basic R functions ----
# Each helper is applied to an object created above so the script runs as-is
# (most of these functions raise an error when called without an argument).
c(numbers, 7)        # combine two or more elements into an object
class(numbers)       # data class of an object's elements
length(numbers)      # number of elements in a vector or list
dim(m)               # dimensions of a two-dimensional object
nrow(m)              # number of rows
ncol(m)              # number of columns
head(table.data)     # first few rows of data
tail(table.data)     # last few rows of data
str(my.list)         # structure of an object
names(table.data)    # names of a named vector/list (here: the columns)
rownames(table.data) # names of rows - two dimensions
colnames(table.data) # names of columns - two dimensions

# Working directory ----
# Both calls spell the same Windows path: backslashes must be escaped inside
# an R string, forward slashes need no escaping. The path is machine-specific.
setwd("C:\\Users\\Lukas\\Desktop\\data\\")
setwd("C:/Users/Lukas/Desktop/data/")

# Libraries ----
# library() stops with an error when the package is missing; require() would
# only return FALSE, so it is not used for loading here.
library("tm")
detach("package:tm", unload = TRUE)
# A single function can also be called without attaching its package:
#   tm::Corpus(<source>)

# Exporting an object - tabular ----
frequencies <- c(92, 89, 94, 72, 59)
write.table(
  frequencies,
  "frequencies.csv",
  sep = ",",
  row.names = FALSE,
  col.names = TRUE,
  fileEncoding = "UTF-8"
)

# Corpus ----
library("tm") # re-attach: tm was detached in the section above
my.dir <- "C:\\Users\\Lukas\\Desktop\\data\\LDA"
directory.source <- DirSource(
  directory = my.dir,
  encoding = "UTF-8",
  ignore.case = TRUE,
  # `pattern` is a regular expression: escape the dot and anchor at the end so
  # only files with a .txt extension are picked up.
  pattern = "\\.txt$"
)
text.corpus <- Corpus(directory.source)

# Corpus operations - help pages ----
?removePunctuation
?removeWords
?stripWhitespace
?removeNumbers
?stemDocument
?PlainTextDocument

# Corpus operations - cleaning ----
edited.corpus <- text.corpus
# Lower-case first: the English stop word list is lower case, so capitalised
# stop words ("The", "And") would otherwise survive removeWords().
edited.corpus <- tm_map(edited.corpus, content_transformer(tolower))
edited.corpus <- tm_map(edited.corpus, removeNumbers)
# Remove stop words before punctuation so contractions in the stop word list
# (e.g. "don't") still match.
edited.corpus <- tm_map(edited.corpus, removeWords, stopwords("english"))
edited.corpus <- tm_map(edited.corpus, removePunctuation)
# Collapse whitespace last: the removals above leave gaps behind.
edited.corpus <- tm_map(edited.corpus, stripWhitespace)

# Document term matrix ----
dtm <- DocumentTermMatrix(edited.corpus)
dtm
# Drop terms that are absent from more than 99% of the documents.
dtm <- removeSparseTerms(dtm, sparse = 0.99)
dtm
dtm.matrixed <- as.matrix(dtm)

# LDA() rejects documents without any remaining term, so drop rows that became
# empty after the sparse-term filter and keep the dense copy in sync.
non.empty <- rowSums(dtm.matrixed) > 0
dtm <- dtm[non.empty, ]
dtm.matrixed <- dtm.matrixed[non.empty, , drop = FALSE]

# LDA ----
library(topicmodels)
n.topics <- 10
lda.parameters <- list(
  verbose = 1,
  iter = 500,
  thin = 300,
  burnin = 1000,
  alpha = 50 / n.topics
)
model <- LDA(
  x = dtm,
  k = n.topics,
  method = "Gibbs",
  control = lda.parameters
)

# Exploring results - terms ----
# Ten highest-ranked terms for every topic.
model.terms <- terms(model, 10)
model.terms

# Exploring results - posterior distributions ----
model.posterior <- posterior(model)
topic.doc.matrix <- model.posterior$topics
topic.doc.matrix
topic.terms.matrix <- model.posterior$terms
topic.terms.matrix

# Exporting results as tabular data ----
# col.names = NA writes an empty header cell above the row names; with
# col.names = TRUE the header would have one field fewer than the data rows.
write.table(
  topic.doc.matrix,
  "topic.doc.matrix.txt",
  sep = ",",
  row.names = TRUE,
  col.names = NA,
  fileEncoding = "UTF-8"
)