# Introductory R walkthrough, part 1: values, data classes, core objects,
# function calls and package handling. Meant to be run line by line.

# hands on - first attempt ----
one <- 1
one + one
two <- one + one
two

# data classes - properties of elements ----
as.numeric(10.49)
as.integer(10.49)                   # drops the fractional part
as.character(-1)
as.numeric("anythingwithinquotes")  # not parseable: NA with a warning
5 > 10
as.character(5 > 10)

# R objects - vector ----
c(2, 3, 5)
c("aa", "bb", "cc", "dd", "ee")
c(TRUE, FALSE, TRUE, FALSE, FALSE)

# R objects - matrix ----
# matrix() fills column by column unless byrow = TRUE is given.
m <- matrix(
  data = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12),
  nrow = 3,
  ncol = 4
)
m

# R objects - list ----
n <- c(2, 3, 5)
s <- c("aa", "bb", "cc", "dd", "ee")
# `b` was referenced below without ever being created; define it so the
# list() call no longer stops with "object 'b' not found".
b <- c(TRUE, FALSE, TRUE, FALSE, FALSE)
x <- list(n, s, b, 3)  # x contains copies of n, s and b
x

# R objects - data frame ----
teams <- c("PHI", "NYM", "FLA", "ATL", "WSN")
wins <- c(92, 89, 94, 72, 59)
losses <- c(70, 73, 77, 90, 102)
data <- data.frame(teams, wins, losses)
data

# R functions ----
sqrt(9)
# Spell out `replace`; `rep =` only worked through partial argument matching.
sample(x = 0:100, size = 10, replace = FALSE)
# sample() without arguments is an error (x is required); open its help page
# instead to inspect the arguments.
?sample

# libraries ----
# Install only when the package is missing, so re-running the script does not
# download it again.
if (!requireNamespace("network", quietly = TRUE)) {
  install.packages("network")
}
library("network")  # quoted and unquoted package names are equivalent
library(network)
# require("network") would attach the package as well, but it returns FALSE
# instead of stopping when the package is missing, so library() is preferred.
detach("package:network", unload = TRUE)

# libraries we might use ----
for (pkg in c("tm", "wordcloud", "quanteda")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

# basic functions ----
# Reference list only: most of these stop with an error when called without
# an argument, so they are kept as comments rather than live calls.
# c()         combine two or more elements into an object
# class()     explore elements' data class
# length()    number of elements in an object
# dim()       explore dimensions of a two-dimensional object
# Reference list (kept as comments: these calls need an argument to run).
# nrow()      number of rows
# ncol()      number of columns
# head()      first few rows of data
# tail()      last few rows of data
# str()       explore structure of object
# names()     names in a named vector - one dimension
# rownames()  names of rows - two dimensions
# colnames()  names of columns - two dimensions

# working directory ----
# Windows paths need either escaped backslashes or forward slashes; both
# calls point at the same folder. Adjust the path to your own machine.
setwd("C:\\Users\\Lukas\\Desktop\\data")
setwd("C:/Users/Lukas/Desktop/data")

# exporting object - tabular ----
frequencies <- c(92, 89, 94, 72, 59)
write.table(
  frequencies,
  "frequencies.csv",
  sep = ",",
  row.names = FALSE,
  col.names = TRUE,
  fileEncoding = "UTF-8"
)

# exporting object - unstructured ----
frequencies <- c(92, 89, 94, 72, 59)
# writeLines() only accepts character data, so convert the numbers first.
writeLines(as.character(frequencies), "frequencies.txt")

# loading tm package ----
library(tm)

# corpus ----
getSources()
?Corpus
?inspect

# NOTE(review): the export steps above write frequencies.csv / .txt into this
# same folder, so they would presumably be read as documents too - confirm,
# or keep exports in a separate folder.
my.texts <- "C:\\Users\\Lukas\\Desktop\\data\\"
directory.source <- DirSource(directory = my.texts)
text.corpus <- Corpus(directory.source)
text.corpus

# corpus operations - help pages ----
?removePunctuation
?removeWords
?stripWhitespace
?removeNumbers
?stemDocument
?PlainTextDocument

# corpus operations ----
edited.corpus <- text.corpus
# Lower-case first: the stop-word list is lower case, so "The" would
# otherwise survive removeWords().
edited.corpus <- tm_map(edited.corpus, content_transformer(tolower))
edited.corpus <- tm_map(edited.corpus, removeNumbers)
# Remove stop words before punctuation, because entries such as "don't" stop
# matching once the apostrophe has been stripped.
edited.corpus <- tm_map(edited.corpus, removeWords, stopwords("english"))
edited.corpus <- tm_map(edited.corpus, removePunctuation)
# Collapse whitespace last to clean up the gaps left by the removals above.
edited.corpus <- tm_map(edited.corpus, stripWhitespace)

# term-document matrix - help pages ----
?TermDocumentMatrix
?DocumentTermMatrix
?as.matrix

# term-document matrix ----
tdm <- TermDocumentMatrix(edited.corpus)  # terms in rows, documents in columns
dtm <- DocumentTermMatrix(edited.corpus)  # documents in rows, terms in columns
tdm.matrixed <- as.matrix(tdm)

# useful functions ----
?removeSparseTerms
?findFreqTerms
?findAssocs

# frequencies ----
# Row sums of the term-document matrix give each term's total count.
frequencies <- rowSums(tdm.matrixed)
frequencies <- sort(frequencies, decreasing = TRUE)
head(frequencies)

# wordcloud ----
# `frequencies` is already sorted; keep the 40 most frequent terms. head()
# avoids the NA padding that [1:40] produces for fewer than 40 terms.
frequencies <- head(frequencies, 40)
terms <- names(frequencies)

library(wordcloud)
wordcloud(
  words = terms,
  freq = frequencies,
  scale = c(5, 0.5),
  max.words = 150,
  random.order = FALSE,
  rot.per = 0,
  colors = "royalblue"
)

# well done!