######## Intro ########
# Short warm-up: build two vectors, bind them into a matrix and plot them
# with base graphics.

### two vectors
area <- 1:12
rate <- c(1.2, 1.5, 1.02, 1.63, 1.14, 1.86, 2.1, 1.43, 1.76, 2.08, 1.32, 1.6)

### list the variables in the workspace
ls()

### display a vector
area

### matrix (one column per vector)
mat <- cbind(area, rate)

### display the 2nd row of the matrix
mat[2, ]

### plot the matrix
plot(mat)

### another vector
area <- 1:12

### generating a sequence
rate <- seq(length.out = 12, from = 1, by = 0.1)
mat2 <- cbind(area, rate)
plot(mat)

### par(new = TRUE): the next plot() does not clear the window
par(new = TRUE)
plot(mat2)

### distinguish the two series and force a common y scale
plot(mat, main = "pokusne matice", ylim = c(1, 2.2))
par(new = TRUE)
plot(mat2, ylim = c(1, 2.2), pch = 2)

### add a legend
legend(x = "topleft", legend = c("mat", "mat2"), pch = c(1, 2), bg = "white",
       inset = 0.01)

### add "boundary" lines
abline(v = c(6, 8), lty = 4)

### rotated text label
text(7, 1.2, "spatna oblast", srt = 90)


######### tm ##########
### Pavel Brazdil, LIAAD, FEP Universidade do Porto ###
# NOTE(review): this section appears to be written against an old tm API
# (minWordLength / minDocFreq control options, bare tolower in tm_map).
# Confirm against the installed tm version before relying on the results.
library(tm)

### data freely available on the internet - 20newsgroups
### http://people.csail.mit.edu/jrennie/20Newsgroups/
electr.train <- Corpus(
  DirSource("20news-bydate-train/sci.electronics"),
  readerControl = list(reader = readPlain, language = "en_US")
)
print(electr.train)
summary(electr.train)
inspect(electr.train[1:3])

### three more corpora
religion.train <- Corpus(
  DirSource("20news-bydate-train/talk.religion.misc"),
  readerControl = list(reader = readPlain, language = "en_US")
)
electr.test <- Corpus(
  DirSource("20news-bydate-test/sci.electronics"),
  readerControl = list(reader = readPlain, language = "en_US")
)
religion.test <- Corpus(
  DirSource("20news-bydate-test/talk.religion.misc"),
  readerControl = list(reader = readPlain, language = "en_US")
)

# Index ranges of the individual corpora inside the combined corpus
# (figures for the 20news-bydate split this script was written for):
#   electr.train   - documents    1 ..  591 (591 docs)
#   religion.train - documents  592 ..  968 (377 docs)
#   electr.test    - documents  969 .. 1361 (393 docs)
#   religion.test  - documents 1362 .. 1612 (251 docs)
# The sizes are taken from the corpora themselves so that the labels and the
# train/test split stay correct if the directories hold a different number
# of documents.
n.electr.train <- length(electr.train)
n.religion.train <- length(religion.train)
n.electr.test <- length(electr.test)
n.religion.test <- length(religion.test)
n.train <- n.electr.train + n.religion.train
n.test <- n.electr.test + n.religion.test

all <- c(electr.train, religion.train, electr.test, religion.test)

### preprocessing
all <- tm_map(all, PlainTextDocument)
# Lower-case BEFORE removing stop words: the stop-word list is lower case and
# the match is case-sensitive, so capitalised stop words ("The", "And")
# would otherwise survive.
all <- tm_map(all, tolower)
all <- tm_map(all, removeWords, stopwords(language = "english"))
all <- tm_map(all, stripWhitespace)
all <- tm_map(all, removePunctuation)
all <- tm_map(all, removeNumbers)

### stemming - very slow...
### library(rJava); library(rWeka); library(Snowball)
### all <- tm_map(all, stemDocument)

DocumentTermMatrix(all)
dtm.all <- DocumentTermMatrix(
  all,
  control = list(minWordLength = 3, minDocFreq = 5, weighting = weightTfIdf)
)
findFreqTerms(dtm.all, 40)

# as.matrix() yields the full dense document-term matrix; inspect() is a
# display function and should not be used as a data source.
dtm.all.frame <- as.data.frame(as.matrix(dtm.all))

# Class labels in the same order as the corpora were concatenated. Built as
# a factor explicitly: the classifiers below need a factor response, and
# cbind() no longer converts character columns to factors in R >= 4.0.
class <- factor(c(
  rep("sci", n.electr.train), rep("rel", n.religion.train),
  rep("sci", n.electr.test), rep("rel", n.religion.test)
))
dtm.all.frame <- cbind(dtm.all.frame, class)

train.rows <- seq_len(n.train)
test.rows <- n.train + seq_len(n.test)
class.col <- ncol(dtm.all.frame)

## training data: all term columns plus the class column
train <- dtm.all.frame[train.rows, ]

### test data: term columns only (class column dropped) ...
test <- dtm.all.frame[test.rows, seq_len(class.col - 1)]
### ... and the true test labels kept separately
testclass <- dtm.all.frame[test.rows, class.col]

### selection of terms used for classification
clas.formula <- class ~ cable + circuit + ground + neutral + outlets +
  subject + wire + wiring + judas + christ + elohim + father + god + gods +
  jehovah + jesus + lord + mcconkie + son + unto

# Fraction of misclassified documents given a confusion matrix whose rows and
# columns are in the same class order (off-diagonal mass / total).
misclassification.rate <- function(conf.mx) {
  (sum(conf.mx) - sum(diag(conf.mx))) / sum(conf.mx)
}


######## Decision tree
library(rpart)
dt <- rpart(clas.formula, train)

### prediction
dt.predictions <- predict(dt, test, type = "class")
conf.mx.dt <- table(testclass, dt.predictions)
conf.mx.dt
error.rate.dt <- misclassification.rate(conf.mx.dt)
error.rate.dt


######## Neural net
library(nnet)
nnet.classifier <- nnet(clas.formula, data = train, size = 2, rang = 0.1,
                        decay = 5e-4, maxit = 200)
# predict(type = "class") returns a character vector; coerce it to the
# reference levels so the confusion matrix stays square (and diag() stays
# meaningful) even when one class is never predicted.
preds.nn <- factor(
  predict(nnet.classifier, test, type = "class"),
  levels = levels(testclass)
)
conf.mx.nn <- table(testclass, preds.nn)
conf.mx.nn
error.rate.nn <- misclassification.rate(conf.mx.nn)
error.rate.nn


######## SVM
library(e1071)
svm.classifier <- svm(clas.formula, train, scale = FALSE, kernel = "linear")
preds.svm <- predict(svm.classifier, test)
conf.mx.svm <- table(testclass, preds.svm)
conf.mx.svm
error.rate.svm <- misclassification.rate(conf.mx.svm)
error.rate.svm


######## NaiveBayes - Weka
library(RWeka)
NB <- make_Weka_classifier("weka/classifiers/bayes/NaiveBayes")
nb.classifier <- NB(clas.formula, train)
preds.nb <- predict(nb.classifier, test)
conf.mx.nb <- table(testclass, preds.nb)
conf.mx.nb
error.rate.nb <- misclassification.rate(conf.mx.nb)
error.rate.nb


####### Renaming terms for as.formula
# Terms such as if, then, else, for, break collide with R keywords, so every
# column name gets a ".t" suffix before a formula is built from the names.
#
# dtm:     a data frame or matrix with column names.
# returns: dtm with ".t" appended to every column name; an object without
#          column names is returned unchanged.
# Prints one "replaced to <new name>" line per column.
rename.terms.in.dtm <- function(dtm) {
  old.names <- colnames(dtm)
  if (length(old.names) == 0) {
    return(dtm)
  }
  new.names <- paste0(old.names, ".t")
  # paste() with its default separator reproduces the per-column message
  # format of cat("replaced to ", name, "\n") exactly.
  cat(paste("replaced to ", new.names, "\n"), sep = "")
  colnames(dtm) <- new.names
  dtm
}

train <- rename.terms.in.dtm(train)
test <- rename.terms.in.dtm(test)

####### Building the classification formula from all terms
# `vybrane` ("selected") holds the indices of the term columns to use. It is
# not defined anywhere above, so default to every term column (all columns
# of train except the trailing class column) unless it has been set already.
if (!exists("vybrane")) {
  vybrane <- seq_len(ncol(train) - 1)
}
clas.formula <- as.formula(
  paste("class.t ~", paste(colnames(train)[vybrane], collapse = "+"))
)