######## Intro ########

### two vectors
area <- 1:12
rate <- c(1.2, 1.5, 1.02, 1.63, 1.14, 1.86, 2.1, 1.43, 1.76, 2.08, 1.32, 1.6)

### list the variables in the workspace
ls()

### display a variable
area

### functions
# Recursive factorial: returns 1 for any x <= 1, otherwise x * fact(x - 1).
fact <- function(x) {
  if (x > 1) {
    x * fact(x - 1)
  } else {
    1
  }
}
fact(5)

### matrix (the two vectors become the columns "area" and "rate")
mat <- cbind(area, rate)

### show the 2nd row of the matrix
mat[2, ]

### plot the matrix
plot(mat)

### another vector
area <- 1:12

### build a sequence (argument name spelled out instead of the partial `length`)
rate <- seq(from = 1, by = 0.1, length.out = 12)
mat2 <- cbind(area, rate)
plot(mat)

### the plotting window is not cleared, so the next plot is drawn on top
par(new = TRUE)
plot(mat2)

### tell the two series apart and use the same y scale for both
plot(mat, main = "pokusne matice", ylim = c(1, 2.2))
par(new = TRUE)
plot(mat2, ylim = c(1, 2.2), pch = 2)

### add a legend
legend(
  x = "topleft", legend = c("mat", "mat2"), pch = c(1, 2),
  bg = "white", inset = 0.01
)

### add a "boundary" (two vertical lines)
abline(v = c(6, 8), lty = 4)

### text (rotated by 90 degrees)
text(7, 1.2, "spatna oblast", srt = 90)

######################################################################
######### classification, decision trees, association rules

# I. Play or Not To Play

### load the file picked in a dialog (TPONTPNom.csv)
# stringsAsFactors = TRUE: since R 4.0.0 read.csv() no longer converts strings
# to factors by default; the nominal attributes are wanted as factors here.
data <- read.csv(file.choose(), stringsAsFactors = TRUE)

### display the data
data

### first row
data[1, ]

### columns 3 to 5
data[, 3:5]

### decision tree (package "rpart")
library(rpart)

### tree with the target attribute Play
tree <- rpart(Play ~ ., data, method = "class")

### print the tree
tree

### the minsplit parameter helps to build a better tree
tree <- rpart(
  Play ~ ., data,
  method = "class", control = rpart.control(minsplit = 5)
)
tree

### first test of the tree - on the training data
### (only to get started, otherwise DO NOT do this!!!)
# Predict on the very rows the tree was trained on; this only shows the
# mechanics and gives an optimistic estimate.
tree.predictions <- predict(tree, data[, 1:4], type = "class")

### build the confusion matrix
table(data[, 5], tree.predictions)

### cross validation - done manually
n.runs <- 10 # 10 runs
err.vect <- numeric(n.runs) # preallocated instead of grown inside the loop
for (j in seq_len(n.runs)) {
  # random sample of 90% of the row indices
  select <- sample(seq_len(nrow(data)), 0.9 * nrow(data))
  train <- data[select, ] # 90% of the data for training
  test <- data[-select, ] # the remaining data for testing (10%)
  tree <- rpart(
    Play ~ ., train,
    method = "class", control = rpart.control(minsplit = 5)
  )
  pred <- predict(tree, test[, 1:4], type = "class")
  cmx <- table(test[, "Play"], pred) # column Play verifies the prediction
  # Share of misclassified test rows. Computed directly rather than from
  # diag(cmx): the table is not square when the small test fold is missing
  # one of the classes and Play is not a factor.
  err <- mean(as.character(pred) != as.character(test[, "Play"]))
  err.vect[j] <- err
}
err.vect
mean(err.vect)

### association rules
library(arules)
rules <- apriori(data, parameter = list(support = 0.2, confidence = 0.6))
rules

##################
# II. Iris dataset

### library with functions for reading and writing arff files
library(foreign)

### load the file picked in a dialog (iris.arff)
data.iris <- read.arff(file.choose())

### display the data
data.iris

library(rpart)

### tree built from all the data, the target attribute is now "class"
tree.iris <- rpart(class ~ ., data.iris, method = "class")
tree.iris

### To test the tree on the iris data, split the data randomly into
### 2/3 for training and 1/3 for testing:

### random permutation of all row indices (150 rows for iris)
n.iris <- nrow(data.iris)
idx <- sample(n.iris, n.iris)
n.train.iris <- round(2 * n.iris / 3) # 100 for the 150 iris rows

### 2/3 of the data are used for training
### (the row comma matters: without it `[` would select columns)
train.iris <- data.iris[idx[seq_len(n.train.iris)], ]

### the remaining 1/3 of the data is used for testing
test.iris <- data.iris[idx[(n.train.iris + 1):n.iris], ]

tree.iris <- rpart(class ~ ., train.iris, method = "class")

### predictions for the test data (the last column "class" is left out)
pred.iris <- predict(tree.iris, test.iris[, 1:4], type = "class")

### confusion matrix
cmx.iris <- table(test.iris[, 5], pred.iris)
cmx.iris

### example of an error rate computation
### (ratio of misclassified examples to all examples)
err.iris <- mean(as.character(pred.iris) != as.character(test.iris[, 5]))
err.iris

########################################################################
######### tm ##########
### Pavel Brazdil, LIAAD, FEP Universidade do Porto ###

library(tm)

### data freely available on the internet - 20newsgroups
### http://people.csail.mit.edu/jrennie/20Newsgroups/
electr.train <- Corpus(
  DirSource("20news-bydate-train/sci.electronics"),
  readerControl = list(reader = readPlain, language = "en_US")
)

print(electr.train)
summary(electr.train)
inspect(electr.train[1:3])

### the other 3 corpora
religion.train <- Corpus(
  DirSource("20news-bydate-train/talk.religion.misc"),
  readerControl = list(reader = readPlain, language = "en_US")
)
electr.test <- Corpus(
  DirSource("20news-bydate-test/sci.electronics"),
  readerControl = list(reader = readPlain, language = "en_US")
)
religion.test <- Corpus(
  DirSource("20news-bydate-test/talk.religion.misc"),
  readerControl = list(reader = readPlain, language = "en_US")
)

# indices of the individual documents
# electr.train – documents 1 .. 591
# religion.train – documents 592 .. 968 (377 docs)
# electr.test – documents 969 .. 1361 (393 docs)
# religion.test – documents 1362 .. 1612 (251 docs)
all <- c(electr.train, religion.train, electr.test, religion.test)

### preprocessing
all <- tm_map(all, PlainTextDocument)
# Lower-case first: the stopword list is lower-case and removeWords() matches
# case-sensitively, so "The", "And", ... would otherwise survive.
# content_transformer() keeps each element a text document instead of turning
# it into a bare character vector.
all <- tm_map(all, content_transformer(tolower))
# Stopwords go before punctuation removal so contractions in the list
# (e.g. "don't") still match.
all <- tm_map(all, removeWords, stopwords(language = "english"))
all <- tm_map(all, removePunctuation)
all <- tm_map(all, removeNumbers)
# Whitespace is collapsed last, after all removals that leave gaps behind.
all <- tm_map(all, stripWhitespace)

### stemming - takes very long...
### (optional stemming step, left disabled)
### library(rJava); library(rWeka); library(Snowball)
### all <- tm_map(all, stemDocument)

DocumentTermMatrix(all)

# NOTE(review): minWordLength / minDocFreq are control names from older tm
# releases - confirm the installed tm version still honours them.
dtm.all <- DocumentTermMatrix(
  all,
  control = list(minWordLength = 2, minDocFreq = 5)
)

findFreqTerms(dtm.all, 40)

# as.matrix() converts the full matrix without printing it; inspect() is a
# display function and is not meant to be used for conversion.
dtm.all.frame <- as.data.frame(as.matrix(dtm.all))

# Number of documents per corpus, in the order in which they were combined:
# electr.train, religion.train, electr.test, religion.test.
n.docs <- c(591, 377, 393, 251)
class <- factor(rep(c("sci", "rel", "sci", "rel"), times = n.docs))

dtm.all.frame <- cbind(dtm.all.frame, class)

## row ranges: the first two corpora are training data, the last two test data
n.train <- sum(n.docs[1:2]) # 968
train.rows <- seq_len(n.train) # 1 .. 968
test.rows <- (n.train + 1):sum(n.docs) # 969 .. 1612

## prepare the training data (all columns, including "class")
train <- dtm.all.frame[train.rows, ]

### test data: every column except the last one ("class").
### A negative index is used instead of `1:ncol(x) - 1`, which parses as
### `(1:ncol(x)) - 1` and only works because index 0 is silently dropped.
test <- dtm.all.frame[test.rows, -ncol(dtm.all.frame)]
testclass <- dtm.all.frame[test.rows, ncol(dtm.all.frame)]

### decision tree
library(rpart)

dt <- rpart(
  class ~ cable + circuit + ground + neutral + outlets + subject + wire +
    wiring + judas + ra + christ + elohim + father + god + gods + jehovah +
    jesus + lord + mcconkie + ps + son + unto,
  train,
  method = "class"
)

### prediction
dt.predictions <- predict(dt, test, type = "class")

table(testclass, dt.predictions)