# K-means clustering of complaint texts, followed by an inspection of the
# resulting clusters:
#   1. elbow plot of the total within-cluster sum of squares for k = 2..10,
#   2. purity of the final clustering against the known class labels,
#   3. chi-squared ranking of the terms most characteristic of each cluster,
#   4. the documents closest to each cluster centroid.

library(tm)

# kmeans() picks its initial centres at random; fixing the seed makes the
# elbow plot and the final clustering reproducible between runs.
set.seed(42)

# ---- Load data ----
# The input must provide a `text` column (document bodies) and a `class`
# column (reference labels, used only to evaluate the clustering).
d <- read.csv("complaints_2000.csv", stringsAsFactors = FALSE)
stopifnot(all(c("text", "class") %in% names(d)))
true_class <- d$class

# ---- Build the document-term matrix ----
corpus <- VCorpus(VectorSource(d$text))

# Replaces every literal "XXXX" with a space (presumably an anonymisation
# placeholder in the raw complaints -- confirm against the data source).
replace_xxxx <- content_transformer(function(x) gsub("XXXX", " ", x))

corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, replace_xxxx)

# Keep only terms occurring in at least 3 documents and drop stop words.
# No `weighting` is given, so the cells hold plain term counts.
dtm <- DocumentTermMatrix(
  corpus,
  control = list(
    bounds = list(global = c(3, Inf)),
    stopwords = TRUE
  )
)

# Dense copy, built once and reused below instead of being re-created inside
# every kmeans() call and every loop iteration.
dtm_matrix <- as.matrix(dtm)
n_docs <- nrow(dtm_matrix)

# ---- Elbow plot: within-cluster sum of squares vs. number of clusters ----
ncl <- 2:10
cl <- numeric(length(ncl))
for (i in seq_along(ncl)) {
  print(ncl[i]) # progress indicator
  cl[i] <- kmeans(dtm_matrix, centers = ncl[i], iter.max = 100)$tot.withinss
}
plot(ncl, cl)

# ---- Final clustering ----
nclust <- 6
clustering <- kmeans(dtm_matrix, centers = nclust, iter.max = 100)

# ---- Purity ----
# Rows of the contingency table are clusters, columns are reference classes.
# The purity of a cluster is the share of its largest class.
contingency <- table(clustering$cluster, true_class)
rowsums <- rowSums(contingency)
maxvalues <- apply(contingency, 1, max)
purities <- maxvalues / rowsums
print(purities)
print("Average purity")
# weighted.mean() normalises the weights itself, so the cluster sizes can be
# passed directly.
print(weighted.mean(purities, rowsums))

# ---- Chi-squared statistic for every (cluster, term) pair ----
# For cluster k and term w:
#   A = documents inside  k that contain w
#   B = documents outside k that contain w
#   C = documents inside  k that do not contain w
#   D = documents outside k that do not contain w
# All four matrices are nclust x n_terms and are derived from per-cluster
# document frequencies rather than by subsetting the DTM cell by cell.
presence <- (dtm_matrix != 0) * 1
doc_freq <- colSums(presence)
cluster_sizes <- clustering$size

A <- rowsum(presence, clustering$cluster)
B <- sweep(-A, 2, doc_freq, "+")
# A length-nclust vector recycles down the columns, so row k uses size k.
C <- cluster_sizes - A
D <- (n_docs - cluster_sizes) - B

# All operands are doubles here, so the products cannot overflow the
# integer range.
chi <- n_docs * (A * D - C * B)^2 / ((A + C) * (B + D) * (A + B) * (C + D))

# ---- Mean term frequencies ----
# Average frequency of each term over the whole collection ...
avg_frequencies <- colMeans(dtm_matrix)
# ... and over the documents of each cluster.
tab <- rowsum(dtm_matrix, clustering$cluster) / cluster_sizes

# ---- 10 most characteristic terms per cluster ----
# Only terms whose mean frequency inside the cluster exceeds their mean
# frequency in the whole collection are considered.
print("The values of Chi square for attributes and classes:")
for (k in seq_len(nclust)) {
  print(paste0("Class: ", k))
  overrepresented <- tab[k, ] > avg_frequencies
  print(head(sort(chi[k, overrepresented], decreasing = TRUE), 10))
}

# ---- 5 documents closest to each cluster centroid ----
for (k in seq_len(nclust)) {
  print(paste0("Cluster ", k))
  members <- which(clustering$cluster == k)
  # drop = FALSE keeps a matrix even when the cluster has a single member.
  deviations <- sweep(
    dtm_matrix[members, , drop = FALSE], 2, clustering$centers[k, ]
  )
  distances <- rowSums(deviations^2) # squared Euclidean distance
  closest <- head(members[order(distances)], 5)
  print(d$text[closest])
}