# Text-mining helpers (VCorpus, tm_map, DocumentTermMatrix, ...).
library(tm)

# Load the complaints data set; the code below reads its `class` and `text`
# columns.
d <- read.csv("complaints_2000.csv")

# Labels of the documents, used later to evaluate the clustering.
class <- d$class

# Transformer that replaces every "XXXX" sequence in a document with a space.
XXXX.replace <- content_transformer(
  function(x) {
    gsub("XXXX", " ", x)
  }
)

# Build the corpus from the text column and apply the cleaning steps in
# order: punctuation removal, digit removal, then the "XXXX" replacement.
cleaning_steps <- list(removePunctuation, removeNumbers, XXXX.replace)
corpus <- Reduce(
  function(docs, step) tm_map(docs, step),
  cleaning_steps,
  VCorpus(VectorSource(d$text))
)

# Document-term matrix with stop-word removal enabled and a global
# document-frequency bound of c(3, Inf) on the terms.
dtm_control <- list(
  bounds = list(global = c(3, Inf)),
  stopwords = TRUE
)
dtm <- DocumentTermMatrix(corpus, control = dtm_control)


# Candidate numbers of clusters for the elbow plot.
ncl <- 2:10

# Run k-means once per candidate k (printing k as a progress indicator) and
# collect the total within-cluster sum of squares of each run.
cl <- vapply(
  ncl,
  function(k) {
    print(k)
    kmeans(dtm, centers = k, iter.max = 100)$tot.withinss
  },
  numeric(1)
)

# Total within-cluster sum of squares against the number of clusters.
plot(ncl, cl)


# Final clustering with the chosen number of clusters.
nclust <- 6
clustering <- kmeans(dtm, centers = nclust, iter.max = 100)

# Contingency table: clusters in rows, known classes in columns.
cluster_by_class <- table(clustering$cluster, class)
cluster_sizes <- rowSums(cluster_by_class)
dominant_counts <- apply(cluster_by_class, 1, max)

# Purity of a cluster: share of its documents that belong to its most
# frequent class.
cluster_purity <- dominant_counts / cluster_sizes
print(cluster_purity)

# Overall purity: per-cluster purities weighted by relative cluster size.
print("Average purity")
print(weighted.mean(cluster_purity, cluster_sizes / sum(cluster_sizes)))


# Chi-squared association between terms and clusters.
#
# For every cluster (row) and term (column) the 2x2 contingency counts are:
#   A - documents inside the cluster that contain the term
#   B - documents outside the cluster that contain the term
#   C - documents inside the cluster that do not contain the term
#   D - documents outside the cluster that do not contain the term
# The counts are derived with matrix operations instead of subsetting the
# document-term matrix once per (cluster, term) pair, and they are stored as
# doubles so that products such as A * D cannot overflow R's integer range.
n_docs <- dim(dtm)[1]

# 1 where a document (row) contains a term (column), 0 otherwise.
doc_has_term <- (as.matrix(dtm) != 0) + 0

# 1 where a document (column) belongs to a cluster (row), 0 otherwise.
doc_in_cluster <- outer(seq_len(nclust), clustering$cluster, "==") + 0

docs_per_cluster <- rowSums(doc_in_cluster)
docs_per_term <- unname(colSums(doc_has_term))

A <- doc_in_cluster %*% doc_has_term
dimnames(A) <- list(seq_len(nclust), colnames(dtm))

# Row i of B: documents containing each term minus those inside cluster i.
B <- matrix(
  docs_per_term,
  nrow = nclust,
  ncol = length(docs_per_term),
  byrow = TRUE
) - A
dimnames(B) <- dimnames(A)

# The size vector is recycled down the columns, so row i uses the size of
# cluster i.
C <- docs_per_cluster - A

# Whatever is left of the collection after A, B and C.
D <- n_docs - A - B - C

# Chi-squared statistic, N * (AD - CB)^2 / ((A+C)(B+D)(A+B)(C+D)), divided
# step by step so that intermediate values stay small. A cell is NaN when one
# of the denominators is zero (e.g. a term present in every document).
chi <- n_docs * ((A * D) - (C * B))^2
chi <- chi / (A + C)
chi <- chi / (B + D)
chi <- chi / (A + B)
chi <- chi / (C + D)

# Average weight of every term over the whole collection.
# NOTE(review): the weights are whatever `dtm` stores; no tf-idf weighting is
# applied to `dtm` in the code visible here -- confirm the intended weighting.
dtm_dense <- as.matrix(dtm)
avg_frequencies <- colMeans(dtm_dense)

# Average weight of every term within each cluster: per-cluster column sums
# divided by the cluster sizes. The size vector is recycled down the columns,
# so row i is divided by the size of cluster i.
tab <- rowsum(dtm_dense, clustering$cluster)
numbers_of_documents_in_clusters <- clustering$size
tab <- tab / numbers_of_documents_in_clusters

# Print up to 10 terms with the highest chi-squared value for each cluster,
# considering only terms whose average weight in the cluster is higher than
# their average over the whole collection.
print("The values of Chi square for attributes and classes:")
for (cluster_id in seq_len(nclust)) {
  print(paste0("Class: ", cluster_id))
  above_average <- tab[cluster_id, ] > avg_frequencies
  # Subset the named row vector so term names survive even when only one
  # term qualifies.
  ranked <- sort(chi[cluster_id, ][above_average], decreasing = TRUE)
  # head() returns fewer entries instead of padding with NA when fewer than
  # 10 terms qualify.
  print(head(ranked, 10))
}

# Print the 5 documents closest to each cluster centroid (fewer when the
# cluster has fewer than 5 documents), by squared Euclidean distance.
# The dense matrix is built once rather than on every iteration.
dtm_dense <- as.matrix(dtm)
for (cluster_id in seq_len(nclust)) {
  print(paste0("Cluster ", cluster_id))

  # Row indices of the documents assigned to this cluster; rows of `dtm`
  # follow the order of d$text, from which the corpus was built.
  members <- unname(which(clustering$cluster == cluster_id))
  centroid <- clustering$centers[cluster_id, ]

  # drop = FALSE keeps a one-document cluster as a matrix so apply() works.
  distances <- apply(
    dtm_dense[members, , drop = FALSE],
    1,
    function(doc) sum((doc - centroid)^2)
  )

  closest <- head(members[order(distances)], 5)
  # NOTE(review): prints column 2 of `d`; presumably the document text --
  # confirm against the CSV layout.
  print(d[closest, 2])
}