# https://data.world/data-society/consumer-complaint-data
d <- read.csv("complaints_2000.csv")
cl <- d$class

library(tm)

# building a corpus from the complaint texts
corpus <- VCorpus(VectorSource(d$text))

# the data set masks personal details with "XXXX"; this transformer
# replaces those placeholders with spaces
XXXX.replace <- content_transformer(function (x) gsub("XXXX", " ", x))

corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, XXXX.replace)
corpus[[1]]$content

# document-term matrix keeping only terms that occur in at least 2 documents
dtm <- DocumentTermMatrix(corpus, control = list(
  bounds = list(global = c(2, Inf))
))
inspect(dtm)

# the same matrix with English stop words removed as well
dtm <- DocumentTermMatrix(corpus, control = list(
  bounds = list(global = c(2, Inf)),
  stopwords = TRUE
))
inspect(dtm)

# converting the data to a matrix
m <- as.matrix(dtm)

# number of instances
n <- dim(m)[1]

# indexes of 10% testing samples
# (call set.seed() before this line for a reproducible split)
number_of_test_samples <- round(n * 0.1, 0)
test_index <- sample(1:n, number_of_test_samples, replace = FALSE)

# selecting training samples
train_data <- m[-test_index, ]
train_labels <- cl[-test_index]

# selecting testing samples
test_data <- m[test_index, ]
test_labels <- cl[test_index]

# training a C5.0 decision tree model
# (a rule-based variant is sketched at the end of the script)
library(C50)
c5_tree <- C5.0(train_data, as.factor(train_labels),
                control = C5.0Control(label = "__outcome__"))

# printing the detailed summary for the generated decision tree
print(summary(c5_tree))

# calculating class predictions for the tree
predictions <- predict(c5_tree, test_data, type = "class")

# creating a confusion matrix
cm <- table(test_labels, predictions)
print("Confusion matrix")
print(cm)

# number of test instances
n <- sum(cm)
# number of correctly classified instances for each class
correct <- diag(cm)
# number of instances in each class
instances_in_classes <- apply(cm, 1, sum)
# number of predictions made for each class
class_predictions <- apply(cm, 2, sum)

# accuracy
accuracy <- sum(correct) / n
# precision per class
precision <- correct / class_predictions
# recall per class
recall <- correct / instances_in_classes
# F1-measure per class
f1 <- 2 * precision * recall / (precision + recall)

# printing summary information for all classes
df <- data.frame(precision, recall, f1)
print("Detailed classification metrics")
print(df)
print(paste("Accuracy:", accuracy))

# macro-averaging: unweighted mean of the per-class metrics
print("Macro-averaged metrics")
print(colMeans(df))

# micro-averaging, approximated here as a class-size-weighted mean of the per-class metrics
print("Micro-averaged metrics")
print(apply(df, 2, function (x) weighted.mean(x, w = instances_in_classes)))

# looking at real and predicted labels next to the first 100 characters of each text
text_snippets <- sapply(d$text[test_index], function (x) substr(x, start = 1, stop = 100))
o <- data.frame(test_labels, predictions, text_snippets)
colnames(o) <- c("real", "predicted", "text")
print(o)
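
# The training comment above mentions rules as well as a tree; below is a minimal
# sketch of the rule-based C5.0 variant, reusing the train/test objects created
# above. rules = TRUE is the standard C50 argument for rule-based models; the
# object names c5_rules and rule_predictions are illustrative.
c5_rules <- C5.0(train_data, as.factor(train_labels), rules = TRUE)
print(summary(c5_rules))
rule_predictions <- predict(c5_rules, test_data, type = "class")
print(table(test_labels, rule_predictions))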