# https://data.world/data-society/consumer-complaint-data
d <- read.csv("complaints_2000.csv")
cl <- d$class

library(tm)

# building a corpus from the complaint texts
corpus <- VCorpus(VectorSource(d$text))

# the data set masks personal details with "XXXX"; this transformer
# replaces those placeholders with spaces
XXXX.replace <- content_transformer(function (x) gsub("XXXX", " ", x))

corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, XXXX.replace)
corpus[[1]]$content

# document-term matrix keeping only terms that occur in at least 2 documents
dtm <- DocumentTermMatrix(corpus, control = list(
  bounds = list(global = c(2, Inf))
))
inspect(dtm)

# the same matrix with English stop words removed as well
dtm <- DocumentTermMatrix(corpus, control = list(
  bounds = list(global = c(2, Inf)),
  stopwords = TRUE
))
inspect(dtm)

# converting the data to a matrix
m <- as.matrix(dtm)

# number of instances
n <- dim(m)[1]

# indexes of 10% testing samples
# (call set.seed() before this line for a reproducible split)
number_of_test_samples <- round(n * 0.1, 0)
test_index <- sample(1:n, number_of_test_samples, replace = FALSE)

# selecting training samples
train_data <- m[-test_index, ]
train_labels <- cl[-test_index]

# selecting testing samples
test_data <- m[test_index, ]
test_labels <- cl[test_index]

# training a C5.0 decision tree model
# (a rule-based variant is sketched at the end of the script)
library(C50)
c5_tree <- C5.0(train_data, as.factor(train_labels),
                control = C5.0Control(label = "__outcome__"))

# printing the detailed summary for the generated decision tree
print(summary(c5_tree))

# calculating class predictions for the tree
predictions <- predict(c5_tree, test_data, type = "class")

# creating a confusion matrix
cm <- table(test_labels, predictions)
print("Confusion matrix")
print(cm)

# number of test instances
n <- sum(cm)
# number of correctly classified instances for each class
correct <- diag(cm)
# number of instances in each class
instances_in_classes <- apply(cm, 1, sum)
# number of predictions made for each class
class_predictions <- apply(cm, 2, sum)

# accuracy
accuracy <- sum(correct) / n
# precision per class
precision <- correct / class_predictions
# recall per class
recall <- correct / instances_in_classes
# F1-measure per class
f1 <- 2 * precision * recall / (precision + recall)

# printing summary information for all classes
df <- data.frame(precision, recall, f1)
print("Detailed classification metrics")
print(df)
print(paste("Accuracy:", accuracy))

# macro-averaging: unweighted mean of the per-class metrics
print("Macro-averaged metrics")
print(colMeans(df))

# micro-averaging, approximated here as a class-size-weighted mean of the per-class metrics
print("Micro-averaged metrics")
print(apply(df, 2, function (x) weighted.mean(x, w = instances_in_classes)))

# looking at real and predicted labels next to the first 100 characters of each text
text_snippets <- sapply(d$text[test_index], function (x) substr(x, start = 1, stop = 100))
o <- data.frame(test_labels, predictions, text_snippets)
colnames(o) <- c("real", "predicted", "text")
print(o)
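
# The training comment above mentions rules as well as a tree; below is a minimal
# sketch of the rule-based C5.0 variant, reusing the train/test objects created
# above. rules = TRUE is the standard C50 argument for rule-based models; the
# object names c5_rules and rule_predictions are illustrative.
c5_rules <- C5.0(train_data, as.factor(train_labels), rules = TRUE)
print(summary(c5_rules))
rule_predictions <- predict(c5_rules, test_data, type = "class")
print(table(test_labels, rule_predictions))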