Authors: Martin Bartoš, Frederik Hudák, Matěj Radvanský

1. Outlier detection in unclassified data

library(tm)
library(tm.corpus.Reuters21578)
data(Reuters21578)

reuters <- Reuters21578
# Clean corpus
# Some documents are empty

# The most common words carry very little information 
reuters <- tm_map(reuters, removeWords, stopwords("english"))

reuters <- tm_map(reuters, removeNumbers)
reuters <- tm_map(reuters, removePunctuation)
reuters <- tm_map(reuters, stripWhitespace)

Due to the size of the data, a random sample of the documents had to be selected to reduce computation times. Setting a constant seed allows us to get the same random sample every time.

set.seed(30)
sample_size <- 200
reuters_sample <- sample(reuters, sample_size)

Bag of words

To transfrom the corpus into a bag of words model, we use a Term-Document matrix

# Create a term document matrix
dtm <- TermDocumentMatrix(reuters_sample, control=list())

# Number of unique words in dictionary
nrow(dtm)
## [1] 4331

We transpose the matrix to get a Document-Term matrix

dtm_transp <- t(dtm)

K-means document clustering

dtm_tfxidf <- weightTfIdf(dtm)
## Warning in weightTfIdf(dtm): empty document(s): 14405 1710 13367 20066
## 13456 21366 18484 13965 20218 14060 20397 12549 9412 14433 7152 21127 20164
## 14420 6834 11547 1780
# k-means
m <- as.matrix(dtm_tfxidf)
rownames(m) <- 1:nrow(m)

#Normalize the vectors
norm_eucl <- function(m) m/apply(m, MARGIN=1, FUN=function(x) sum(x^2)^.5)
m_norm <- norm_eucl(m)

# Cluster into 10 clusters
cl <- kmeans(m_norm, 10)

# Show clusters using the first 2 principal components
plot(prcomp(m_norm)$x, col=cl$cl)

Additional postprocessing

Various natural language processing methods can be used to further extend our model, such as part-of-speech tagging or sentiment analysis. These methods can also be useful to get a better understanding of the data.

library(coreNLP)
initCoreNLP()

output <- annotateString(reuters_sample[[5]])
head(getDependency(output))
##   sentence governor dependent     type governorIdx dependentIdx govIndex
## 1        1     ROOT   replied     root           0           50       NA
## 2        1   Reagan President compound           2            1        2
## 3        1     said    Reagan    nsubj           3            2        3
## 4        1  replied      said      dep          50            3       50
## 5        1     said       pct     dobj           3            4        3
## 6        1     rate        US compound           8            5        8
##   depIndex
## 1       50
## 2        1
## 3        2
## 4        3
## 5        4
## 6        5
getSentiment(output)
##   id sentimentValue    sentiment
## 1  1              0 Verynegative
getParse(output)
## [1] "(ROOT\r\n  (SINV\r\n    (S\r\n      (NP (NNP President) (NNP Reagan))\r\n      (VP (VBD said)\r\n        (NP\r\n          (NP (NN pct))\r\n          (NP (NNP US) (JJ economic) (NN growth) (NN rate)))\r\n        (NP-TMP (JJ final) (NN quarter))\r\n        (S\r\n          (ADJP (JJ bad)\r\n            (SBAR\r\n              (S\r\n                (NP (DT The) (NNP Commerce) (NNP Department))\r\n                (VP (VBD said)\r\n                  (NP (NN rate) (NN growth))\r\n                  (S\r\n                    (NP (NNP Gross) (NNP National) (NNP Product) (NNP OctoberDecember) (NN period))\r\n                    (ADJP (RB slightly) (RBR less)\r\n                      (SBAR\r\n                        (S\r\n                          (NP\r\n                            (NP (JJ preliminary) (NN estimate) (NN pct))\r\n                            (VP (VBN made)\r\n                              (ADVP (JJR earlier) (IN At))\r\n                              (NP-TMP (NN time))))\r\n                          (VP (VBD said)\r\n                            (NP\r\n                              (NP (NN inflation))\r\n                              (VP (VBN measured)\r\n                                (SBAR\r\n                                  (S\r\n                                    (NP (NNP GNP) (NN price) (NN deflator))\r\n                                    (VP (VBD rose)\r\n                                      (SBAR\r\n                                        (S\r\n                                          (NP (NN pct) (NN period))\r\n                                          (VP (VBD Asked)\r\n                                            (NP\r\n                                              (NP (NN reaction) (NNP GNP) (NN report))\r\n                                              (NP (NNP White) (NNP House) (NN photo) (NN session) (NNP Reagan)))))))))))))))))))))))\r\n    (VP (VBD replied)\r\n      (S\r\n        (NP (PRP It))\r\n        (ADJP (JJ bad))))\r\n    (NP (NNP Reuter))))\r\n\r\n"
head(getToken(output))
##   sentence id     token     lemma CharacterOffsetBegin CharacterOffsetEnd
## 1        1  1 President President                    0                  9
## 2        1  2    Reagan    Reagan                   10                 16
## 3        1  3      said       say                   17                 21
## 4        1  4       pct       pct                   22                 25
## 5        1  5        US        US                   26                 28
## 6        1  6  economic  economic                   29                 37
##   POS      NER Speaker
## 1 NNP        O    PER0
## 2 NNP   PERSON    PER0
## 3 VBD        O    PER0
## 4  NN        O    PER0
## 5 NNP LOCATION    PER0
## 6  JJ        O    PER0
#Frequent Terms & associations
head(findFreqTerms(dtm, 100))
## [1] "billion" "dlrs"    "mln"     "pct"     "reuter"  "said"
head(findAssocs(dtm, "oil", .4))
## $oil
##     assuming       barrel   absolutely      alberta     beaubien 
##         0.82         0.82         0.76         0.76         0.76 
##     becoming        bleak        brake     cautious        class 
##         0.76         0.76         0.76         0.76         0.76 
##    commented  culminating      cushion    deferring      dipping 
##         0.76         0.76         0.76         0.76         0.76 
##       dismal         dome     downturn   dutchshell    exchanges 
##         0.76         0.76         0.76         0.76         0.76 
##        exxon       firmer     frontier       gobert    goverment 
##         0.76         0.76         0.76         0.76         0.76 
##         hans        highs      history         imoa     imperial 
##         0.76         0.76         0.76         0.76         0.76 
##     improved improvements    improving   indication    industrys 
##         0.76         0.76         0.76         0.76         0.76 
##     levesque       lifted         ltds       maciej     majority 
##         0.76         0.76         0.76         0.76         0.76 
##       norcen     optimism    otherwise  outweighing      painted 
##         0.76         0.76         0.76         0.76         0.76 
##        pared       peters         pgrt      picture      plexman 
##         0.76         0.76         0.76         0.76         0.76 
##    prevented    recording      recover       relief       robert 
##         0.76         0.76         0.76         0.76         0.76 
##       robust        royal         save        shell         shot 
##         0.76         0.76         0.76         0.76         0.76 
##         snap        steep strengthened  sustainable      toronto 
##         0.76         0.76         0.76         0.76         0.76 
##   tremendous          txc  uncertainty   unexpected         wilf 
##         0.76         0.76         0.76         0.76         0.76 
##          xon        crude  exploration         flow          low 
##         0.76         0.74         0.74         0.74         0.74 
##      canadas       prices    companies       helped  investments 
##         0.73         0.72         0.70         0.68         0.68 
##     spending   statements       levels   assistance    petroleum 
##         0.68         0.68         0.67         0.66         0.66 
##        sharp       canada      dropped         opec         cash 
##         0.62         0.61         0.61         0.61         0.60 
##  significant    operating         sees   attractive         face 
##         0.60         0.58         0.58         0.56         0.56 
##     industry       source     forecast       accord considerably 
##         0.56         0.56         0.55         0.53         0.53 
##       extent       follow  improvement    necessary       soared 
##         0.53         0.53         0.53         0.53         0.53 
##          ten        while      results       texaco    forecasts 
##         0.53         0.53         0.52         0.52         0.51 
##         kind         many   optimistic      outlook      revenue 
##         0.50         0.50         0.50         0.50         0.50 
##          aid          gas        owned         week        based 
##         0.49         0.49         0.49         0.48         0.47 
##     analysts         came     estimate       issued         last 
##         0.46         0.45         0.45         0.45         0.45 
##    selective       income          ltd      analyst    expressed 
##         0.45         0.44         0.44         0.43         0.43 
## associations      average        bring    difficult    extremely 
##         0.42         0.42         0.42         0.42         0.42 
##      falling       future     packages   reflecting      reports 
##         0.42         0.42         0.42         0.42         0.42 
##    shortterm     somewhat         year      billion       recent 
##         0.42         0.42         0.42         0.41         0.41 
##       supply      revised 
##         0.41         0.40

Local outlier factor

The local outlier factor (LOF) is a measure of outlierness that is calculated for each observation. The LOF takes into consideration the density of the neighbourhood around the observation to determine its outlierness [1]. We used the Rlof package to compute the local outlier factor for each document. To get the outliers, we order the documents by this factor.

library(Rlof)
## Loading required package: doParallel
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
lof_sample <- lof(dtm_transp, k = 10, cores = 4)

outlier_data<-data.frame(docId = rownames(dtm_transp), lof = lof_sample)
order.pop <- order(outlier_data$lof, decreasing = TRUE)
sorted_outliers <- outlier_data[order.pop, ]

Most outlying text:

reuters_sample[[sorted_outliers[1,1]]]$content
## [1] "The Treasury said modifying computerized Coupons Under BookEntry Safekeeping CUBES program permitting line transfers funds payment deposittaking institutions Each institution will able keep CUBES account local Federal Reserve Bank branch handle trading nationwide Treasury release said The new system expected operation late year Currently CUBES accounts maintained line Federal Reserve Bank New York Treasury said Reuter "

Least outlying text:

reuters_sample[[sorted_outliers[length(sorted_outliers),1]]]$content
## character(0)

Unfortunately, the links to the recommended OutRules package on prof. Müller’s website are broken and we could not find it elsewhere or through the package manager.

2. Outlier detection in labeled data

CODB

First we preprocess the data to make an arff file.

library(foreign)
library(tm.corpus.Reuters21578)
data(Reuters21578)
reuters <- Reuters21578

reuters <- tm_map(reuters, removeWords, stopwords("english"))

reuters <- tm_map(reuters, removeNumbers)
reuters <- tm_map(reuters, removePunctuation)
reuters <- tm_map(reuters, stripWhitespace)

set.seed(30)

sample_size=50
reuters_sample <- sample(tm_filter(reuters, function(x) length(x$meta$places) != 0 && length(strsplit(x$meta$places, " ")) == 1 && (x$meta$lewissplit == "TRAIN" || x$meta$lewissplit == "TEST")), sample_size)

# bag of words
bag <- TermDocumentMatrix(reuters_sample, control=list())
x=inspect(bag)

bag_transp <- t(bag)
x=inspect(bag_transp)
nr=nrow(x)

a=c()
#create places vector from metadata
for(i in c(1:nr)){
  atr=reuters_sample[[i]]$meta$places
  # add to vector if the sample is labeled
  if(length(atr)>0){
    a <- c(a, atr)
  }
  else{
    #N/A for unlabeled sample
    a <- c(a, "N/A")
  }
}

# merge TermDocumentMatrix with places vector
x <- data.frame(x,a)

#.arff needs a class attribute, if class is also a term, rename it to classX
names(x)[names(x)=="class"] <- "classX"

#change last column name to class
colnames(x)[ncol(x)]="class"

# write to .arff file
o_name="bag2"
arff_name=paste(o_name,"arff",sep=".")

write.arff(x, arff_name)

#WEKA-CODB
out=system(paste("java -cp WEKA-CODB.jar motaz.CODB -t ",arff_name,sep=""), intern=TRUE)

#write output of WEKA-CODB to .out file
write(out, paste(o_name,"out",sep="."))

We can also print the top outliers:

#print top num_out outliers (filter WEKA-CODB output)
position=pmatch("=== Top 10 COF ===",out)

num_out=5
i=0

while(i<num_out){
  #Instance number
  print(gsub(".*\\((.*)\\).*", "\\1", out[position+1]))
  
  #PCL, Deviation, KDist
  print(out[position+2])
  
  #---------
  print(out[position+3])
  
  i=i+1
  position=position+3
}
## [1] "16."
## [1] "PCL = 6.0, Deviation = 167,66, KDist = 15,55"
## [1] "------------------------------------------"
## [1] "15."
## [1] "PCL = 6.0, Deviation = 170,39, KDist = 15,91"
## [1] "------------------------------------------"
## [1] "10."
## [1] "PCL = 6.0, Deviation = 172,73, KDist = 16,58"
## [1] "------------------------------------------"
## [1] "42."
## [1] "PCL = 6.0, Deviation = 171,53, KDist = 18,44"
## [1] "------------------------------------------"
## [1] "44."
## [1] "PCL = 6.0, Deviation = 171,81, KDist = 18,48"
## [1] "------------------------------------------"
i=0
num_out=5
rms=c()
position=pmatch("=== Top 10 COF ===",out)

while(i<num_out){
  #Instance number
  ins_n=gsub(".*\\((.*)\\).*", "\\1", out[position+1])
  ins_n=gsub("\\.", "", ins_n)
  print(ins_n)
  
  #append instance number to rms
  #arff counts from 0, bag_transp from 1
  rms <- c(rms, as.numeric(ins_n)+1)
  
  #PCL, Deviation, KDist
  print(out[position+2])
  
  #---------
  print(out[position+3])
  
  i=i+1
  position=position+3
}
## [1] "16"
## [1] "PCL = 6.0, Deviation = 167,66, KDist = 15,55"
## [1] "------------------------------------------"
## [1] "15"
## [1] "PCL = 6.0, Deviation = 170,39, KDist = 15,91"
## [1] "------------------------------------------"
## [1] "10"
## [1] "PCL = 6.0, Deviation = 172,73, KDist = 16,58"
## [1] "------------------------------------------"
## [1] "42"
## [1] "PCL = 6.0, Deviation = 171,53, KDist = 18,44"
## [1] "------------------------------------------"
## [1] "44"
## [1] "PCL = 6.0, Deviation = 171,81, KDist = 18,48"
## [1] "------------------------------------------"
#remove outliers
x_rm<-x
x_rm<-x_rm[-rms,]

RF-OEX (Random Forest Outlier Explanation)

We were also able to analyze class-based outliers By using RF-OEX. The results for the smaller dataset are attached in rf-oex50.txt. So far, we haven’t been able to analyze the larger dataset because of the runtime.

Top 15 outliers from the text file:

=== Summary Outlier Score ===

( 0.) Instance 44 Class: usa Result Outlier Score: 4,33.

( 1.) Instance 39 Class: usa Result Outlier Score: 3,32.

( 2.) Instance 48 Class: hong-kong Result Outlier Score: 2,70.

( 3.) Instance 21 Class: uk Result Outlier Score: 2,62.

( 4.) Instance 41 Class: usa Result Outlier Score: 1,78.

( 5.) Instance 30 Class: japan Result Outlier Score: 1,76.

( 6.) Instance 37 Class: usa Result Outlier Score: 1,73.

( 7.) Instance 1 Class: usa Result Outlier Score: 1,48.

( 8.) Instance 7 Class: usa Result Outlier Score: 1,42.

( 9.) Instance 18 Class: japan Result Outlier Score: 1,14.

(10.) Instance 23 Class: usa Result Outlier Score: 1,13.

(11.) Instance 4 Class: bangladesh Result Outlier Score: 1,01.

(12.) Instance 5 Class: sweden Result Outlier Score: 1,01.

(13.) Instance 13 Class: west-germany Result Outlier Score: 1,01.

(14.) Instance 19 Class: brazil Result Outlier Score: 1,01.

(15.) Instance 26 Class: australia Result Outlier Score: 1,01.

Experiment: K-nearest neighbors clustering

We also tried using the knn method for category labels. For simplicity, we chose only documents with one category. The dataset already had labels for training and test data.

library("class")

knn_sample <- sample(tm_filter(reuters, function(x) length(x$meta$places) != 0 && length(strsplit(x$meta$places, " ")) == 1 && (x$meta$lewissplit == "TRAIN" || x$meta$lewissplit == "TEST")), sample_size)

getIds = function(corpus){
  result = list()
  for(i in c(1:length(corpus))){
    result = c(result, corpus[[i]]$meta$id)
  }
  return(result)
}

byId = function(corpus, id){
  return(tm_filter(corpus, function(x) x$meta$id == id))
}

dtm <- TermDocumentMatrix(knn_sample, control=list())
dtm_transp <- t(dtm)

wordMatrix = as.data.frame(as.matrix(dtm_transp))

trainDocs <- tm_filter(knn_sample, function(x) x$meta$lewissplit == "TRAIN")
testDocs <- tm_filter(knn_sample, function(x) x$meta$lewissplit == "TEST")

place_frequencies<-sapply(testDocs, function(x) x$meta$places)
as.data.frame(table(place_frequencies))
##    place_frequencies Freq
## 1             canada    1
## 2             france    1
## 3            hungary    1
## 4          indonesia    1
## 5              japan    4
## 6            lebanon    1
## 7          singapore    1
## 8              spain    1
## 9                 uk    2
## 10               usa   40
trainIds = rapply(getIds(trainDocs), c)
testIds = rapply(getIds(testDocs), c)

ml.train<-wordMatrix[trainIds, ]
ml.trainLabels<-rapply(lapply(trainIds, function(x) reuters[[x]]$meta$places), c)

ml.test<-wordMatrix[testIds, ]
ml.testLabels<-rapply(lapply(testIds, function(x) reuters[[x]]$meta$places), c)

knnResult <- knn(train = ml.train, test = ml.test, cl = ml.trainLabels, k = 3)
a <- as.data.frame(table(ml.testLabels, knnResult))

order.pop <- order(a$Freq, decreasing = TRUE)
sorted_by_freq <- a[order.pop, ]
sorted_by_freq[sorted_by_freq$Freq != 0, ]
##     ml.testLabels knnResult Freq
## 180           usa       usa   39
## 175         japan       usa    3
## 179            uk       usa    2
## 70            usa    france    1
## 165         japan        uk    1
## 171        canada       usa    1
## 172        france       usa    1
## 173       hungary       usa    1
## 174     indonesia       usa    1
## 176       lebanon       usa    1
## 177     singapore       usa    1
## 178         spain       usa    1

From the result we can see that the classifier learned to consider english-speaking countries to be similar, but it is also heavily biased towards the most frequent category.

knn_sample <- sample(tm_filter(reuters, function(x) x$meta$topics=="YES" && length(strsplit(x$meta$topics_cat, " ")) == 1 && (x$meta$lewissplit == "TRAIN" || x$meta$lewissplit == "TEST")), sample_size)

dtm <- TermDocumentMatrix(knn_sample, control=list())
dtm_transp <- t(dtm)

wordMatrix = as.data.frame(as.matrix(dtm_transp))

trainDocs <- tm_filter(knn_sample, function(x) x$meta$lewissplit == "TRAIN")
testDocs <- tm_filter(knn_sample, function(x) x$meta$lewissplit == "TEST")

topic_frequencies<-sapply(testDocs, function(x) x$meta$topics_cat)
as.data.frame(table(topic_frequencies))
##    topic_frequencies Freq
## 1                acq   13
## 2            carcass    1
## 3              cocoa    1
## 4            coconut    1
## 5             coffee    1
## 6              crude    2
## 7               earn   19
## 8               heat    1
## 9           interest    1
## 10          money-fx    1
## 11      money-supply    1
## 12              ship    2
## 13             sugar    1
## 14             trade    3
## 15           veg-oil    1
trainIds = rapply(getIds(trainDocs), c)
testIds = rapply(getIds(testDocs), c)

ml.train<-wordMatrix[trainIds, ]
ml.trainLabels<-rapply(lapply(trainIds, function(x) reuters[[x]]$meta$topics_cat), c)

ml.test<-wordMatrix[testIds, ]
ml.testLabels<-rapply(lapply(testIds, function(x) reuters[[x]]$meta$topics_cat), c)

knnResult <- knn(train = ml.train, test = ml.test, cl = ml.trainLabels, k = 3)
a <- as.data.frame(table(ml.testLabels, knnResult))

order.pop <- order(a$Freq, decreasing = TRUE)
sorted_by_freq <- a[order.pop, ]
sorted_by_freq[sorted_by_freq$Freq != 0, ]
##     ml.testLabels knnResult Freq
## 112          earn      earn   16
## 1             acq       acq   11
## 7            earn       acq    3
## 6           crude       acq    2
## 12           ship       acq    2
## 106           acq      earn    2
## 119         trade      earn    2
## 5          coffee       acq    1
## 8            heat       acq    1
## 9        interest       acq    1
## 10       money-fx       acq    1
## 11   money-supply       acq    1
## 13          sugar       acq    1
## 14          trade       acq    1
## 15        veg-oil       acq    1
## 107       carcass      earn    1
## 109       coconut      earn    1
## 213         cocoa  money-fx    1

We can see a similar bias with category labels. The same result was observed for different values of k.

3. Classification on different learning algorithms

We used the RTextTools package to evaluate classification on several learning algorithms. A short introduction to RTextTools can be found here: https://journal.r-project.org/archive/2013-1/collingwood-jurka-boydstun-etal.pdf

All methods were run on a sample of 5000 documents, with a 3000:2000 split between training and testing data.

data(Reuters21578)

reuters <- Reuters21578
reuters <- tm_map(reuters, removeWords, stopwords("english"))

reuters <- tm_map(reuters, removeNumbers)
reuters <- tm_map(reuters, removePunctuation)
reuters <- tm_map(reuters, stripWhitespace)

# Filter
reuters_sample() <- sample(tm_filter(reuters, function(x) x$meta$topics=="YES" &&
                      length(strsplit(x$meta$topics_cat, " ")) == 1 && 
                      (x$meta$lewissplit == "TRAIN" || x$meta$lewissplit == "TEST")), sample_size)

sample_size <- 5000                      
reuters_sample <- sample(tm_filter(reuters, function(x) length(x$meta$places) != 0 && 
                      length(strsplit(x$meta$places, " ")) == 1 && 
                      (x$meta$lewissplit == "TRAIN" || x$meta$lewissplit == "TEST")), sample_size)



# Convert corpus to data frame
library(RTextTools)
reuters_sample <- tm_filter(reuters_sample, function(x) length(x$content) != 0)

dataframe<-data.frame(text=unlist(sapply(reuters_sample, function(x) x$content)), 
                      id=unlist(sapply(reuters_sample, function(x) x$meta$id)),
                      places=unlist(sapply(reuters_sample, function(x) paste(x$meta$places, collapse = " "))),
                      stringsAsFactors=F)

# Recode places
places <- unique(dataframe$places)
codes <- c(1:length(places))
library(plyr)
dataframe$codes <- mapvalues(dataframe$places, from = places, to = codes)


doc_matrix <- create_matrix(dataframe$text, language="english", removeNumbers=TRUE, stemWords=TRUE, removeSparseTerms=.998)
container <- create_container(doc_matrix, dataframe$codes, trainSize=1:3000,testSize=3001:5000, virgin=FALSE)

SVM <- train_model(container,"SVM")
MAXENT <- train_model(container,"MAXENT")
SLDA <- train_model(container,"SLDA")
BOOSTING <- train_model(container,"BOOSTING")
BAGGING <- train_model(container,"BAGGING")
RF <- train_model(container,"RF")
NNET <- train_model(container,"NNET")
TREE <- train_model(container,"TREE")

SVM_CLASSIFY <- classify_model(container, SVM)
MAXENT_CLASSIFY <- classify_model(container, MAXENT)
SLDA_CLASSIFY <- classify_model(container, SLDA)
BOOSTING_CLASSIFY <- classify_model(container, BOOSTING)
BAGGING_CLASSIFY <- classify_model(container, BAGGING)
RF_CLASSIFY <- classify_model(container, RF)
NNET_CLASSIFY <- classify_model(container, NNET)
TREE_CLASSIFY <- classify_model(container, TREE)

analytics <- create_analytics(container,
                              cbind(SVM_CLASSIFY, SLDA_CLASSIFY,
                                    BOOSTING_CLASSIFY, BAGGING_CLASSIFY,
                                    RF_CLASSIFY,
                                    NNET_CLASSIFY,TREE_CLASSIFY,
                                    MAXENT_CLASSIFY))
summary(analytics)

Due to the fact that the script has a runtime in the order of hours, a screenshot of the results is provided.