library(tm)
library(tm.corpus.Reuters21578)
data(Reuters21578)
reuters <- Reuters21578
# Clean the corpus: the most common words carry very little
# information, so we remove stopwords, numbers and punctuation.
# Note that some documents end up empty after cleaning.
reuters <- tm_map(reuters, removeWords, stopwords("english"))
reuters <- tm_map(reuters, removeNumbers)
reuters <- tm_map(reuters, removePunctuation)
reuters <- tm_map(reuters, stripWhitespace)
Due to the size of the data, we select a random sample of the documents to reduce computation time. Setting a constant seed allows us to get the same random sample every time.
set.seed(30)
sample_size <- 200
reuters_sample <- sample(reuters, sample_size)
To transform the corpus into a bag-of-words model, we use a term-document matrix.
# Create a term document matrix
dtm <- TermDocumentMatrix(reuters_sample, control=list())
# Number of unique words in dictionary
nrow(dtm)
## [1] 4331
We transpose the matrix to get a document-term matrix, and weight the terms by tf-idf, which down-weights words that occur in many documents.
dtm_transp <- t(dtm)
dtm_tfxidf <- weightTfIdf(dtm)
## Warning in weightTfIdf(dtm): empty document(s): 14405 1710 13367 20066
## 13456 21366 18484 13965 20218 14060 20397 12549 9412 14433 7152 21127 20164
## 14420 6834 11547 1780
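The warning refers to documents that contain no terms after preprocessing: weightTfIdf (with its default normalize = TRUE) divides each term count by the document length, which is zero for those documents. Roughly, the weighting computes the following (a minimal sketch of tf-idf, not part of the original pipeline):
# sketch of tf-idf weighting on the terms-x-documents matrix
tf <- as.matrix(dtm)                          # raw term counts
df <- rowSums(tf > 0)                         # document frequency per term
idf <- log2(ncol(tf) / df)                    # inverse document frequency
tfidf_manual <- t(t(tf) / colSums(tf)) * idf  # empty documents divide by zero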
# k-means
m <- as.matrix(dtm_tfxidf)
rownames(m) <- 1:nrow(m)
# Normalize the vectors to unit Euclidean length
norm_eucl <- function(m) m/apply(m, MARGIN=1, FUN=function(x) sum(x^2)^.5)
m_norm <- norm_eucl(m)
# Cluster into 10 clusters
cl <- kmeans(m_norm, 10)
# Show clusters using the first 2 principal components
plot(prcomp(m_norm)$x, col = cl$cluster)
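The choice of 10 clusters is arbitrary; a quick elbow plot of the total within-cluster sum of squares over several values of k could be used to sanity-check it (our own sketch, not part of the original run):
# elbow sketch: total within-cluster SS for k = 2..15
wss <- sapply(2:15, function(k) kmeans(m_norm, k)$tot.withinss)
plot(2:15, wss, type = "b", xlab = "k", ylab = "total within-cluster SS")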
Various natural language processing methods, such as part-of-speech tagging or sentiment analysis, can be used to extend our model further and to get a better understanding of the data.
library(coreNLP)
initCoreNLP()
# annotate the raw text of the fifth sampled document
output <- annotateString(reuters_sample[[5]]$content)
head(getDependency(output))
## sentence governor dependent type governorIdx dependentIdx govIndex
## 1 1 ROOT replied root 0 50 NA
## 2 1 Reagan President compound 2 1 2
## 3 1 said Reagan nsubj 3 2 3
## 4 1 replied said dep 50 3 50
## 5 1 said pct dobj 3 4 3
## 6 1 rate US compound 8 5 8
## depIndex
## 1 50
## 2 1
## 3 2
## 4 3
## 5 4
## 6 5
getSentiment(output)
## id sentimentValue sentiment
## 1 1 0 Verynegative
getParse(output)
## [1] "(ROOT\r\n (SINV\r\n (S\r\n (NP (NNP President) (NNP Reagan))\r\n (VP (VBD said)\r\n (NP\r\n (NP (NN pct))\r\n (NP (NNP US) (JJ economic) (NN growth) (NN rate)))\r\n (NP-TMP (JJ final) (NN quarter))\r\n (S\r\n (ADJP (JJ bad)\r\n (SBAR\r\n (S\r\n (NP (DT The) (NNP Commerce) (NNP Department))\r\n (VP (VBD said)\r\n (NP (NN rate) (NN growth))\r\n (S\r\n (NP (NNP Gross) (NNP National) (NNP Product) (NNP OctoberDecember) (NN period))\r\n (ADJP (RB slightly) (RBR less)\r\n (SBAR\r\n (S\r\n (NP\r\n (NP (JJ preliminary) (NN estimate) (NN pct))\r\n (VP (VBN made)\r\n (ADVP (JJR earlier) (IN At))\r\n (NP-TMP (NN time))))\r\n (VP (VBD said)\r\n (NP\r\n (NP (NN inflation))\r\n (VP (VBN measured)\r\n (SBAR\r\n (S\r\n (NP (NNP GNP) (NN price) (NN deflator))\r\n (VP (VBD rose)\r\n (SBAR\r\n (S\r\n (NP (NN pct) (NN period))\r\n (VP (VBD Asked)\r\n (NP\r\n (NP (NN reaction) (NNP GNP) (NN report))\r\n (NP (NNP White) (NNP House) (NN photo) (NN session) (NNP Reagan)))))))))))))))))))))))\r\n (VP (VBD replied)\r\n (S\r\n (NP (PRP It))\r\n (ADJP (JJ bad))))\r\n (NP (NNP Reuter))))\r\n\r\n"
head(getToken(output))
## sentence id token lemma CharacterOffsetBegin CharacterOffsetEnd
## 1 1 1 President President 0 9
## 2 1 2 Reagan Reagan 10 16
## 3 1 3 said say 17 21
## 4 1 4 pct pct 22 25
## 5 1 5 US US 26 28
## 6 1 6 economic economic 29 37
## POS NER Speaker
## 1 NNP O PER0
## 2 NNP PERSON PER0
## 3 VBD O PER0
## 4 NN O PER0
## 5 NNP LOCATION PER0
## 6 JJ O PER0
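Since every token carries a POS tag, we can, for example, pull out just the nouns of the document (a small sketch based on the token table above):
# keep tokens whose Penn Treebank tag starts with NN (nouns)
tok <- getToken(output)
head(tok$token[grepl("^NN", tok$POS)])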
# Frequent terms and associations
head(findFreqTerms(dtm, 100))
## [1] "billion" "dlrs" "mln" "pct" "reuter" "said"
head(findAssocs(dtm, "oil", .4))
## $oil
## assuming barrel absolutely alberta beaubien
## 0.82 0.82 0.76 0.76 0.76
## becoming bleak brake cautious class
## 0.76 0.76 0.76 0.76 0.76
## commented culminating cushion deferring dipping
## 0.76 0.76 0.76 0.76 0.76
## dismal dome downturn dutchshell exchanges
## 0.76 0.76 0.76 0.76 0.76
## exxon firmer frontier gobert goverment
## 0.76 0.76 0.76 0.76 0.76
## hans highs history imoa imperial
## 0.76 0.76 0.76 0.76 0.76
## improved improvements improving indication industrys
## 0.76 0.76 0.76 0.76 0.76
## levesque lifted ltds maciej majority
## 0.76 0.76 0.76 0.76 0.76
## norcen optimism otherwise outweighing painted
## 0.76 0.76 0.76 0.76 0.76
## pared peters pgrt picture plexman
## 0.76 0.76 0.76 0.76 0.76
## prevented recording recover relief robert
## 0.76 0.76 0.76 0.76 0.76
## robust royal save shell shot
## 0.76 0.76 0.76 0.76 0.76
## snap steep strengthened sustainable toronto
## 0.76 0.76 0.76 0.76 0.76
## tremendous txc uncertainty unexpected wilf
## 0.76 0.76 0.76 0.76 0.76
## xon crude exploration flow low
## 0.76 0.74 0.74 0.74 0.74
## canadas prices companies helped investments
## 0.73 0.72 0.70 0.68 0.68
## spending statements levels assistance petroleum
## 0.68 0.68 0.67 0.66 0.66
## sharp canada dropped opec cash
## 0.62 0.61 0.61 0.61 0.60
## significant operating sees attractive face
## 0.60 0.58 0.58 0.56 0.56
## industry source forecast accord considerably
## 0.56 0.56 0.55 0.53 0.53
## extent follow improvement necessary soared
## 0.53 0.53 0.53 0.53 0.53
## ten while results texaco forecasts
## 0.53 0.53 0.52 0.52 0.51
## kind many optimistic outlook revenue
## 0.50 0.50 0.50 0.50 0.50
## aid gas owned week based
## 0.49 0.49 0.49 0.48 0.47
## analysts came estimate issued last
## 0.46 0.45 0.45 0.45 0.45
## selective income ltd analyst expressed
## 0.45 0.44 0.44 0.43 0.43
## associations average bring difficult extremely
## 0.42 0.42 0.42 0.42 0.42
## falling future packages reflecting reports
## 0.42 0.42 0.42 0.42 0.42
## shortterm somewhat year billion recent
## 0.42 0.42 0.42 0.41 0.41
## supply revised
## 0.41 0.40
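findAssocs reports, for the given term, every other term whose per-document frequencies correlate with it at or above the stated limit. The same values can be reproduced with cor() on the count matrix (a small sketch; this should match the 0.74 reported for crude above):
# Pearson correlation of the counts of "oil" and "crude" across documents
m_counts <- as.matrix(dtm)
cor(m_counts["oil", ], m_counts["crude", ])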
The local outlier factor (LOF) is an outlier score computed for each observation. It compares the density of the observation's neighbourhood with the density around its neighbours, so points lying in sparser regions than their neighbours receive high scores [1]. We used the Rlof package to compute the local outlier factor for each document, and then ordered the documents by this score to find the outliers.
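To make the density-ratio idea concrete, here is a minimal from-scratch LOF sketch on a toy two-dimensional dataset (for illustration only; the document scores below come from Rlof):
lof_manual <- function(X, k) {
  d <- as.matrix(dist(X))                    # pairwise Euclidean distances
  n <- nrow(d)
  nn <- t(apply(d, 1, function(r) order(r)[2:(k + 1)]))  # k nearest neighbours
  kdist <- apply(d, 1, function(r) sort(r)[k + 1])       # k-distance per point
  # local reachability density: inverse mean reachability distance
  lrd <- sapply(1:n, function(p) 1 / mean(pmax(kdist[nn[p, ]], d[p, nn[p, ]])))
  # LOF: mean ratio of the neighbours' densities to the point's own density
  sapply(1:n, function(p) mean(lrd[nn[p, ]]) / lrd[p])
}
set.seed(1)
toy <- rbind(matrix(rnorm(40), ncol = 2), c(5, 5))  # 20 points plus one outlier
round(lof_manual(toy, k = 3), 2)                    # the last score is the largest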
library(Rlof)
## Loading required package: doParallel
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
lof_sample <- lof(as.matrix(dtm_transp), k = 10, cores = 4)
outlier_data <- data.frame(docId = rownames(dtm_transp), lof = lof_sample)
order.pop <- order(outlier_data$lof, decreasing = TRUE)
sorted_outliers <- outlier_data[order.pop, ]
Most outlying text:
reuters_sample[[sorted_outliers[1,1]]]$content
## [1] "The Treasury said modifying computerized Coupons Under BookEntry Safekeeping CUBES program permitting line transfers funds payment deposittaking institutions Each institution will able keep CUBES account local Federal Reserve Bank branch handle trading nationwide Treasury release said The new system expected operation late year Currently CUBES accounts maintained line Federal Reserve Bank New York Treasury said Reuter "
Least outlying text:
reuters_sample[[sorted_outliers[nrow(sorted_outliers),1]]]$content
## character(0)
Unfortunately, the links to the recommended OutRules package on Prof. Müller’s website are broken, and we could not find the package elsewhere or through the package manager.
First, we preprocess the data into an .arff file.
library(foreign)
library(tm.corpus.Reuters21578)
data(Reuters21578)
reuters <- Reuters21578
reuters <- tm_map(reuters, removeWords, stopwords("english"))
reuters <- tm_map(reuters, removeNumbers)
reuters <- tm_map(reuters, removePunctuation)
reuters <- tm_map(reuters, stripWhitespace)
set.seed(30)
sample_size <- 50
reuters_sample <- sample(tm_filter(reuters, function(x) length(x$meta$places) != 0 && length(strsplit(x$meta$places, " ")) == 1 && (x$meta$lewissplit == "TRAIN" || x$meta$lewissplit == "TEST")), sample_size)
# bag of words
bag <- TermDocumentMatrix(reuters_sample, control=list())
bag_transp <- t(bag)
# dense document-term matrix (documents as rows)
x <- as.matrix(bag_transp)
nr <- nrow(x)
a <- c()
# create places vector from metadata
for (i in 1:nr) {
  atr <- reuters_sample[[i]]$meta$places
  if (length(atr) > 0) {
    # add to vector if the sample is labeled
    a <- c(a, atr)
  } else {
    # N/A for unlabeled sample
    a <- c(a, "N/A")
  }
}
# merge the document-term matrix with the places vector
x <- data.frame(x, a)
# .arff needs a class attribute; if "class" is also a term, rename that term to "classX"
names(x)[names(x) == "class"] <- "classX"
# change the last column name to "class"
colnames(x)[ncol(x)] <- "class"
# write to .arff file
o_name <- "bag2"
arff_name <- paste(o_name, "arff", sep = ".")
write.arff(x, arff_name)
# WEKA-CODB
out <- system(paste("java -cp WEKA-CODB.jar motaz.CODB -t ", arff_name, sep = ""), intern = TRUE)
# write the output of WEKA-CODB to a .out file
write(out, paste(o_name, "out", sep = "."))
We can also print the top outliers:
# print the top num_out outliers (filter the WEKA-CODB output)
position <- pmatch("=== Top 10 COF ===", out)
num_out <- 5
i <- 0
while (i < num_out) {
  # instance number
  print(gsub(".*\\((.*)\\).*", "\\1", out[position + 1]))
  # PCL, Deviation, KDist
  print(out[position + 2])
  # separator line
  print(out[position + 3])
  i <- i + 1
  position <- position + 3
}
## [1] "16."
## [1] "PCL = 6.0, Deviation = 167,66, KDist = 15,55"
## [1] "------------------------------------------"
## [1] "15."
## [1] "PCL = 6.0, Deviation = 170,39, KDist = 15,91"
## [1] "------------------------------------------"
## [1] "10."
## [1] "PCL = 6.0, Deviation = 172,73, KDist = 16,58"
## [1] "------------------------------------------"
## [1] "42."
## [1] "PCL = 6.0, Deviation = 171,53, KDist = 18,44"
## [1] "------------------------------------------"
## [1] "44."
## [1] "PCL = 6.0, Deviation = 171,81, KDist = 18,48"
## [1] "------------------------------------------"
To remove the outliers, we walk through the same output again, this time collecting the instance numbers:
i <- 0
num_out <- 5
rms <- c()
position <- pmatch("=== Top 10 COF ===", out)
while (i < num_out) {
  # instance number, with the trailing dot stripped
  ins_n <- gsub(".*\\((.*)\\).*", "\\1", out[position + 1])
  ins_n <- gsub("\\.", "", ins_n)
  print(ins_n)
  # append the instance number to rms
  # (the .arff file counts from 0, bag_transp from 1)
  rms <- c(rms, as.numeric(ins_n) + 1)
  # PCL, Deviation, KDist
  print(out[position + 2])
  # separator line
  print(out[position + 3])
  i <- i + 1
  position <- position + 3
}
## [1] "16"
## [1] "PCL = 6.0, Deviation = 167,66, KDist = 15,55"
## [1] "------------------------------------------"
## [1] "15"
## [1] "PCL = 6.0, Deviation = 170,39, KDist = 15,91"
## [1] "------------------------------------------"
## [1] "10"
## [1] "PCL = 6.0, Deviation = 172,73, KDist = 16,58"
## [1] "------------------------------------------"
## [1] "42"
## [1] "PCL = 6.0, Deviation = 171,53, KDist = 18,44"
## [1] "------------------------------------------"
## [1] "44"
## [1] "PCL = 6.0, Deviation = 171,81, KDist = 18,48"
## [1] "------------------------------------------"
# remove the outlier rows
x_rm <- x[-rms, ]
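The cleaned data frame could then be written to a new .arff file and scored again in the same way (a one-line sketch; bag2_clean is a hypothetical file name):
write.arff(x_rm, "bag2_clean.arff")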
We were also able to analyze class-based outliers using RF-OEX. The results for the smaller dataset are attached in rf-oex50.txt. So far, we have not been able to analyze the larger dataset because of the runtime.
Top outliers from the text file:
=== Summary Outlier Score ===
( 0.) Instance 44 Class: usa Result Outlier Score: 4,33.
( 1.) Instance 39 Class: usa Result Outlier Score: 3,32.
( 2.) Instance 48 Class: hong-kong Result Outlier Score: 2,70.
( 3.) Instance 21 Class: uk Result Outlier Score: 2,62.
( 4.) Instance 41 Class: usa Result Outlier Score: 1,78.
( 5.) Instance 30 Class: japan Result Outlier Score: 1,76.
( 6.) Instance 37 Class: usa Result Outlier Score: 1,73.
( 7.) Instance 1 Class: usa Result Outlier Score: 1,48.
( 8.) Instance 7 Class: usa Result Outlier Score: 1,42.
( 9.) Instance 18 Class: japan Result Outlier Score: 1,14.
(10.) Instance 23 Class: usa Result Outlier Score: 1,13.
(11.) Instance 4 Class: bangladesh Result Outlier Score: 1,01.
(12.) Instance 5 Class: sweden Result Outlier Score: 1,01.
(13.) Instance 13 Class: west-germany Result Outlier Score: 1,01.
(14.) Instance 19 Class: brazil Result Outlier Score: 1,01.
(15.) Instance 26 Class: australia Result Outlier Score: 1,01.
We also tried using the knn method to predict document labels, starting with the place label. For simplicity, we chose only documents with a single label. The dataset already marks each document as training or test data (the LEWISSPLIT attribute).
library("class")
knn_sample <- sample(tm_filter(reuters, function(x) length(x$meta$places) != 0 && length(strsplit(x$meta$places, " ")) == 1 && (x$meta$lewissplit == "TRAIN" || x$meta$lewissplit == "TEST")), sample_size)
# collect the document IDs of a corpus
getIds <- function(corpus) {
  result <- list()
  for (i in seq_along(corpus)) {
    result <- c(result, corpus[[i]]$meta$id)
  }
  return(result)
}
# retrieve a document by its ID
byId <- function(corpus, id) {
  return(tm_filter(corpus, function(x) x$meta$id == id))
}
dtm <- TermDocumentMatrix(knn_sample, control=list())
dtm_transp <- t(dtm)
wordMatrix <- as.data.frame(as.matrix(dtm_transp))
trainDocs <- tm_filter(knn_sample, function(x) x$meta$lewissplit == "TRAIN")
testDocs <- tm_filter(knn_sample, function(x) x$meta$lewissplit == "TEST")
place_frequencies<-sapply(testDocs, function(x) x$meta$places)
as.data.frame(table(place_frequencies))
## place_frequencies Freq
## 1 canada 1
## 2 france 1
## 3 hungary 1
## 4 indonesia 1
## 5 japan 4
## 6 lebanon 1
## 7 singapore 1
## 8 spain 1
## 9 uk 2
## 10 usa 40
trainIds <- rapply(getIds(trainDocs), c)
testIds <- rapply(getIds(testDocs), c)
ml.train<-wordMatrix[trainIds, ]
ml.trainLabels<-rapply(lapply(trainIds, function(x) reuters[[x]]$meta$places), c)
ml.test<-wordMatrix[testIds, ]
ml.testLabels<-rapply(lapply(testIds, function(x) reuters[[x]]$meta$places), c)
knnResult <- knn(train = ml.train, test = ml.test, cl = ml.trainLabels, k = 3)
a <- as.data.frame(table(ml.testLabels, knnResult))
order.pop <- order(a$Freq, decreasing = TRUE)
sorted_by_freq <- a[order.pop, ]
sorted_by_freq[sorted_by_freq$Freq != 0, ]
## ml.testLabels knnResult Freq
## 180 usa usa 39
## 175 japan usa 3
## 179 uk usa 2
## 70 usa france 1
## 165 japan uk 1
## 171 canada usa 1
## 172 france usa 1
## 173 hungary usa 1
## 174 indonesia usa 1
## 176 lebanon usa 1
## 177 singapore usa 1
## 178 spain usa 1
From the result we can see that the classifier learned to consider English-speaking countries similar, but it is also heavily biased towards the most frequent label.
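The bias can be quantified by comparing the knn accuracy against the trivial baseline that always predicts the most frequent label, usa (a quick check using the objects above):
# overall accuracy of knn vs. the majority-class baseline
knn_acc <- mean(as.character(knnResult) == ml.testLabels)
baseline_acc <- max(table(ml.testLabels)) / length(ml.testLabels)
c(knn = knn_acc, baseline = baseline_acc)
We then repeat the experiment with the topic labels: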
knn_sample <- sample(tm_filter(reuters, function(x) x$meta$topics=="YES" && length(strsplit(x$meta$topics_cat, " ")) == 1 && (x$meta$lewissplit == "TRAIN" || x$meta$lewissplit == "TEST")), sample_size)
dtm <- TermDocumentMatrix(knn_sample, control=list())
dtm_transp <- t(dtm)
wordMatrix <- as.data.frame(as.matrix(dtm_transp))
trainDocs <- tm_filter(knn_sample, function(x) x$meta$lewissplit == "TRAIN")
testDocs <- tm_filter(knn_sample, function(x) x$meta$lewissplit == "TEST")
topic_frequencies<-sapply(testDocs, function(x) x$meta$topics_cat)
as.data.frame(table(topic_frequencies))
## topic_frequencies Freq
## 1 acq 13
## 2 carcass 1
## 3 cocoa 1
## 4 coconut 1
## 5 coffee 1
## 6 crude 2
## 7 earn 19
## 8 heat 1
## 9 interest 1
## 10 money-fx 1
## 11 money-supply 1
## 12 ship 2
## 13 sugar 1
## 14 trade 3
## 15 veg-oil 1
trainIds <- rapply(getIds(trainDocs), c)
testIds <- rapply(getIds(testDocs), c)
ml.train<-wordMatrix[trainIds, ]
ml.trainLabels<-rapply(lapply(trainIds, function(x) reuters[[x]]$meta$topics_cat), c)
ml.test<-wordMatrix[testIds, ]
ml.testLabels<-rapply(lapply(testIds, function(x) reuters[[x]]$meta$topics_cat), c)
knnResult <- knn(train = ml.train, test = ml.test, cl = ml.trainLabels, k = 3)
a <- as.data.frame(table(ml.testLabels, knnResult))
order.pop <- order(a$Freq, decreasing = TRUE)
sorted_by_freq <- a[order.pop, ]
sorted_by_freq[sorted_by_freq$Freq != 0, ]
## ml.testLabels knnResult Freq
## 112 earn earn 16
## 1 acq acq 11
## 7 earn acq 3
## 6 crude acq 2
## 12 ship acq 2
## 106 acq earn 2
## 119 trade earn 2
## 5 coffee acq 1
## 8 heat acq 1
## 9 interest acq 1
## 10 money-fx acq 1
## 11 money-supply acq 1
## 13 sugar acq 1
## 14 trade acq 1
## 15 veg-oil acq 1
## 107 carcass earn 1
## 109 coconut earn 1
## 213 cocoa money-fx 1
We can see a similar bias with the topic labels. The same result was observed for different values of k.
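The check over different values of k can be reproduced with a short loop (a sketch; it prints one accuracy per k):
# re-run the classifier for several k and compare accuracies
for (k in c(1, 3, 5, 7)) {
  res <- knn(train = ml.train, test = ml.test, cl = ml.trainLabels, k = k)
  cat("k =", k, "accuracy =", round(mean(as.character(res) == ml.testLabels), 2), "\n")
}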
We used the RTextTools package to evaluate classification with several learning algorithms. A short introduction to RTextTools can be found here: https://journal.r-project.org/archive/2013-1/collingwood-jurka-boydstun-etal.pdf
All methods were run on a sample of 5000 documents, with a 3000:2000 split between training and test data.
data(Reuters21578)
reuters <- Reuters21578
reuters <- tm_map(reuters, removeWords, stopwords("english"))
reuters <- tm_map(reuters, removeNumbers)
reuters <- tm_map(reuters, removePunctuation)
reuters <- tm_map(reuters, stripWhitespace)
# Filter: keep documents with exactly one place label and a TRAIN/TEST split
sample_size <- 5000
reuters_sample <- sample(tm_filter(reuters, function(x) length(x$meta$places) != 0 &&
                         length(strsplit(x$meta$places, " ")) == 1 &&
                         (x$meta$lewissplit == "TRAIN" || x$meta$lewissplit == "TEST")), sample_size)
# Convert corpus to data frame
library(RTextTools)
reuters_sample <- tm_filter(reuters_sample, function(x) length(x$content) != 0)
dataframe<-data.frame(text=unlist(sapply(reuters_sample, function(x) x$content)),
id=unlist(sapply(reuters_sample, function(x) x$meta$id)),
places=unlist(sapply(reuters_sample, function(x) paste(x$meta$places, collapse = " "))),
stringsAsFactors=F)
# Recode places
places <- unique(dataframe$places)
codes <- c(1:length(places))
library(plyr)
dataframe$codes <- mapvalues(dataframe$places, from = places, to = codes)
doc_matrix <- create_matrix(dataframe$text, language="english", removeNumbers=TRUE, stemWords=TRUE, removeSparseTerms=.998)
container <- create_container(doc_matrix, dataframe$codes, trainSize=1:3000,testSize=3001:5000, virgin=FALSE)
SVM <- train_model(container,"SVM")
MAXENT <- train_model(container,"MAXENT")
SLDA <- train_model(container,"SLDA")
BOOSTING <- train_model(container,"BOOSTING")
BAGGING <- train_model(container,"BAGGING")
RF <- train_model(container,"RF")
NNET <- train_model(container,"NNET")
TREE <- train_model(container,"TREE")
SVM_CLASSIFY <- classify_model(container, SVM)
MAXENT_CLASSIFY <- classify_model(container, MAXENT)
SLDA_CLASSIFY <- classify_model(container, SLDA)
BOOSTING_CLASSIFY <- classify_model(container, BOOSTING)
BAGGING_CLASSIFY <- classify_model(container, BAGGING)
RF_CLASSIFY <- classify_model(container, RF)
NNET_CLASSIFY <- classify_model(container, NNET)
TREE_CLASSIFY <- classify_model(container, TREE)
analytics <- create_analytics(container,
cbind(SVM_CLASSIFY, SLDA_CLASSIFY,
BOOSTING_CLASSIFY, BAGGING_CLASSIFY,
RF_CLASSIFY,
NNET_CLASSIFY,TREE_CLASSIFY,
MAXENT_CLASSIFY))
summary(analytics)
Because the script’s runtime is on the order of hours, a screenshot of the results is provided instead of live output.
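As an alternative to a screenshot, the summary tables stored in the analytics object can be exported, e.g. to CSV (a sketch; the slot names follow the RTextTools documentation):
# per-algorithm precision/recall/F-scores and per-document results
write.csv(analytics@algorithm_summary, "algorithm_summary.csv")
write.csv(analytics@document_summary, "document_summary.csv")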