# rm(list=ls())
library(caret)
library(glmnet)
library(tree)
library(rpart)
library(rpart.plot)
library(ranger)
library(factoextra)   # Clustering visualizations (used later)
library(fastDummies)  # Create dummy variables
library(cluster)      # Clustering algorithms (used later)
library(pROC)
library(foreach)
library(doParallel)
library(gbm)

# Load data
# zopa <- read.csv(file='.....//zsnew.csv')
zopa <- read.csv(file='C://Users//ckt//OneDrive - MUNI//Institutions//MU//ML Finance//2024//Week 9//zsnew.csv')

# Number of observations
NN <- dim(zopa)[1]

# Remove under-represented dummy variables
nms.rm <- c('WA5','LU7','EH54','PC.HS','RH10','CV6','PR7','BS16','FK2','PC.LD','PC.EC','SE9','SE18','NG9','PC.ZE','PC.WC','PC.RUNC','PC.NH','PC.BG','PC.BF')
zopa[,nms.rm] <- NULL
names(zopa)

# The data is imbalanced
table(zopa$default)

# Address the imbalance: random under-sampling of the majority class
idx <- which(zopa$default==0)
set.seed(100)
good.loans <- zopa[idx[sample(1:length(idx),size=sum(zopa$default==1),replace=F)],]
bad.loans  <- zopa[zopa$default==1,]
balanced.zopa <- rbind(good.loans,bad.loans)

# Random shuffle
NT <- dim(balanced.zopa)[1]
set.seed(110)
balanced.zopa <- balanced.zopa[sample(1:NT,size=NT,replace=F),]

# Now we can separate the data into training & testing
set.seed(115)
idx <- sample(1:NT,size=floor(0.8*NT),replace=F)
train.zopa <- balanced.zopa[idx, ]
test.zopa  <- balanced.zopa[-idx,]

# gen.fm() builds the model formula 'default ~ x1 + x2 + ...' from variable
# names. It comes from earlier sessions; a minimal sketch (an assumption about
# its implementation) is included here so the script is self-contained:
gen.fm <- function(dep,x) as.formula(paste(dep,'~',paste(x,collapse='+')))

# Specification with all possible features
specs = gen.fm(dep='default',x=names(train.zopa)[c(7,11:167,201,207)])

# The true outcome on the testing data
true <- as.factor(test.zopa$default)

# Store results (rows 18-22, PAM and the PC-based models, are filled later in
# the session)
results = matrix(NA,nrow=22,ncol=5)
colnames(results) = c('accuracy','balanced accuracy','sensitivity','specificity','auc')
rownames(results) = c('PLM','LR','LASSO-1SE','RF','XGB',
                      'PLM-KM1','LR-KM1','LASSO-KM1','RF-KM1','XGB-KM1',
                      'KM1',
                      'PLM-KM2','LR-KM2','LASSO-KM2','RF-KM2','XGB-KM2',
                      'KM2',
                      'PAM','PLM-PC','LR-PC','LASSO-PC','RF-PC')

##################################
# Benchmark models -> from previous sessions
##################################

# Linear probability model (PLM)
m0 = lm(specs, data=train.zopa)
p0 = predict(m0,new=test.zopa)
# Clamp fitted values into the unit interval so they can be read as probabilities
p0[p0<0] = min(p0[p0>0])
p0[p0>1] = max(p0[p0<1])
tmp <- confusionMatrix(as.factor((p0>0.5)*1),reference=true,positive='1')
results[1,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc0 <- roc(response=true,predictor=p0)
results[1,5] <- as.numeric(auc(roc0))

# Logistic regression
m1 = glm(specs, data=train.zopa, family='binomial')
p1 = (1/(1+exp(-predict(m1, new=test.zopa))))
tmp <- confusionMatrix(as.factor((p1>0.5)*1),reference=true,positive='1')
results[2,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc1 <- roc(response=true,predictor=p1)
results[2,5] <- as.numeric(auc(roc1))

# The LASSO
x    = as.matrix(train.zopa[,all.vars(specs)[-1]])
newx = as.matrix(test.zopa[,all.vars(specs)[-1]])
y    = as.matrix(train.zopa[,all.vars(specs)[1]])
lasso.lr = cv.glmnet(x=x,y=y,type.measure='deviance',nfolds=10,family='binomial',alpha=1)
p21se = as.numeric((1/(1+exp(-predict(lasso.lr,newx=newx,s='lambda.1se')))))
tmp <- confusionMatrix(as.factor((p21se>0.5)*1),reference=true,positive='1')
results[3,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc21se <- roc(response=true,predictor=p21se)
results[3,5] <- as.numeric(auc(roc21se))

# Random forest (max.depth=0 -> trees are grown without a depth limit)
rf.tree = ranger(specs,data=train.zopa,num.trees=1000,mtry=9,min.node.size=5,max.depth=0)
p3 = predict(rf.tree,data=test.zopa)$predictions
tmp <- confusionMatrix(as.factor((p3>0.5)*1),reference=true,positive='1')
results[4,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
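#----------------------------------------------------------------------------------------------------
# Aside: the confusionMatrix/roc/auc block above repeats for every model in this
# session. A small helper could collapse it into one call (a sketch --
# 'eval.metrics' is a hypothetical name, not used below, where the explicit
# pattern is kept for clarity):
eval.metrics <- function(p, true, cutoff=0.5) {
  cm <- confusionMatrix(as.factor((p>cutoff)*1), reference=true, positive='1')
  r  <- roc(response=true, predictor=p)
  c(cm$overall[1], cm$byClass[c(11,1,2)], as.numeric(auc(r)))
}
# Usage, e.g.: results[4,] <- eval.metrics(p3, true)
#----------------------------------------------------------------------------------------------------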
roc3 <- roc(response=true,predictor=p3)
results[4,5] <- as.numeric(auc(roc3))
results

# Boosted classification tree (XGB)
boostedtree <- gbm(specs,data=train.zopa,n.trees=2000,shrinkage=0.001,distribution='bernoulli')
p8 = predict(boostedtree,newdata=test.zopa,type='response')
# Evaluation
tmp <- confusionMatrix(as.factor((p8>0.5)*1),reference=true,positive='1')
results[5,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc8 <- roc(response=true,predictor=p8)
results[5,5] <- as.numeric(auc(roc8))

##################################
# K-Means
##################################

# Standardize the clustering features on the training data
zscore.train <- scale(train.zopa[,c('amount','term','rate','time.start','time.start2')])
# Standardize the testing data with the TRAINING means and standard deviations,
# so that test observations live on the same scale as the training centroids
zscore.test <- scale(test.zopa[,c('amount','term','rate','time.start','time.start2')],
                     center=attr(zscore.train,'scaled:center'),
                     scale=attr(zscore.train,'scaled:scale'))

# Euclidean distance
dist.eucl <- dist(zscore.train,method='euclidean')
dist.eucl.mat <- as.matrix(dist.eucl)
# We can check
dist.eucl.mat[1:5,1:5]
# Symmetric, non-negative elements and 0 on the diagonal -> as expected.

# Visualize the distances - it will take some time
# fviz_dist(dist.eucl)
# dev.off()

# Find K = 50 clusters. Note that k-means works on the standardized data
# directly, not on the distance matrix (the large nstart makes this slow)
K = 50
set.seed(150)
res.hw <- kmeans(zscore.train,centers=K,iter.max=50,nstart=2000,algorithm='Hartigan-Wong')

# Check the number of observations in each cluster
table(res.hw$cluster)

# Are the clusters (by any chance) related to defaults?
clus_char <- table(res.hw$cluster,train.zopa$default)
clus_char
clus_char <- cbind(clus_char,clus_char[,2]/apply(clus_char,1,sum))
colnames(clus_char) <- c('NumberGood','NumberBad','ProbD')
clus_char
# It looks promising
cor(clus_char[res.hw$cluster,3],train.zopa$default,method='kendall')
# Can we use that information in forecasting?

#----------------------------------------------------------------------------------------------------
# Feature engineering
# Perhaps we can create variables that tell us to which cluster a given
# observation belongs. Let's create one dummy per cluster, i.e. K new features:
# 1 if the observation belongs to the given cluster and 0 otherwise
clu.train = dummy_cols(res.hw$cluster)
head(clu.train)
clu.train = clu.train[,-1]
colnames(clu.train) = paste('hw',c(1:K),sep='')
head(clu.train)
# The 'clu.train' columns are our new 0/1 features that we can use to train a model
train1 = data.frame(train.zopa,clu.train)
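#----------------------------------------------------------------------------------------------------
# Aside: K = 50 above was set by hand. A data-driven choice could use the elbow
# or average-silhouette curves from factoextra (a sketch; run on a subsample,
# as these loops over K are slow on the full training set):
# set.seed(120)
# sub <- zscore.train[sample(1:nrow(zscore.train),2000),]
# fviz_nbclust(sub, kmeans, method='wss', k.max=60)         # elbow
# fviz_nbclust(sub, kmeans, method='silhouette', k.max=60)  # average silhouette
#----------------------------------------------------------------------------------------------------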
#----------------------------------------------------------------------------------------------------
# Now we need to assign each observation from the TESTING data to one of the
# clusters. How do we know to which cluster a given observation belongs?
# We assign it to the cluster with the nearest center.
centroid <- res.hw$centers
head(centroid)
clu.test = matrix(0,nrow=dim(zscore.test)[1],ncol=K) # Number of clusters K
colnames(clu.test) <- colnames(clu.train)
for (i in 1:nrow(zscore.test)) {
  # Find the distance between the given testing observation and each centroid:
  # stack the centroids (they are just points in the feature space) on top of
  # the observation
  cld = rbind(centroid,zscore.test[i,])
  # Distances between all those points
  tmp = as.matrix(dist(cld,method='euclidean'))
  # 'tmp' is the distance matrix; its LAST row holds the distances from the
  # i-th observation to each centroid
  # tmp[dim(tmp)[1],1:K]
  # Find the centroid the observation is closest to
  opt.clus = which(tmp[dim(tmp)[1],1:K]==min(tmp[dim(tmp)[1],1:K]))
  # Flag that cluster with a 1
  clu.test[i,opt.clus] = 1
}
# (A vectorized equivalent of this loop is sketched after the RF block below.)
# Check whether we have observations in every cluster
apply(clu.test,2,mean)
# Note: some clusters might receive no testing observations
# 'test1' pairs with 'train1': both use the hard (0/1) cluster assignment; a
# second, soft assignment follows later
test1 = data.frame(test.zopa,clu.test)
#----------------------------------------------------------------------------------------------------
# Training & evaluation of models with clusters as features
#----------------------------------------------------------------------------------------------------
# What variables to use? The ones from before + the K cluster dummies
nms <- c(all.vars(specs)[-1],paste('hw',c(1:K),sep=''))
specs.new = gen.fm(dep='default',x=nms)
#----------------------------------------------------------------------------------------------------
# Linear probability model
m0 = lm(specs.new, data=train1)
p0 = predict(m0,new=test1)
p0[p0<0] = min(p0[p0>0])
p0[p0>1] = max(p0[p0<1])
tmp <- confusionMatrix(as.factor((p0>0.5)*1),reference=true,positive='1')
results[6,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc0 <- roc(response=true,predictor=p0)
results[6,5] <- as.numeric(auc(roc0))
#----------------------------------------------------------------------------------------------------
# Logistic regression
m1 = glm(specs.new, data=train1, family='binomial')
p1 = (1/(1+exp(-predict(m1, new=test1))))
tmp <- confusionMatrix(as.factor((p1>0.5)*1),reference=true,positive='1')
results[7,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc1 <- roc(response=true,predictor=p1)
results[7,5] <- as.numeric(auc(roc1))
#----------------------------------------------------------------------------------------------------
# The LASSO
x    = as.matrix(train1[,all.vars(specs.new)[-1]])
newx = as.matrix(test1[,all.vars(specs.new)[-1]])
y    = as.matrix(train1[,all.vars(specs.new)[1]])
lasso.lr = cv.glmnet(x=x,y=y,type.measure='deviance',nfolds=10,family='binomial',alpha=1)
p21se = as.numeric((1/(1+exp(-predict(lasso.lr,newx=newx,s='lambda.1se')))))
tmp <- confusionMatrix(as.factor((p21se>0.5)*1),reference=true,positive='1')
results[8,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc21se <- roc(response=true,predictor=p21se)
results[8,5] <- as.numeric(auc(roc21se))
#----------------------------------------------------------------------------------------------------
# Random forest
rf.tree = ranger(specs.new,data=train1,num.trees=1000,mtry=12,min.node.size=5,max.depth=0)
p3 = predict(rf.tree,data=test1)$predictions
tmp <- confusionMatrix(as.factor((p3>0.5)*1),reference=true,positive='1')
results[9,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc3 <- roc(response=true,predictor=p3)
results[9,5] <- as.numeric(auc(roc3))
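#----------------------------------------------------------------------------------------------------
# Aside (as promised above): the hard test-cluster assignment can be computed
# without a loop. A sketch; 'd2', 'idx.near' and 'clu.test.alt' are
# hypothetical names:
# squared Euclidean distances from all test rows to all K centroids at once
d2 <- outer(rowSums(zscore.test^2),rowSums(centroid^2),'+') - 2*zscore.test%*%t(centroid)
idx.near <- max.col(-d2,ties.method='first')   # argmin of each row = nearest centroid
clu.test.alt <- matrix(0,nrow=nrow(d2),ncol=K)
clu.test.alt[cbind(1:nrow(d2),idx.near)] <- 1  # same 0/1 matrix as clu.test (up to ties)
#----------------------------------------------------------------------------------------------------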
#----------------------------------------------------------------------------------------------------
# Boosted classification tree (XGB)
boostedtree <- gbm(specs.new,data=train1,n.trees=2000,shrinkage=0.001,distribution='bernoulli')
p8 = predict(boostedtree,newdata=test1,type='response')
# Evaluation
tmp <- confusionMatrix(as.factor((p8>0.5)*1),reference=true,positive='1')
results[10,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc8 <- roc(response=true,predictor=p8)
results[10,5] <- as.numeric(auc(roc8))
#----------------------------------------------------------------------------------------------------
results
#----------------------------------------------------------------------------------------------------
# Can we use the clusters to make predictions directly?
#----------------------------------------------------------------------------------------------------
# Indeed - recall
head(clus_char)
# From 'clu.test' we know which testing observation belongs to which cluster,
# and 'ProbD' gives each cluster's default rate in the training data.
# We can set up a rule: if ProbD > 0.5 => predict a defaulted loan
pKM <- c()
for (i in 1:dim(test1)[1]) pKM[i] = clus_char[which(clu.test[i,] == 1),3]
tmp <- confusionMatrix(as.factor((pKM>0.5)*1),reference=true,positive='1')
results[11,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
rocKM <- roc(response=true,predictor=pKM)
results[11,5] <- as.numeric(auc(rocKM))
results
# Not bad for an unsupervised method - not bad at all.
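#----------------------------------------------------------------------------------------------------
# Aside: the look-up loop above has a one-line equivalent (a sketch; 'pKM.alt'
# is a hypothetical name, and max.col picks the column holding the single 1 in
# each row of the 0/1 membership matrix):
# pKM.alt <- clus_char[max.col(clu.test,ties.method='first'),3]
#----------------------------------------------------------------------------------------------------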
#----------------------------------------------------------------------------------------------------
# Feature engineering - again
#----------------------------------------------------------------------------------------------------
# So far each observation was assigned to exactly one cluster. What if we are
# not sure? Instead of a hard 0/1 assignment, we can measure to what degree an
# observation belongs to EACH cluster, using inverse distances to the
# centroids. We now need these soft memberships for the TRAINING data as well.
clu.train = matrix(0,nrow=dim(zscore.train)[1],ncol=K) # Number of clusters K
colnames(clu.train) <- paste('hw',c(1:K),sep='')
for (i in 1:nrow(zscore.train)) {
  # Distance between the given training observation and each centroid
  cld = rbind(centroid,zscore.train[i,])
  tmp = as.matrix(dist(cld,method='euclidean'))
  # 'tmp' is the distance matrix; its LAST row holds the distances from the
  # i-th observation to each centroid.
  # Degree of membership: inverse distances, normalized so they sum to one
  clu.train[i,] = (1/tmp[nrow(tmp),1:K])/sum((1/tmp[nrow(tmp),1:K]))
}
# 'train2' pairs the training data with the soft cluster memberships
train2 = data.frame(train.zopa,clu.train)

# We also need the soft memberships for the testing dataset
#----------------------------------------------------------------------------------------------------
clu.test = matrix(0,nrow=dim(zscore.test)[1],ncol=K) # Number of clusters K
colnames(clu.test) <- colnames(clu.train)
for (i in 1:nrow(zscore.test)) {
  # Distance between the given testing observation and each centroid
  cld = rbind(centroid,zscore.test[i,])
  tmp = as.matrix(dist(cld,method='euclidean'))
  # Normalized inverse distances again
  clu.test[i,] = (1/tmp[nrow(tmp),1:K])/sum(1/tmp[nrow(tmp),1:K])
}
# 'test2' pairs with 'train2': both use the soft cluster memberships
test2 = data.frame(test.zopa,clu.test)
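#----------------------------------------------------------------------------------------------------
# Aside: these normalized inverse-distance weights are an ad-hoc soft
# assignment. Fuzzy clustering estimates such membership degrees directly; a
# sketch with cluster::fanny, on a subsample and a small k, since fanny builds
# an n-by-n dissimilarity matrix and does not scale to the full training set:
# set.seed(125)
# res.fanny <- fanny(zscore.train[sample(1:nrow(zscore.train),2000),],k=5,memb.exp=1.5)
# head(res.fanny$membership)  # rows sum to 1, like the rows of clu.train above
#----------------------------------------------------------------------------------------------------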
#----------------------------------------------------------------------------------------------------
# Train & evaluate models with these new features
# What variables to use? The ones from before + the K cluster-membership shares
nms <- c(all.vars(specs)[-1],paste('hw',c(1:K),sep=''))
specs.new = gen.fm(dep='default',x=nms)
#----------------------------------------------------------------------------------------------------
# Linear probability model
m0 = lm(specs.new, data=train2)
p0 = predict(m0,new=test2)
p0[p0<0] = min(p0[p0>0])
p0[p0>1] = max(p0[p0<1])
tmp <- confusionMatrix(as.factor((p0>0.5)*1),reference=true,positive='1')
results[12,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc0 <- roc(response=true,predictor=p0)
results[12,5] <- as.numeric(auc(roc0))
#----------------------------------------------------------------------------------------------------
# Logistic regression
m1 = glm(specs.new, data=train2, family='binomial')
p1 = (1/(1+exp(-predict(m1, new=test2))))
tmp <- confusionMatrix(as.factor((p1>0.5)*1),reference=true,positive='1')
results[13,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc1 <- roc(response=true,predictor=p1)
results[13,5] <- as.numeric(auc(roc1))
#----------------------------------------------------------------------------------------------------
# The LASSO
x    = as.matrix(train2[,all.vars(specs.new)[-1]])
newx = as.matrix(test2[,all.vars(specs.new)[-1]])
y    = as.matrix(train2[,all.vars(specs.new)[1]])
lasso.lr = cv.glmnet(x=x,y=y,type.measure='deviance',nfolds=10,family='binomial',alpha=1)
p21se = as.numeric((1/(1+exp(-predict(lasso.lr,newx=newx,s='lambda.1se')))))
tmp <- confusionMatrix(as.factor((p21se>0.5)*1),reference=true,positive='1')
results[14,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc21se <- roc(response=true,predictor=p21se)
results[14,5] <- as.numeric(auc(roc21se))
#----------------------------------------------------------------------------------------------------
# Random forest
rf.tree = ranger(specs.new,data=train2,num.trees=1000,mtry=12,min.node.size=5,max.depth=0)
p3 = predict(rf.tree,data=test2)$predictions
tmp <- confusionMatrix(as.factor((p3>0.5)*1),reference=true,positive='1')
results[15,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc3 <- roc(response=true,predictor=p3)
results[15,5] <- as.numeric(auc(roc3))
#----------------------------------------------------------------------------------------------------
# Boosted classification tree (XGB)
boostedtree <- gbm(specs.new,data=train2,n.trees=2000,shrinkage=0.001,distribution='bernoulli')
p8 = predict(boostedtree,newdata=test2,type='response')
# Evaluation
tmp <- confusionMatrix(as.factor((p8>0.5)*1),reference=true,positive='1')
results[16,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc8 <- roc(response=true,predictor=p8)
results[16,5] <- as.numeric(auc(roc8))
#----------------------------------------------------------------------------------------------------
results
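#----------------------------------------------------------------------------------------------------
# Aside: a visual comparison of the KM2 models; a sketch overlaying the pROC
# curves computed above:
# plot(roc0,col='black')       # PLM-KM2
# lines(roc1,col='red')        # LR-KM2
# lines(roc21se,col='blue')    # LASSO-KM2
# lines(roc3,col='darkgreen')  # RF-KM2
# lines(roc8,col='orange')     # XGB-KM2
# legend('bottomright',legend=c('PLM','LR','LASSO','RF','XGB'),
#        col=c('black','red','blue','darkgreen','orange'),lty=1)
#----------------------------------------------------------------------------------------------------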
#----------------------------------------------------------------------------------------------------
# Can we use the clusters to make predictions directly - with a weighted prediction?
#----------------------------------------------------------------------------------------------------
# Indeed - recall
head(clus_char)
# Each row of 'clu.test' now gives the degree to which that testing observation
# belongs to each cluster, so we average the cluster default rates with those
# weights
pKM <- c()
for (i in 1:dim(test2)[1]) pKM[i] = weighted.mean(clus_char[,3],w=clu.test[i,])
tmp <- confusionMatrix(as.factor((pKM>0.5)*1),reference=true,positive='1')
results[17,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
rocKM <- roc(response=true,predictor=pKM)
results[17,5] <- as.numeric(auc(rocKM))
results
#----------------------------------------------------------------------------------------------------
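#----------------------------------------------------------------------------------------------------
# Sketch: rank the methods evaluated so far by balanced accuracy ('done' is a
# hypothetical name; rows 18-22 are filled later in the session, hence the NA
# filter):
done <- !is.na(results[,'balanced accuracy'])
round(results[done,][order(-results[done,'balanced accuracy']),],3)
#----------------------------------------------------------------------------------------------------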