# rm(list=ls())
library(caret)
library(glmnet)
library(tree)
library(rpart)
library(rpart.plot)
library(ranger)
library(factoextra)   # Clustering visualizations (used later)
library(fastDummies)  # Create dummy variables
library(cluster)      # Clustering algorithms (used later)
library(pROC)
library(foreach)
library(doParallel)
library(gbm)

# Load data
# zopa <- read.csv(file='.....//zsnew.csv')
zopa <- read.csv(file='C://Users//ckt//OneDrive - MUNI//Institutions//MU//ML Finance//2024//Week 9//zsnew.csv')

# Number of observations
NN <- dim(zopa)[1]

# Remove under-represented dummy variables
nms.rm <- c('WA5','LU7','EH54','PC.HS','RH10','CV6','PR7','BS16','FK2','PC.LD','PC.EC','SE9','SE18','NG9','PC.ZE','PC.WC','PC.RUNC','PC.NH','PC.BG','PC.BF')
zopa[,nms.rm] <- NULL
names(zopa)

# The data is imbalanced
table(zopa$default)

# Address the imbalance: random under-sampling of the majority class
idx <- which(zopa$default==0)
set.seed(100)
good.loans <- zopa[idx[sample(1:length(idx),size=sum(zopa$default==1),replace=F)],]
bad.loans  <- zopa[zopa$default==1,]
balanced.zopa <- rbind(good.loans,bad.loans)

# Random shuffle
NT <- dim(balanced.zopa)[1]
set.seed(110)
balanced.zopa <- balanced.zopa[sample(1:NT,size=NT,replace=F),]

# Now we can separate the data into training & testing
set.seed(115)
idx <- sample(1:NT,size=floor(0.8*NT),replace=F)
train.zopa <- balanced.zopa[idx, ]
test.zopa  <- balanced.zopa[-idx,]

# gen.fm() builds the model formula 'default ~ x1 + x2 + ...' from variable
# names. It comes from earlier sessions; a minimal sketch (an assumption about
# its implementation) is included here so the script is self-contained:
gen.fm <- function(dep,x) as.formula(paste(dep,'~',paste(x,collapse='+')))

# Specification with all possible features
specs = gen.fm(dep='default',x=names(train.zopa)[c(7,11:167,201,207)])

# The true outcome on the testing data
true <- as.factor(test.zopa$default)

# Store results (rows 18-22, PAM and the PC-based models, are filled later in
# the session)
results = matrix(NA,nrow=22,ncol=5)
colnames(results) = c('accuracy','balanced accuracy','sensitivity','specificity','auc')
rownames(results) = c('PLM','LR','LASSO-1SE','RF','XGB',
                      'PLM-KM1','LR-KM1','LASSO-KM1','RF-KM1','XGB-KM1',
                      'KM1',
                      'PLM-KM2','LR-KM2','LASSO-KM2','RF-KM2','XGB-KM2',
                      'KM2',
                      'PAM','PLM-PC','LR-PC','LASSO-PC','RF-PC')

##################################
# Benchmark models -> from previous sessions
##################################

# Linear probability model (PLM)
m0 = lm(specs, data=train.zopa)
p0 = predict(m0,new=test.zopa)
# Clamp fitted values into the unit interval so they can be read as probabilities
p0[p0<0] = min(p0[p0>0])
p0[p0>1] = max(p0[p0<1])
tmp <- confusionMatrix(as.factor((p0>0.5)*1),reference=true,positive='1')
results[1,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc0 <- roc(response=true,predictor=p0)
results[1,5] <- as.numeric(auc(roc0))

# Logistic regression
m1 = glm(specs, data=train.zopa, family='binomial')
p1 = (1/(1+exp(-predict(m1, new=test.zopa))))
tmp <- confusionMatrix(as.factor((p1>0.5)*1),reference=true,positive='1')
results[2,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc1 <- roc(response=true,predictor=p1)
results[2,5] <- as.numeric(auc(roc1))

# The LASSO
x    = as.matrix(train.zopa[,all.vars(specs)[-1]])
newx = as.matrix(test.zopa[,all.vars(specs)[-1]])
y    = as.matrix(train.zopa[,all.vars(specs)[1]])
lasso.lr = cv.glmnet(x=x,y=y,type.measure='deviance',nfolds=10,family='binomial',alpha=1)
p21se = as.numeric((1/(1+exp(-predict(lasso.lr,newx=newx,s='lambda.1se')))))
tmp <- confusionMatrix(as.factor((p21se>0.5)*1),reference=true,positive='1')
results[3,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc21se <- roc(response=true,predictor=p21se)
results[3,5] <- as.numeric(auc(roc21se))

# Random forest (max.depth=0 -> trees are grown without a depth limit)
rf.tree = ranger(specs,data=train.zopa,num.trees=1000,mtry=9,min.node.size=5,max.depth=0)
p3 = predict(rf.tree,data=test.zopa)$predictions
tmp <- confusionMatrix(as.factor((p3>0.5)*1),reference=true,positive='1')
results[4,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
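#----------------------------------------------------------------------------------------------------
# Aside: the confusionMatrix/roc/auc block above repeats for every model in this
# session. A small helper could collapse it into one call (a sketch --
# 'eval.metrics' is a hypothetical name, not used below, where the explicit
# pattern is kept for clarity):
eval.metrics <- function(p, true, cutoff=0.5) {
  cm <- confusionMatrix(as.factor((p>cutoff)*1), reference=true, positive='1')
  r  <- roc(response=true, predictor=p)
  c(cm$overall[1], cm$byClass[c(11,1,2)], as.numeric(auc(r)))
}
# Usage, e.g.: results[4,] <- eval.metrics(p3, true)
#----------------------------------------------------------------------------------------------------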
roc3 <- roc(response=true,predictor=p3)
results[4,5] <- as.numeric(auc(roc3))
results

# Boosted classification tree (XGB)
boostedtree <- gbm(specs,data=train.zopa,n.trees=2000,shrinkage=0.001,distribution='bernoulli')
p8 = predict(boostedtree,newdata=test.zopa,type='response')
# Evaluation
tmp <- confusionMatrix(as.factor((p8>0.5)*1),reference=true,positive='1')
results[5,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc8 <- roc(response=true,predictor=p8)
results[5,5] <- as.numeric(auc(roc8))

##################################
# K-Means
##################################

# Standardize the clustering features on the training data
zscore.train <- scale(train.zopa[,c('amount','term','rate','time.start','time.start2')])
# Standardize the testing data with the TRAINING means and standard deviations,
# so that test observations live on the same scale as the training centroids
zscore.test <- scale(test.zopa[,c('amount','term','rate','time.start','time.start2')],
                     center=attr(zscore.train,'scaled:center'),
                     scale=attr(zscore.train,'scaled:scale'))

# Euclidean distance
dist.eucl <- dist(zscore.train,method='euclidean')
dist.eucl.mat <- as.matrix(dist.eucl)
# We can check
dist.eucl.mat[1:5,1:5]
# Symmetric, non-negative elements and 0 on the diagonal -> as expected.

# Visualize the distances - it will take some time
# fviz_dist(dist.eucl)
# dev.off()

# Find K = 50 clusters. Note that k-means works on the standardized data
# directly, not on the distance matrix (the large nstart makes this slow)
K = 50
set.seed(150)
res.hw <- kmeans(zscore.train,centers=K,iter.max=50,nstart=2000,algorithm='Hartigan-Wong')

# Check the number of observations in each cluster
table(res.hw$cluster)

# Are the clusters (by any chance) related to defaults?
clus_char <- table(res.hw$cluster,train.zopa$default)
clus_char
clus_char <- cbind(clus_char,clus_char[,2]/apply(clus_char,1,sum))
colnames(clus_char) <- c('NumberGood','NumberBad','ProbD')
clus_char
# It looks promising
cor(clus_char[res.hw$cluster,3],train.zopa$default,method='kendall')
# Can we use that information in forecasting?

#----------------------------------------------------------------------------------------------------
# Feature engineering
# Perhaps we can create variables that tell us to which cluster a given
# observation belongs. Let's create one dummy per cluster, i.e. K new features:
# 1 if the observation belongs to the given cluster and 0 otherwise
clu.train = dummy_cols(res.hw$cluster)
head(clu.train)
clu.train = clu.train[,-1]
colnames(clu.train) = paste('hw',c(1:K),sep='')
head(clu.train)
# The 'clu.train' columns are our new 0/1 features that we can use to train a model
train1 = data.frame(train.zopa,clu.train)
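#----------------------------------------------------------------------------------------------------
# Aside: K = 50 above was set by hand. A data-driven choice could use the elbow
# or average-silhouette curves from factoextra (a sketch; run on a subsample,
# as these loops over K are slow on the full training set):
# set.seed(120)
# sub <- zscore.train[sample(1:nrow(zscore.train),2000),]
# fviz_nbclust(sub, kmeans, method='wss', k.max=60)         # elbow
# fviz_nbclust(sub, kmeans, method='silhouette', k.max=60)  # average silhouette
#----------------------------------------------------------------------------------------------------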
#----------------------------------------------------------------------------------------------------
# Now we need to assign each observation from the TESTING data to one of the
# clusters. How do we know to which cluster a given observation belongs?
# We assign it to the cluster with the nearest center.
centroid <- res.hw$centers
head(centroid)
clu.test = matrix(0,nrow=dim(zscore.test)[1],ncol=K) # Number of clusters K
colnames(clu.test) <- colnames(clu.train)
for (i in 1:nrow(zscore.test)) {
  # Find the distance between the given testing observation and each centroid:
  # stack the centroids (they are just points in the feature space) on top of
  # the observation
  cld = rbind(centroid,zscore.test[i,])
  # Distances between all those points
  tmp = as.matrix(dist(cld,method='euclidean'))
  # 'tmp' is the distance matrix; its LAST row holds the distances from the
  # i-th observation to each centroid
  # tmp[dim(tmp)[1],1:K]
  # Find the centroid the observation is closest to
  opt.clus = which(tmp[dim(tmp)[1],1:K]==min(tmp[dim(tmp)[1],1:K]))
  # Flag that cluster with a 1
  clu.test[i,opt.clus] = 1
}
# (A vectorized equivalent of this loop is sketched after the RF block below.)
# Check whether we have observations in every cluster
apply(clu.test,2,mean)
# Note: some clusters might receive no testing observations
# 'test1' pairs with 'train1': both use the hard (0/1) cluster assignment; a
# second, soft assignment follows later
test1 = data.frame(test.zopa,clu.test)
#----------------------------------------------------------------------------------------------------
# Training & evaluation of models with clusters as features
#----------------------------------------------------------------------------------------------------
# What variables to use? The ones from before + the K cluster dummies
nms <- c(all.vars(specs)[-1],paste('hw',c(1:K),sep=''))
specs.new = gen.fm(dep='default',x=nms)
#----------------------------------------------------------------------------------------------------
# Linear probability model
m0 = lm(specs.new, data=train1)
p0 = predict(m0,new=test1)
p0[p0<0] = min(p0[p0>0])
p0[p0>1] = max(p0[p0<1])
tmp <- confusionMatrix(as.factor((p0>0.5)*1),reference=true,positive='1')
results[6,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc0 <- roc(response=true,predictor=p0)
results[6,5] <- as.numeric(auc(roc0))
#----------------------------------------------------------------------------------------------------
# Logistic regression
m1 = glm(specs.new, data=train1, family='binomial')
p1 = (1/(1+exp(-predict(m1, new=test1))))
tmp <- confusionMatrix(as.factor((p1>0.5)*1),reference=true,positive='1')
results[7,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc1 <- roc(response=true,predictor=p1)
results[7,5] <- as.numeric(auc(roc1))
#----------------------------------------------------------------------------------------------------
# The LASSO
x    = as.matrix(train1[,all.vars(specs.new)[-1]])
newx = as.matrix(test1[,all.vars(specs.new)[-1]])
y    = as.matrix(train1[,all.vars(specs.new)[1]])
lasso.lr = cv.glmnet(x=x,y=y,type.measure='deviance',nfolds=10,family='binomial',alpha=1)
p21se = as.numeric((1/(1+exp(-predict(lasso.lr,newx=newx,s='lambda.1se')))))
tmp <- confusionMatrix(as.factor((p21se>0.5)*1),reference=true,positive='1')
results[8,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc21se <- roc(response=true,predictor=p21se)
results[8,5] <- as.numeric(auc(roc21se))
#----------------------------------------------------------------------------------------------------
# Random forest
rf.tree = ranger(specs.new,data=train1,num.trees=1000,mtry=12,min.node.size=5,max.depth=0)
p3 = predict(rf.tree,data=test1)$predictions
tmp <- confusionMatrix(as.factor((p3>0.5)*1),reference=true,positive='1')
results[9,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc3 <- roc(response=true,predictor=p3)
results[9,5] <- as.numeric(auc(roc3))
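#----------------------------------------------------------------------------------------------------
# Aside (as promised above): the hard test-cluster assignment can be computed
# without a loop. A sketch; 'd2', 'idx.near' and 'clu.test.alt' are
# hypothetical names:
# squared Euclidean distances from all test rows to all K centroids at once
d2 <- outer(rowSums(zscore.test^2),rowSums(centroid^2),'+') - 2*zscore.test%*%t(centroid)
idx.near <- max.col(-d2,ties.method='first')   # argmin of each row = nearest centroid
clu.test.alt <- matrix(0,nrow=nrow(d2),ncol=K)
clu.test.alt[cbind(1:nrow(d2),idx.near)] <- 1  # same 0/1 matrix as clu.test (up to ties)
#----------------------------------------------------------------------------------------------------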
#----------------------------------------------------------------------------------------------------
# Boosted classification tree (XGB)
boostedtree <- gbm(specs.new,data=train1,n.trees=2000,shrinkage=0.001,distribution='bernoulli')
p8 = predict(boostedtree,newdata=test1,type='response')
# Evaluation
tmp <- confusionMatrix(as.factor((p8>0.5)*1),reference=true,positive='1')
results[10,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc8 <- roc(response=true,predictor=p8)
results[10,5] <- as.numeric(auc(roc8))
#----------------------------------------------------------------------------------------------------
results
#----------------------------------------------------------------------------------------------------
# Can we use the clusters to make predictions directly?
#----------------------------------------------------------------------------------------------------
# Indeed - recall
head(clus_char)
# From 'clu.test' we know which testing observation belongs to which cluster,
# and 'ProbD' gives each cluster's default rate in the training data.
# We can set up a rule: if ProbD > 0.5 => predict a defaulted loan
pKM <- c()
for (i in 1:dim(test1)[1]) pKM[i] = clus_char[which(clu.test[i,] == 1),3]
tmp <- confusionMatrix(as.factor((pKM>0.5)*1),reference=true,positive='1')
results[11,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
rocKM <- roc(response=true,predictor=pKM)
results[11,5] <- as.numeric(auc(rocKM))
results
# Not bad for an unsupervised method - not bad at all.
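#----------------------------------------------------------------------------------------------------
# Aside: the look-up loop above has a one-line equivalent (a sketch; 'pKM.alt'
# is a hypothetical name, and max.col picks the column holding the single 1 in
# each row of the 0/1 membership matrix):
# pKM.alt <- clus_char[max.col(clu.test,ties.method='first'),3]
#----------------------------------------------------------------------------------------------------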
#----------------------------------------------------------------------------------------------------
# Feature engineering - again
#----------------------------------------------------------------------------------------------------
# So far each observation was assigned to exactly one cluster. What if we are
# not sure? Instead of a hard 0/1 assignment, we can measure to what degree an
# observation belongs to EACH cluster, using inverse distances to the
# centroids. We now need these soft memberships for the TRAINING data as well.
clu.train = matrix(0,nrow=dim(zscore.train)[1],ncol=K) # Number of clusters K
colnames(clu.train) <- paste('hw',c(1:K),sep='')
for (i in 1:nrow(zscore.train)) {
  # Distance between the given training observation and each centroid
  cld = rbind(centroid,zscore.train[i,])
  tmp = as.matrix(dist(cld,method='euclidean'))
  # 'tmp' is the distance matrix; its LAST row holds the distances from the
  # i-th observation to each centroid.
  # Degree of membership: inverse distances, normalized so they sum to one
  clu.train[i,] = (1/tmp[nrow(tmp),1:K])/sum((1/tmp[nrow(tmp),1:K]))
}
# 'train2' pairs the training data with the soft cluster memberships
train2 = data.frame(train.zopa,clu.train)

# We also need the soft memberships for the testing dataset
#----------------------------------------------------------------------------------------------------
clu.test = matrix(0,nrow=dim(zscore.test)[1],ncol=K) # Number of clusters K
colnames(clu.test) <- colnames(clu.train)
for (i in 1:nrow(zscore.test)) {
  # Distance between the given testing observation and each centroid
  cld = rbind(centroid,zscore.test[i,])
  tmp = as.matrix(dist(cld,method='euclidean'))
  # Normalized inverse distances again
  clu.test[i,] = (1/tmp[nrow(tmp),1:K])/sum(1/tmp[nrow(tmp),1:K])
}
# 'test2' pairs with 'train2': both use the soft cluster memberships
test2 = data.frame(test.zopa,clu.test)
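#----------------------------------------------------------------------------------------------------
# Aside: these normalized inverse-distance weights are an ad-hoc soft
# assignment. Fuzzy clustering estimates such membership degrees directly; a
# sketch with cluster::fanny, on a subsample and a small k, since fanny builds
# an n-by-n dissimilarity matrix and does not scale to the full training set:
# set.seed(125)
# res.fanny <- fanny(zscore.train[sample(1:nrow(zscore.train),2000),],k=5,memb.exp=1.5)
# head(res.fanny$membership)  # rows sum to 1, like the rows of clu.train above
#----------------------------------------------------------------------------------------------------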
#----------------------------------------------------------------------------------------------------
# Train & evaluate models with these new features
# What variables to use? The ones from before + the K cluster-membership shares
nms <- c(all.vars(specs)[-1],paste('hw',c(1:K),sep=''))
specs.new = gen.fm(dep='default',x=nms)
#----------------------------------------------------------------------------------------------------
# Linear probability model
m0 = lm(specs.new, data=train2)
p0 = predict(m0,new=test2)
p0[p0<0] = min(p0[p0>0])
p0[p0>1] = max(p0[p0<1])
tmp <- confusionMatrix(as.factor((p0>0.5)*1),reference=true,positive='1')
results[12,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc0 <- roc(response=true,predictor=p0)
results[12,5] <- as.numeric(auc(roc0))
#----------------------------------------------------------------------------------------------------
# Logistic regression
m1 = glm(specs.new, data=train2, family='binomial')
p1 = (1/(1+exp(-predict(m1, new=test2))))
tmp <- confusionMatrix(as.factor((p1>0.5)*1),reference=true,positive='1')
results[13,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc1 <- roc(response=true,predictor=p1)
results[13,5] <- as.numeric(auc(roc1))
#----------------------------------------------------------------------------------------------------
# The LASSO
x    = as.matrix(train2[,all.vars(specs.new)[-1]])
newx = as.matrix(test2[,all.vars(specs.new)[-1]])
y    = as.matrix(train2[,all.vars(specs.new)[1]])
lasso.lr = cv.glmnet(x=x,y=y,type.measure='deviance',nfolds=10,family='binomial',alpha=1)
p21se = as.numeric((1/(1+exp(-predict(lasso.lr,newx=newx,s='lambda.1se')))))
tmp <- confusionMatrix(as.factor((p21se>0.5)*1),reference=true,positive='1')
results[14,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc21se <- roc(response=true,predictor=p21se)
results[14,5] <- as.numeric(auc(roc21se))
#----------------------------------------------------------------------------------------------------
# Random forest
rf.tree = ranger(specs.new,data=train2,num.trees=1000,mtry=12,min.node.size=5,max.depth=0)
p3 = predict(rf.tree,data=test2)$predictions
tmp <- confusionMatrix(as.factor((p3>0.5)*1),reference=true,positive='1')
results[15,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc3 <- roc(response=true,predictor=p3)
results[15,5] <- as.numeric(auc(roc3))
#----------------------------------------------------------------------------------------------------
# Boosted classification tree (XGB)
boostedtree <- gbm(specs.new,data=train2,n.trees=2000,shrinkage=0.001,distribution='bernoulli')
p8 = predict(boostedtree,newdata=test2,type='response')
# Evaluation
tmp <- confusionMatrix(as.factor((p8>0.5)*1),reference=true,positive='1')
results[16,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc8 <- roc(response=true,predictor=p8)
results[16,5] <- as.numeric(auc(roc8))
#----------------------------------------------------------------------------------------------------
results
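#----------------------------------------------------------------------------------------------------
# Aside: a visual comparison of the KM2 models; a sketch overlaying the pROC
# curves computed above:
# plot(roc0,col='black')       # PLM-KM2
# lines(roc1,col='red')        # LR-KM2
# lines(roc21se,col='blue')    # LASSO-KM2
# lines(roc3,col='darkgreen')  # RF-KM2
# lines(roc8,col='orange')     # XGB-KM2
# legend('bottomright',legend=c('PLM','LR','LASSO','RF','XGB'),
#        col=c('black','red','blue','darkgreen','orange'),lty=1)
#----------------------------------------------------------------------------------------------------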
#----------------------------------------------------------------------------------------------------
# Can we use the clusters to make predictions directly - with a weighted prediction?
#----------------------------------------------------------------------------------------------------
# Indeed - recall
head(clus_char)
# Each row of 'clu.test' now gives the degree to which that testing observation
# belongs to each cluster, so we average the cluster default rates with those
# weights
pKM <- c()
for (i in 1:dim(test2)[1]) pKM[i] = weighted.mean(clus_char[,3],w=clu.test[i,])
tmp <- confusionMatrix(as.factor((pKM>0.5)*1),reference=true,positive='1')
results[17,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
rocKM <- roc(response=true,predictor=pKM)
results[17,5] <- as.numeric(auc(rocKM))
results
#----------------------------------------------------------------------------------------------------
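#----------------------------------------------------------------------------------------------------
# Sketch: rank the methods evaluated so far by balanced accuracy ('done' is a
# hypothetical name; rows 18-22 are filled later in the session, hence the NA
# filter):
done <- !is.na(results[,'balanced accuracy'])
round(results[done,][order(-results[done,'balanced accuracy']),],3)
#----------------------------------------------------------------------------------------------------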