# ------------------------------------------------------------------------------
# Setup: packages, data, class re-balancing, train/test split and the two
# benchmark classifiers (probability linear model, logistic regression).
# ------------------------------------------------------------------------------
# rm(list=ls())  # never wipe the workspace from inside a script

# Load packages
library(caret)
library(glmnet)
library(tree)
library(rpart)
library(rpart.plot)
library(ranger)
library(pROC)
library(foreach)
library(doParallel)
library(gbm)

# Load data
# NOTE(review): hard-coded absolute path - adjust to your machine or use a
# relative path when sharing the script.
zopa <- read.csv(file='C://Users//ckt//OneDrive - MUNI//Institutions//MU//ML Finance//2024//Week 9//zsnew.csv')

# Number of observations
NN <- dim(zopa)[1]

# Drop under-represented dummy variables (indicators with too few 1s to learn from)
nms.rm <- c('WA5', 'LU7', 'EH54', 'PC.HS', 'RH10', 'CV6', 'PR7', 'BS16',
            'FK2', 'PC.LD', 'PC.EC', 'SE9', 'SE18', 'NG9', 'PC.ZE', 'PC.WC',
            'PC.RUNC', 'PC.NH', 'PC.BG', 'PC.BF')
zopa[, nms.rm] <- NULL
names(zopa)

# The data is unbalanced: far more non-defaults (0) than defaults (1)
table(zopa$default)

# Address the imbalance by random under-sampling of the majority class:
# keep all defaulted loans and an equally sized random subset of good loans.
idx <- which(zopa$default == 0)
set.seed(100)
good.loans <- zopa[idx[sample(1:length(idx), size = sum(zopa$default == 1),
                              replace = FALSE)], ]
bad..loans <- zopa[zopa$default == 1, ]
balanced.zopa <- rbind(good.loans, bad..loans)

# Random shuffle so the two classes are mixed before splitting
NT <- dim(balanced.zopa)[1]
set.seed(110)
balanced.zopa <- balanced.zopa[sample(1:NT, size = NT, replace = FALSE), ]

# Now we can separate the data: training (80%) & testing (20%)
set.seed(115)
idx <- sample(1:NT, size = floor(0.8 * NT), replace = FALSE)
train.zopa <- balanced.zopa[idx, ]
test.zopa  <- balanced.zopa[-idx, ]

# Cost-weighted approach: an unbalanced random sample of the same size as
# train.zopa, with observation weights giving each class total weight 0.5.
set.seed(120)
train.zopa.cw <- zopa[sample(1:NN, size = dim(train.zopa)[1], replace = FALSE), ]
wgt <- rep(0, dim(train.zopa)[1])
wgt[train.zopa.cw$default == 1] <- 0.5 / sum(train.zopa.cw$default == 1)
wgt[train.zopa.cw$default == 0] <- 0.5 / sum(train.zopa.cw$default == 0)
train.zopa.cw$wgt <- wgt

# Specification with all possible features.
# NOTE(review): gen.fm() is not defined in this file - it must be sourced from
# the course helper code before this line runs; presumably it builds a formula
# default ~ <selected columns>. Verify against the helper source.
specs <- gen.fm(dep = 'default', x = names(train.zopa)[c(7, 11:167, 201, 207)])

# The true outcome on the test set
true <- as.factor(test.zopa$default)

# Results table: one row per model, one column per evaluation metric
results <- matrix(NA, nrow = 13, ncol = 5)
colnames(results) <- c('accuracy', 'balanced accuracy', 'sensitivity',
                       'specificity', 'auc')
rownames(results) <- c('PLM', 'Logistic Regression', 'LASSO MIN', 'LASSO 1SE',
                       'RIDGE MIN', 'RIDGE 1SE', 'EN MIN', 'EN 1SE',
                       'DT Deep', 'DT Pruned', 'DT Bagg', 'RF', 'XGB')

# ------------------------------------------------------------------------------
# Benchmark 1: probability linear model (OLS on the 0/1 outcome)
# ------------------------------------------------------------------------------
m0 <- lm(specs, data = train.zopa)
p0 <- predict(m0, newdata = test.zopa)
# OLS predictions are not bounded to [0, 1]; clamp them to the nearest
# admissible predicted value so they can be treated as probabilities.
p0[p0 < 0] <- min(p0[p0 > 0])
p0[p0 > 1] <- max(p0[p0 < 1])

# Evaluation: confusion matrix at the 0.5 threshold + ROC/AUC
tmp <- confusionMatrix(as.factor((p0 > 0.5) * 1), reference = true, positive = '1')
results[1, 1:4] <- c(tmp$overall[c(1)], tmp$byClass[c(11, 1, 2)])
roc0 <- roc(response = true, predictor = p0)
plot(roc0)
results[1, 5] <- as.numeric(auc(roc0))

# ------------------------------------------------------------------------------
# Benchmark 2: logistic regression
# ------------------------------------------------------------------------------
m1 <- glm(specs, data = train.zopa, family = 'binomial')
# predict() returns the linear predictor; map it through the logistic CDF
p1 <- (1 / (1 + exp(-predict(m1, newdata = test.zopa))))

# Evaluation
tmp <- confusionMatrix(as.factor((p1 > 0.5) * 1), reference = true, positive = '1')
results[2, 1:4] <- c(tmp$overall[c(1)], tmp$byClass[c(11, 1, 2)])
roc1 <- roc(response = true, predictor = p1)
plot(roc1, col = 'blue', lty = 3, add = TRUE)
results[2, 5] <- as.numeric(auc(roc1))

# -------------------------------------------------------------------------------
# Now we go to a new class of models
# -------------------------------------------------------------------------------
# Design matrices shared by the LASSO / ridge / elastic-net fits below
x    <- as.matrix(train.zopa[, all.vars(specs)[-1]])
newx <- as.matrix(test.zopa[, all.vars(specs)[-1]])
y    <- as.matrix(train.zopa[, all.vars(specs)[1]])
# This takes a couple of seconds....
# ------------------------------------------------------------------------------
# The LASSO (alpha = 1): 10-fold cross-validated penalized logistic regression.
# NOTE(review): cv.glmnet draws random folds - call set.seed() first if the
# selected lambdas must be reproducible.
# ------------------------------------------------------------------------------
lasso.lr <- cv.glmnet(x = x, y = y, type.measure = 'deviance', nfolds = 10,
                      family = 'binomial', alpha = 1)
# You can check coefficients if you are into that stuff
plot(lasso.lr)
coefficients(lasso.lr, s = 'lambda.min')

# Predictions come back on the link scale as a 1-column matrix: map through
# the logistic CDF and flatten to a plain vector. lambda.min minimizes CV
# deviance; lambda.1se is the most regularized model within one SE of it.
p2min <- as.numeric((1 / (1 + exp(-predict(lasso.lr, newx = newx, s = 'lambda.min')))))
p21se <- as.numeric((1 / (1 + exp(-predict(lasso.lr, newx = newx, s = 'lambda.1se')))))

# Evaluation - min
tmp <- confusionMatrix(as.factor((p2min > 0.5) * 1), reference = true, positive = '1')
results[3, 1:4] <- c(tmp$overall[c(1)], tmp$byClass[c(11, 1, 2)])
roc2min <- roc(response = true, predictor = p2min)
plot(roc0)
plot(roc1, col = 'blue', add = TRUE)
plot(roc2min, col = 'darkgreen', lty = 2, add = TRUE)
results[3, 5] <- as.numeric(auc(roc2min))

# Evaluation - 1se
tmp <- confusionMatrix(as.factor((p21se > 0.5) * 1), reference = true, positive = '1')
results[4, 1:4] <- c(tmp$overall[c(1)], tmp$byClass[c(11, 1, 2)])
roc21se <- roc(response = true, predictor = p21se)
plot(roc0)
plot(roc1, col = 'blue', add = TRUE)
plot(roc2min, col = 'darkgreen', lty = 2, add = TRUE)
plot(roc21se, col = 'darkblue', lty = 2, add = TRUE)
results[4, 5] <- as.numeric(auc(roc21se))

# ------------------------------------------------------------------------------
# The RIDGE (alpha = 0)
# This takes a couple of seconds....
# ------------------------------------------------------------------------------
ridge.lr <- cv.glmnet(x = x, y = y, type.measure = 'deviance', nfolds = 10,
                      family = 'binomial', alpha = 0)
# You can check coefficients if you are into that stuff
plot(ridge.lr)
coefficients(ridge.lr, s = 'lambda.min')
# as.numeric() added for consistency with the LASSO predictions (roc() wants a
# plain vector, not the 1-column matrix that predict() returns)
p3min <- as.numeric((1 / (1 + exp(-predict(ridge.lr, newx = newx, s = 'lambda.min')))))
p31se <- as.numeric((1 / (1 + exp(-predict(ridge.lr, newx = newx, s = 'lambda.1se')))))

# Evaluation - min (filled in so the 'RIDGE MIN' results row is populated)
tmp <- confusionMatrix(as.factor((p3min > 0.5) * 1), reference = true, positive = '1')
results[5, 1:4] <- c(tmp$overall[c(1)], tmp$byClass[c(11, 1, 2)])
roc3min <- roc(response = true, predictor = p3min)
results[5, 5] <- as.numeric(auc(roc3min))

# Evaluation - 1se (filled in so the 'RIDGE 1SE' results row is populated)
tmp <- confusionMatrix(as.factor((p31se > 0.5) * 1), reference = true, positive = '1')
results[6, 1:4] <- c(tmp$overall[c(1)], tmp$byClass[c(11, 1, 2)])
roc31se <- roc(response = true, predictor = p31se)
results[6, 5] <- as.numeric(auc(roc31se))

# The ELASTIC NET
# Use elastic net with alpha (0,1) with lambda.min & lambda.1se -> who wins?
# -------------------------------------------------------------------------------
# Now we go to a new class of models: decision trees and tree ensembles
# -------------------------------------------------------------------------------

# Deep unpruned tree: cp = 0 lets the tree grow until maxdepth/minsplit stop it
t4 <- rpart(specs, data = train.zopa, method = 'class', model = TRUE,
            control = rpart.control(cp = 0, xval = 10, maxdepth = 10))
t4$cptable
t4 <- prune(t4, cp = 0)  # cp = 0 prunes nothing - kept only to show the call
rpart.plot(t4)
p4 <- predict(t4, newdata = test.zopa)[, 2]  # column 2 = P(default == 1)

# Evaluation
tmp <- confusionMatrix(as.factor((p4 > 0.5) * 1), reference = true, positive = '1')
results[9, 1:4] <- c(tmp$overall[c(1)], tmp$byClass[c(11, 1, 2)])
roc4 <- roc(response = true, predictor = p4)
plot(roc0)
plot(roc1, col = 'blue', add = TRUE)
plot(roc2min, col = 'darkgreen', lty = 2, add = TRUE)
plot(roc21se, col = 'darkblue', lty = 2, add = TRUE)
plot(roc4, col = 'red', add = TRUE)
results[9, 5] <- as.numeric(auc(roc4))

# Pruned tree: regrow the deep tree, then cut it back at a chosen cp
t5 <- rpart(specs, data = train.zopa, method = 'class', model = TRUE,
            control = rpart.control(cp = 0, xval = 10, maxdepth = 10))
t5$cptable
rpart.plot(t5)
# NOTE(review): this cp value was read off one particular cptable; for your
# own run pick the cp minimizing the cross-validated error (xerror) instead.
t5 <- prune(t5, cp = 0.0025740026)
rpart.plot(t5)
p5 <- predict(t5, newdata = test.zopa)[, 2]

# Evaluation
tmp <- confusionMatrix(as.factor((p5 > 0.5) * 1), reference = true, positive = '1')
results[10, 1:4] <- c(tmp$overall[c(1)], tmp$byClass[c(11, 1, 2)])
roc5 <- roc(response = true, predictor = p5)
plot(roc0)
plot(roc1, col = 'blue', add = TRUE)
plot(roc2min, col = 'darkgreen', lty = 2, add = TRUE)
plot(roc21se, col = 'darkblue', lty = 2, add = TRUE)
plot(roc4, col = 'red', add = TRUE)
plot(roc5, col = 'purple', add = TRUE)
results[10, 5] <- as.numeric(auc(roc5))

# Bagging - decision trees #####################################################
# One deep tree per bootstrap sample; the bagged prediction is the average
# predicted probability across trees.
B <- 1000  # number of bootstrap samples; use 250 with < 5 cores, 500 with 5-9

# We need to store the test-set predictions of every bootstrapped tree
bagg.pred <- array(NA, dim = c(dim(test.zopa)[1], B))

# We loop over bootstrapped samples in parallel - this takes some time
nc <- max(1L, floor(detectCores() / 2))  # makeCluster() needs a positive integer
cl <- makeCluster(nc)
registerDoParallel(cl)
TMP <- foreach(b = 1:B, .packages = 'rpart') %dopar% {
  # Randomly select observations (bootstrap: with replacement)
  idx <- sample(1:dim(train.zopa)[1], size = dim(train.zopa)[1], replace = TRUE)
  # Estimate a deep tree on the bootstrap sample
  tb <- rpart(specs, data = train.zopa[idx, ], method = 'class', model = TRUE,
              control = rpart.control(cp = 0, xval = 10))
  tb <- prune(tb, cp = 0)
  # Return P(default == 1) on the test set
  predict(tb, newdata = test.zopa)[, 2]
}
stopCluster(cl)

# Store the per-tree predictions (one column per tree)
for (b in 1:B) bagg.pred[, b] <- TMP[[b]]
# Calculate the average prediction (probability)
p6 <- apply(bagg.pred, 1, mean)

# Evaluation
tmp <- confusionMatrix(as.factor((p6 > 0.5) * 1), reference = true, positive = '1')
results[11, 1:4] <- c(tmp$overall[c(1)], tmp$byClass[c(11, 1, 2)])
roc6 <- roc(response = true, predictor = p6)
plot(roc0)
plot(roc1, col = 'blue', add = TRUE)
plot(roc2min, col = 'darkgreen', lty = 2, add = TRUE)
plot(roc21se, col = 'darkblue', lty = 2, add = TRUE)
plot(roc4, col = 'red', add = TRUE)
plot(roc5, col = 'purple', add = TRUE)
plot(roc6, col = 'darkgrey', add = TRUE)
results[11, 5] <- as.numeric(auc(roc6))

# Visualize the distribution of bagged predictions for two test observations
dev.off()  # assumes a plotting device is open (errors otherwise)
par(mfrow = c(1, 2))
par(cex = 1.1)
par(oma = c(1, 1.5, 0.0, 0.0))
par(tcl = -0.25)
par(mgp = c(2, 0.6, 0))
par(mar = c(2.0, 2.0, 1.5, 0.5))
hist(bagg.pred[1, ], breaks = 50)
abline(v = 0.5, col = 'red', lwd = 2)
hist(bagg.pred[80, ], breaks = 50)
abline(v = 0.5, col = 'red', lwd = 2)

# Random forest: like bagging, but each split considers only mtry candidate
# features; max.depth = 0 means unlimited depth in ranger
rf.tree <- ranger(specs, data = train.zopa, num.trees = 1000, mtry = 9,
                  min.node.size = 5, max.depth = 0)
p7 <- predict(rf.tree, data = test.zopa)$predictions

# Evaluation
tmp <- confusionMatrix(as.factor((p7 > 0.5) * 1), reference = true, positive = '1')
results[12, 1:4] <- c(tmp$overall[c(1)], tmp$byClass[c(11, 1, 2)])
roc7 <- roc(response = true, predictor = p7)
dev.off()
plot(roc0)
plot(roc1, col = 'blue', add = TRUE)
plot(roc2min, col = 'darkgreen', lty = 2, add = TRUE)
plot(roc21se, col = 'darkblue', lty = 2, add = TRUE)
plot(roc4, col = 'red', add = TRUE)
plot(roc5, col = 'purple', add = TRUE)
plot(roc6, col = 'darkgrey', add = TRUE)
plot(roc7, col = 'darkred', add = TRUE, lty = 3, lwd = 3)
results[12, 5] <- as.numeric(auc(roc7))

# Boosted classification tree
# NOTE(review): the results row is labelled 'XGB' but this fits a gbm model.
boostedtree <- gbm(specs, data = train.zopa, n.trees = 2000, shrinkage = 0.001,
                   distribution = 'bernoulli')
# n.trees made explicit so predict.gbm does not have to guess how many
# of the fitted trees to use
p8 <- predict(boostedtree, newdata = test.zopa, n.trees = 2000, type = 'response')

# Evaluation
tmp <- confusionMatrix(as.factor((p8 > 0.5) * 1), reference = true, positive = '1')
results[13, 1:4] <- c(tmp$overall[c(1)], tmp$byClass[c(11, 1, 2)])
roc8 <- roc(response = true, predictor = p8)
dev.off()
plot(roc0)
plot(roc1, col = 'blue', add = TRUE)
plot(roc2min, col = 'darkgreen', lty = 2, add = TRUE)
plot(roc21se, col = 'darkblue', lty = 2, add = TRUE)
plot(roc4, col = 'red', add = TRUE)
plot(roc5, col = 'purple', add = TRUE)
plot(roc6, col = 'darkgrey', add = TRUE)
plot(roc7, col = 'darkred', add = TRUE, lty = 3, lwd = 3)
plot(roc8, col = 'lightblue', add = TRUE, lty = 1, lwd = 4)
results[13, 5] <- as.numeric(auc(roc8))

# Final comparison table across all models
results

#####################################
# TASK
#####################################
# Let's play with Titanic
# Choose LR and Two extra models that you think would perform the best
#####################################
titanic <- read.csv(file = '....\\Titanic.csv')  # placeholder - fill in the path
head(titanic)
# Keep complete cases only, then an 80/20 train/test split
titanic <- titanic[complete.cases(titanic), ]
set.seed(100)
N <- dim(titanic)[1]
idx <- sample(1:N, size = floor(0.8 * N), replace = FALSE)
train <- titanic[idx, ]
test  <- titanic[-idx, ]