#---
#title: "Seminar7RM"
#author: "Stefan Lyocsa"
#date: "2024-10-30"
#output: html_document
#---

# Content

# * Import data, load packages
# * Addressing data imbalance
# + Under-sample majority
# + Under-sample majority, over-sample minority
# + Majority and minority weights (cost-weights)
# * Simple decision tree
# + Estimate
# + Visualize
# + Predict
# + Evaluate
# * Probability linear model
# * Logistic regression model
# * Classification model evaluation
# * Titanic

# Import data, load packages

# Load the necessary packages for today
library(caret) # To evaluate classification tasks
library(pROC)  # To evaluate classification tasks

# We have a dataset of Peer-to-Peer (P2P) loans. We would like to build a credit-risk model
# to predict the default of a loan:
# 1 - the loan has defaulted
# 0 - the loan has been repaid
zsnew = read.csv(file='C://Users//ckt//OneDrive - MUNI//Institutions//MU//ML Finance//2024//Week 7//zsnew.csv')

# First, let's separate the dataset into a training and a testing set.
set.seed(200)
idx = sample(1:dim(zsnew)[1],size=floor(dim(zsnew)[1]*0.8),replace=F)
train = zsnew[idx,]
test = zsnew[-idx,]

# Second, let's inspect the dependent variable.
summary(train$default)

# Fascinating... 8.094% of all loans defaulted. Is that a balanced dataset? It does not seem
# to be the case. What should we do? We have 3-4 standard approaches.

## Under-sample majority

# Which loans are good (default = 0) and which are bad (default = 1)?
idx.good = which(train$default==0)
NG = length(idx.good)
idx.bad = which(train$default==1)
NB = length(idx.bad)
# Good loans are the 'majority class' here.
# Take the majority class and under-sample. How many? We should end up with the same number
# of 'good' loans as we have 'bad' loans.
set.seed(300)
tmp = idx.good[sample(1:NG,size=NB,replace=F)]
# Now we can combine the under-sampled good loans with the bad loans.
train.um = rbind(train[tmp,],train[idx.bad,])
# See how many observations we have and check whether the dataset is balanced now.
dim(train.um)
summary(train.um$default)

## Under-sample majority and over-sample minority

# Similar to before, but we select 'R' times the number of minority-class observations.
# Let R = 2.
set.seed(300)
# We first under-sample the majority, almost as before, only the size is now NB*2.
tmp.good = idx.good[sample(1:NG,size=NB*2,replace=F)]
# Next we over-sample the minority and set replace = T. Why? Because we draw NB*2 observations
# from a class that contains only NB of them, which is possible only with replacement.
tmp.bad = idx.bad[sample(1:NB,size=NB*2,replace=T)]
# Now we can combine the under-sampled good loans with the over-sampled bad loans.
train.uo = rbind(train[tmp.good,],train[tmp.bad,])
# Make some data checks.
dim(train.uo)
summary(train.uo$default)

## Cost-weighted approach

# With the following approach we do not lose (sacrifice) any data. Instead, we give the
# observations different weights, such that the weights of the majority class and the weights
# of the minority class sum to the same value. The ML method needs to support this estimation
# approach (i.e. weighted cases).
wgt = rep(0,dim(train)[1])
wgt[train$default==1] = 0.5/NB
wgt[train$default==0] = 0.5/NG
train$wgt = wgt
# Both of the following sums obviously equal 0.5.
sum(train$wgt[train$default==1])
sum(train$wgt[train$default==0])

# Probability linear model

# Let's estimate the following specification:
spec_one = as.formula(default~amount_ln+term+rate+IRD+term*rate+time.start+time.start2)
# We estimate it on the different training samples.
# 1) Unbalanced data
p1 = predict(lm(spec_one,data=train),newdata=test)
summary(p1)
# Oh my! The PLM leads to negative probability estimates... which are not even possible.
# It is what it is...
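
# As a quick sanity check (a minimal sketch, not part of the original seminar code), we can
# measure how often the PLM predictions fall outside the [0,1] interval:
mean(p1 < 0) # share of negative 'probabilities'
mean(p1 > 1) # share of 'probabilities' above one
range(p1)    # overall range of the PLM predictions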
# 2) Majority under-sampling
p2 = predict(lm(spec_one,data=train.um),newdata=test)
# 3) Under-sampling & over-sampling
p3 = predict(lm(spec_one,data=train.uo),newdata=test)
# 4) Cost-weighted (case-weighted) approach - WLS (weighted least squares)
p4 = predict(lm(spec_one,data=train,weights=wgt),newdata=test)

# Logistic regression

# We estimate logistic regression models as well - same approach.
p5 = predict(glm(spec_one,data=train,family='binomial'),newdata=test)
# The predictions from the glm() function are NOT probabilities but the linear predictor 'XB'
# of the logistic function. The transformation to a probability is p = 1/(1+exp(-XB)).
# (Equivalently, predict(...,type='response') would return probabilities directly.)
p5 = 1/(1+exp(-p5))
p6 = predict(glm(spec_one,data=train.um,family='binomial'),newdata=test)
p6 = 1/(1+exp(-p6))
p7 = predict(glm(spec_one,data=train.uo,family='binomial'),newdata=test)
p7 = 1/(1+exp(-p7))
p8 = predict(glm(spec_one,data=train,weights=wgt,family='binomial'),newdata=test)
p8 = 1/(1+exp(-p8))

# Evaluate models

# Let's store the true labels and the predictions into:
predicts = cbind(test$default,p1,p2,p3,p4,p5,p6,p7,p8)
# I'll create a table (data frame) with the results: Accuracy, Specificity, Sensitivity,
# Precision, AUC, Brier score (BS), Cross entropy (CE) and Balanced Accuracy.
results = data.frame(metrics = c('Accuracy','Specificity','Sensitivity',
                                 'Precision','AUC','Brier score','Cross entropy','Balanced Accuracy'),
                     p1=NA,p2=NA,p3=NA,p4=NA,p5=NA,p6=NA,p7=NA,p8=NA)
reference = as.factor(predicts[,1])
probthr = 0.5
for (i in 2:9) {
  # Recall - the PLM gave us negative probability predictions (and it could give predictions
  # above 1 as well), so we clip the predictions into the (0,1) interval first.
  y = predicts[,i]
  y[y<0] = 0.00001
  y[y>1] = 0.99999
  # Fix the factor levels so that confusionMatrix() works even if one class is never predicted.
  prediction = factor((y>probthr)*1,levels=c(0,1))
  tmp = confusionMatrix(data=prediction,reference=reference,positive='1')
  # Brier score
  brier.loss = (y - test$default)^2
  # Cross entropy
  cross.loss = -1*(log(y)*test$default + log(1-y)*(1-test$default))
  results[,i] = c(tmp$overall[1],
                  tmp$byClass[c(2,1,5)],
                  as.numeric(roc(response=reference,predictor=y)$auc),
                  mean(brier.loss),mean(cross.loss),mean(tmp$byClass[c(2,1)]))
}

# Can anyone improve the zsnew model?
# Do NOT use: MIRR, MIRR2, RINV, FV, pri.coll, int.coll, numpay! They are known only AFTER
# the loan ends (look-ahead bias).

# Let's try the Titanic
titanic = read.csv(file='~/Titanic.csv')
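
# A possible first look at the new dataset (a minimal sketch; any column names beyond what
# read.csv() returns are assumptions - e.g. the usual Titanic file has a 0/1 'Survived'
# column - so adjust to the actual file):
str(titanic)     # variable types
summary(titanic) # quick overview
# table(titanic$Survived) # class-balance check, assuming a 'Survived' column exists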