#---
#title: "Seminar7RM"
#author: "Stefan Lyocsa"
#date: "2024-10-30"
#output: html_document
#---

# Content

# * Import data, load packages
# * Addressing data imbalance
# + Under-sample majority
# + Under-sample majority, over-sample minority
# + Majority and minority weights (cost-weights)
# * Simple decision tree
# + Estimate
# + Visualize
# + Predict
# + Evaluate
# * Probability linear model
# * Logistic regression model
# * Classification model evaluation
# * Titanic

# Import data, load packages

# Load the necessary packages for today
library(caret) # To evaluate classification tasks
library(pROC)  # To evaluate classification tasks

# We have a dataset of Peer-to-Peer (P2P) loans. We would like to build a credit-risk model
# to predict the default of a loan:
# 1 - the loan has defaulted
# 0 - the loan has been repaid
zsnew = read.csv(file='C://Users//ckt//OneDrive - MUNI//Institutions//MU//ML Finance//2024//Week 7//zsnew.csv')

# First, let's separate the dataset into a training and a testing set.
set.seed(200)
idx = sample(1:dim(zsnew)[1],size=floor(dim(zsnew)[1]*0.8),replace=F)
train = zsnew[idx,]
test = zsnew[-idx,]

# Second, let's inspect the dependent variable.
summary(train$default)

# Fascinating... 8.094% of all loans defaulted. Is that a balanced dataset? It does not seem
# to be the case. What should we do? We have 3-4 standard approaches.

## Under-sample majority

# Which loans are good (default = 0) and which are bad (default = 1)?
idx.good = which(train$default==0)
NG = length(idx.good)
idx.bad = which(train$default==1)
NB = length(idx.bad)
# Good loans are the 'majority class' here.
# Take the majority class and under-sample. How many? We should end up with the same number
# of 'good' loans as we have 'bad' loans.
set.seed(300)
tmp = idx.good[sample(1:NG,size=NB,replace=F)]
# Now we can combine the under-sampled good loans with the bad loans.
train.um = rbind(train[tmp,],train[idx.bad,])
# See how many observations we have and check whether the dataset is balanced now.
dim(train.um)
summary(train.um$default)

## Under-sample majority and over-sample minority

# Similar to before, but we select 'R' times the number of minority-class observations.
# Let R = 2.
set.seed(300)
# We first under-sample the majority, almost as before, only the size is now NB*2.
tmp.good = idx.good[sample(1:NG,size=NB*2,replace=F)]
# Next we over-sample the minority and set replace = T. Why? Because we draw NB*2 observations
# from a class that contains only NB of them, which is possible only with replacement.
tmp.bad = idx.bad[sample(1:NB,size=NB*2,replace=T)]
# Now we can combine the under-sampled good loans with the over-sampled bad loans.
train.uo = rbind(train[tmp.good,],train[tmp.bad,])
# Make some data checks.
dim(train.uo)
summary(train.uo$default)

## Cost-weighted approach

# With the following approach we do not lose (sacrifice) any data. Instead, we give the
# observations different weights, such that the weights of the majority class and the weights
# of the minority class sum to the same value. The ML method needs to support this estimation
# approach (i.e. weighted cases).
wgt = rep(0,dim(train)[1])
wgt[train$default==1] = 0.5/NB
wgt[train$default==0] = 0.5/NG
train$wgt = wgt
# Both of the following sums obviously equal 0.5.
sum(train$wgt[train$default==1])
sum(train$wgt[train$default==0])

# Probability linear model

# Let's estimate the following specification:
spec_one = as.formula(default~amount_ln+term+rate+IRD+term*rate+time.start+time.start2)
# We estimate it on the different training samples.
# 1) Unbalanced data
p1 = predict(lm(spec_one,data=train),newdata=test)
summary(p1)
# Oh my! The PLM leads to negative probability estimates... which are not even possible.
# It is what it is...
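
# As a quick sanity check (a minimal sketch, not part of the original seminar code), we can
# measure how often the PLM predictions fall outside the [0,1] interval:
mean(p1 < 0) # share of negative 'probabilities'
mean(p1 > 1) # share of 'probabilities' above one
range(p1)    # overall range of the PLM predictions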
# 2) Majority under-sampling
p2 = predict(lm(spec_one,data=train.um),newdata=test)
# 3) Under-sampling & over-sampling
p3 = predict(lm(spec_one,data=train.uo),newdata=test)
# 4) Cost-weighted (case-weighted) approach - WLS (weighted least squares)
p4 = predict(lm(spec_one,data=train,weights=wgt),newdata=test)

# Logistic regression

# We estimate logistic regression models as well - same approach.
p5 = predict(glm(spec_one,data=train,family='binomial'),newdata=test)
# The predictions from the glm() function are NOT probabilities but the linear predictor 'XB'
# of the logistic function. The transformation to a probability is p = 1/(1+exp(-XB)).
# (Equivalently, predict(...,type='response') would return probabilities directly.)
p5 = 1/(1+exp(-p5))
p6 = predict(glm(spec_one,data=train.um,family='binomial'),newdata=test)
p6 = 1/(1+exp(-p6))
p7 = predict(glm(spec_one,data=train.uo,family='binomial'),newdata=test)
p7 = 1/(1+exp(-p7))
p8 = predict(glm(spec_one,data=train,weights=wgt,family='binomial'),newdata=test)
p8 = 1/(1+exp(-p8))

# Evaluate models

# Let's store the true labels and the predictions into:
predicts = cbind(test$default,p1,p2,p3,p4,p5,p6,p7,p8)
# I'll create a table (data frame) with the results: Accuracy, Specificity, Sensitivity,
# Precision, AUC, Brier score (BS), Cross entropy (CE) and Balanced Accuracy.
results = data.frame(metrics = c('Accuracy','Specificity','Sensitivity',
                                 'Precision','AUC','Brier score','Cross entropy','Balanced Accuracy'),
                     p1=NA,p2=NA,p3=NA,p4=NA,p5=NA,p6=NA,p7=NA,p8=NA)
reference = as.factor(predicts[,1])
probthr = 0.5
for (i in 2:9) {
  # Recall - the PLM gave us negative probability predictions (and it could give predictions
  # above 1 as well), so we clip the predictions into the (0,1) interval first.
  y = predicts[,i]
  y[y<0] = 0.00001
  y[y>1] = 0.99999
  # Fix the factor levels so that confusionMatrix() works even if one class is never predicted.
  prediction = factor((y>probthr)*1,levels=c(0,1))
  tmp = confusionMatrix(data=prediction,reference=reference,positive='1')
  # Brier score
  brier.loss = (y - test$default)^2
  # Cross entropy
  cross.loss = -1*(log(y)*test$default + log(1-y)*(1-test$default))
  results[,i] = c(tmp$overall[1],
                  tmp$byClass[c(2,1,5)],
                  as.numeric(roc(response=reference,predictor=y)$auc),
                  mean(brier.loss),mean(cross.loss),mean(tmp$byClass[c(2,1)]))
}

# Can anyone improve the zsnew model?
# Do NOT use: MIRR, MIRR2, RINV, FV, pri.coll, int.coll, numpay! They are known only AFTER
# the loan ends (look-ahead bias).

# Let's try the Titanic
titanic = read.csv(file='~/Titanic.csv')
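
# A possible first look at the new dataset (a minimal sketch; any column names beyond what
# read.csv() returns are assumptions - e.g. the usual Titanic file has a 0/1 'Survived'
# column - so adjust to the actual file):
str(titanic)     # variable types
summary(titanic) # quick overview
# table(titanic$Survived) # class-balance check, assuming a 'Survived' column exists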