# The following command clears the workspace!
rm(list=ls())

# This clears the plot device (note: it throws an error if no device is open)
dev.off()

# We load the data file. FinTechP2P is an R object already.
load(file='C:\\Users\\EU\\Dropbox\\CNB FIN-TECH\\Prague\\FinTechP2P')

# Alternatively, we could load the *.csv file
DT = read.csv(file='C:\\Users\\EU\\Dropbox\\CNB FIN-TECH\\Prague\\FinTechP2P.csv')

# The name of the dataset is DT. There are 4157 observations and 43 variables.
# Each row is a loan; each column is a variable.
# Variable names are abbreviations. We can take a look at them.
names(DT)
# More details about the variables are given in Case study 3.

##############################################
# Case study 1: Simple linear regression model
##############################################

# We are interested in explaining the internal rate of return of micro-loans
# (on a P2P market) via a variable that signals whether the income of the
# borrower was NOT verified.

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Visualization
# %%%%%%%%%%%%%%%%%%%%%%%%%

# Plot the internal rate of return variable. The name of the variable is RR2.
plot(DT$RR2,type='p',pch=19,cex=0.25,xlab='Loans',ylab='Internal rate of return')
abline(h=0,lwd=2,col='red')
hist(DT$RR2,breaks=100,xlab='Return',prob=T,main='Distribution of returns')
abline(v=0,lwd=2,col='red')

# Plot the variable that codes whether the income of the borrower was NOT verified.
# The name of the variable is ver2. Note that table() reports the 0 (verified)
# category first, so the labels are ordered accordingly.
pie(table(DT$ver2),labels=c("Verified","Not verified"),col=c("white","red"))

# We can see the percentages of 1 (not verified) and 0 (verified), rounded to 2 decimal places:
prct = round(100*table(DT$ver2)/sum(table(DT$ver2)),2)
prct

# And a little tweak of the pie chart, with the percentages in it
pie(table(DT$ver2),labels=paste(c("Verified","Not verified")," - ",prct,"%",sep=""),col=c("white","red"))

# We can visualize the rate of return conditional on verification
boxplot(DT$RR2~DT$ver2,pch=19,cex=0.25)

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Descriptive statistics
# %%%%%%%%%%%%%%%%%%%%%%%%%

# For convenience we create a new object out of the dependent variable and name it 'y'
y = DT$RR2

# We need a new package for the skewness and kurtosis functions.
# Run the following line only once:
# install.packages('moments')
# Now we load the package into the workspace
library(moments)

# We calculate the mean, standard deviation, min, median, max, skewness and kurtosis,
# rounded to 2 decimal places to make the results more readable
round(c(mean(y),sd(y),min(y),median(y),max(y),skewness(y),kurtosis(y)),2)

# We can also look at the % of loans that defaulted completely
round(100*sum(y==-100)/length(y),2)
# Or at the % of loans with a return below 0 that did not fully default
round(100*sum(y>-100 & y<0)/length(y),2)

# As the independent variable is a 'dummy' variable, it can be fully summarized by a table
table(DT$ver2)
# Or by percentages
prct
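# A small sketch of what the dummy regression below estimates: with a single
# 0/1 regressor, the OLS intercept equals the mean return of the baseline
# group (ver2 = 0) and the slope equals the difference in group means.
grp_means = tapply(DT$RR2, DT$ver2, mean)
grp_means
diff(grp_means)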
# %%%%%%%%%%%%%%%%%%%%%%%%%
# Model estimation: OLS
# %%%%%%%%%%%%%%%%%%%%%%%%%

# The function lm() estimates the model. Note that a constant is included by
# default: one does not need to write RR2~1+ver2, it is enough to write RR2~ver2.
m1 = lm(RR2~ver2,data=DT)
# Coefficient estimates are
m1
# More detailed results
summary(m1)
# Take a look at the coefficients
# Take a look at the (uncertainty) variance of coefficient estimates (hence the p-value)
# Take a look at the R2 of the model
# Take a look at important (testable) model assumptions
# With cross-sectional data - particularly heteroscedasticity

# We need to download and install a new library. Run this line only once:
# install.packages("lmtest")
# We load the library into our current workspace
library(lmtest)
# Now we run the basic heteroscedasticity test of Breusch and Pagan,
# implemented in the 'lmtest' package as bptest()
bptest(m1)
# Further tests to consider: normality of residuals, model specification

# As heteroscedasticity is present (p-value < alpha = 0.05), inference is made
# using (a) consistent estimator(s) of the standard errors of the coefficients.
# We need a new library:
# install.packages("sandwich")
library(sandwich)
coeftest(m1, vcov=vcovHC(m1,type='HC0'))
# Now interpret the statistical significance of the coefficients.
# Further issues to consider: residual diagnostics (outliers) and dependence
# structures (clustering, spatial, in time).

##############################################
# Case study 1: Alternative version
##############################################

################################
# We are interested in explaining the annualized interest rate on the loan
# (the variable 'int') via the income-verification dummy (ver2).
################################

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Visualization
# %%%%%%%%%%%%%%%%%%%%%%%%%

# Let's take a look at the interest rate variable. The name of the variable is int.
plot(y=DT$int,x=DT$end_date,type='p',pch=19,cex=0.25,xlab='Date',ylab='Annualized interest rate')
hist(DT$int,breaks=100,xlab='Interest rate',prob=T,main='Distribution of interest rates')

# We can visualize the interest rate conditional on verification
boxplot(DT$int~DT$ver2,pch=19,cex=0.25)

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Descriptive statistics
# %%%%%%%%%%%%%%%%%%%%%%%%%
y = DT$int
round(c(mean(y),sd(y),min(y),median(y),max(y),skewness(y),kurtosis(y)),2)

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Model estimation: OLS
# %%%%%%%%%%%%%%%%%%%%%%%%%
m2 = lm(int~ver2,data=DT)
m2
summary(m2)
bptest(m2)
coeftest(m2, vcov=vcovHC(m2,type='HC0'))

##############################################
# Case study 2:
##############################################

# We are interested in explaining the return on the loan (RR2) via the interest rate.

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Visualization
# %%%%%%%%%%%%%%%%%%%%%%%%%
plot(y=DT$int,x=DT$end_date,type='p',pch=19,cex=0.25,xlab='Date',ylab='Annualized interest rate')
hist(DT$int,breaks=100,xlab='Interest rate',prob=T,main='Distribution of interest rates')
plot(y=DT$RR2,x=DT$int,pch=19,cex=0.25,xlab='Interest rate',ylab='Realized return')

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Model estimation: OLS
# %%%%%%%%%%%%%%%%%%%%%%%%%
m3 = lm(RR2~int,data=DT)
m3
summary(m3)
bptest(m3)
coeftest(m3, vcov=vcovHC(m3,type='HC0'))
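# A minimal sketch of what vcovHC(..., type='HC0') computes, done by hand for
# model m3: the White/Eicker sandwich (X'X)^{-1} X' diag(e^2) X (X'X)^{-1}.
X = model.matrix(m3)                 # design matrix (constant included)
e = residuals(m3)                    # OLS residuals
XtXi = solve(crossprod(X))           # (X'X)^{-1}
meat = crossprod(X * e)              # X' diag(e_i^2) X
sqrt(diag(XtXi %*% meat %*% XtXi))   # HC0 standard errors, matching coeftest above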
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#########################################################
# You can check the multivariate regression model example
#########################################################
m1=lm(RR2~ver2,data=DT);summary(m1)
res1 = residuals(m1)
m4=lm(int~ver2,data=DT);summary(m4)
res4 = residuals(m4)
# The coefficient here (this is the Frisch-Waugh-Lovell result) ...
m5=lm(res1~-1+res4); summary(m5)
# ... should be (and is) the same as the coefficient on 'int' here ...
m6 = lm(RR2~int+ver2,data=DT)
summary(m6)

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
################################################
# Case study 3: Multiple linear regression model
################################################

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Data description
# %%%%%%%%%%%%%%%%%%%%%%%%%

###########################################################################
# We are interested in explaining the return on the loan (RR2) via multiple
# independent variables:
# new         - 1 if the customer is new, 0 otherwise
# ver2        - 1 if income is not verified, 0 otherwise
# ver3        - 1 if income is verified, 0 otherwise
# ver4        - 1 if income and expenses are verified, 0 otherwise
# lfi         - 1 if the language of the borrower is Finnish, 0 otherwise
# lee         - 1 if the language of the borrower is Estonian, 0 otherwise
# luk         - 1 if the language of the borrower is English, 0 otherwise
# lrs         - 1 if the language of the borrower is Russian, 0 otherwise
# lsk         - 1 if the language of the borrower is Slovak, 0 otherwise
# German and Spanish are the remaining languages
# age         - age of the borrower
# female      - 1 if the borrower is a female, 0 otherwise
# undG        - 1 if the gender is left undefined, 0 otherwise
# Male is the category missing here
# lamt        - the log (natural, henceforth) of the amount the borrower received
# int         - interest on the loan
# durm        - planned loan duration (in months)
# educprim    - 1 primary education, 0 otherwise
# educbasic   - 1 basic education, 0 otherwise
# educvocat   - 1 vocational education, 0 otherwise
# educsec     - 1 secondary education, 0 otherwise
# Higher education is the category missing
# msmar       - 1 married, 0 otherwise
# msco        - 1 cohabitation, 0 otherwise
# mssi        - 1 single, 0 otherwise
# msdi        - 1 divorced, 0 otherwise
# Widowed is the category missing
# espem       - 1 partially employed, 0 otherwise
# esfue       - 1 fully employed, 0 otherwise
# essem       - 1 self-employed, 0 otherwise
# esent       - 1 entrepreneur, 0 otherwise
# esret       - 1 retiree, 0 otherwise
# Unemployed is the category missing
# dures       - current employment duration, in ordered categories: 0 - nothing, 1 - TrialPeriod, 2 - UpTo1Year, 3 - UpTo2Years, 4 - UpTo3Years, 5 - UpTo4Years, 6 - UpTo5Years, 7 - MoreThan5Years
# exper       - work experience, in ordered categories: 0 - no, 1 - LessThan2Years, 2 - 2To5Years, 3 - 5To10Years, 4 - 10To15Years, 5 - 15To25Years, 6 - MoreThan25Years
# linctot     - log of total income of the borrower
# noliab      - number of existing liabilities of the borrower
# lliatot     - log of the total amount of current liabilities
# norli       - number of refinanced liabilities
# noplo       - number of previous loans
# lamountplo  - log of the total amount of the previous loans
# lamntplr    - log of the total amount of previous loans that were paid off
# lamteprl    - log of the total amount of previous loans that were paid off before the loan's duration
# nopearlyrep - number of previous early re-paid loans
# nrodep      - number of dependants in the household

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Sample splitting
# %%%%%%%%%%%%%%%%%%%%%%%%%

# Number of forecasted loans
NF = 100
# Overall number of observations
N = dim(DT)[1]
# The sample used to estimate the model
Sample1 = DT[1:(N-NF),]
# The sample used to predict (out-of-sample) loan returns
Sample2 = DT[(N-NF+1):N,]
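# A quick sanity-check sketch: the verification dummies should be mutually
# exclusive and exhaustive. If ver2 + ver3 + ver4 always equals 1, one
# category must be left out of the regression below (here ver2 serves as
# the baseline) to avoid the dummy-variable trap.
table(DT$ver2 + DT$ver3 + DT$ver4)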
# %%%%%%%%%%%%%%%%%%%%%%%%%
# Model estimation
# %%%%%%%%%%%%%%%%%%%%%%%%%

# Model estimation
m7 = lm(RR2~new+ver3+ver4+lfi+lee+luk+lrs+lsk+age+undG+
          female+lamt+int+durm+educprim+educbasic+
          educvocat+educsec+msmar+msco+mssi+msdi+nrodep+
          espem+esfue+essem+esent+esret+dures+exper+
          linctot+noliab+lliatot+norli+noplo+lamountplo+
          lamntplr+lamteprl+nopearlyrep,data=Sample1)

# We take a look at the model fit and the coefficients
summary(m7)
# Heteroscedasticity in these types of models is almost guaranteed, as they
# are not well specified (many non-linear effects)
bptest(m7)
# (Usually) more conservative standard errors
coeftest(m7, df = Inf, vcov = vcovHC(m7, type = "HC0"))

# Check for near-perfect collinearity using the variance inflation factor (VIF).
# We need a new library; install it only once:
# install.packages('car')
# Calling the library
library(car)
# Now calculating the VIF.
# If any of the variables has VIF > 10 and its coefficient is non-significant,
# multicollinearity might be an issue.
which(vif(m7)>10)
# Removing 'msco' would lead to a model with no collinearity issues, but it
# would not improve the model. We thus work with the m7 model.

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Return prediction
# %%%%%%%%%%%%%%%%%%%%%%%%%

# Let's predict loan returns using the estimated model
yhat = predict(m7,new=Sample2)
# We save the realized (true) loan returns
ytrue = Sample2$RR2

# We can visualize our accuracy
plot(y=ytrue,x=yhat,pch=19,cex=0.25,ylim=c(min(yhat,ytrue),max(yhat,ytrue)),
     xlim=c(min(yhat,ytrue),max(yhat,ytrue)),xlab='Predicted returns',ylab='Realized returns')

# We can take a look at the true and forecasted returns
cbind(yhat,ytrue)
# On many occasions the errors are small, but particularly for negative
# returns the errors tend to increase a lot.

# There are several metrics to measure the accuracy of a forecast.
# The most standard one is the Mean Squared Error. We can use the Mean Squared
# Forecast Error (MSFE) to compare forecasts between different models and
# choose the one with the lowest MSFE.
mean((yhat-ytrue)^2)
# Mean absolute deviation
mean(abs(yhat-ytrue))
# Histogram of forecast errors
hist(abs(yhat-ytrue),main='Forecast errors')
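# A small hedged addition: the MSFE of a naive benchmark that predicts the
# in-sample mean return for every loan. If m7 cannot beat this, the
# covariates add little predictive value out of sample. (Sketch only.)
ynaive = rep(mean(Sample1$RR2), NF)
mean((ynaive-ytrue)^2)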
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#########################################################
# Examples of overfitting: Model bias & variance
#########################################################

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Generate data
# %%%%%%%%%%%%%%%%%%%%%%%%%

# Number of observations
N = 50
# Random seed number
rs = 10
# Set random seed
set.seed(rs)
# Generate the independent variable
x = abs(rnorm(N))+1
# Set random seed
set.seed(rs+7)
# Generate the error terms
u = rnorm(N,0,0.25)
# Generate the 'true' relationship, not influenced by noise
yt = (2 + log(x))
# Generate the dependent variable
y = (yt + u)
# Stack the data into a data frame
sms = data.frame(x,y,yt,u)
# Sort the data according to the 'x' variable
sms = sms[order(sms$x),]

# Plot the observations, 'y' against 'x'
plot(x=sms$x,y=sms$y,type='p',pch=19,xlab='x',ylab='y')
# Draw the true 'line'
lines(x=sms$x,y=sms$yt,col='red',lwd=2)
legend('bottomright',legend=c('Observed value','True relationship'),
       pch=19,col=c('black','red'),bty='n')

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Sample splitting
# %%%%%%%%%%%%%%%%%%%%%%%%%

# Now we split the sample into 2 parts.
# Testing sample
set.seed(rs)
# Which observations (rows) are we going to select for the testing sample?
# We select N/2 observations for the testing sample
idx = c(sample(1:(N/2),20),sample((N/2+1):N,5))
tst = sms[idx,]
# Validation sample: the remaining observations
val = sms[-idx,]

# Let's plot the data from the testing dataset.
# Plot the observations, 'y' against 'x'
plot(x=tst$x,y=tst$y,type='p',pch=19,xlab='x',ylab='y',xlim=c(min(sms$x),max(sms$x)),ylim=c(min(sms$y),max(sms$y)))
# To keep the true line in perspective, we add it
lines(x=sms$x,y=sms$yt,col='red',lwd=2)
points(x=val$x,y=val$y,col='green',pch=19)
legend('topleft',legend=c('Testing sample','Validation sample','True relationship'),pch=19,bty='n',col=c('black','green','red'))

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Model estimation: OLS
# %%%%%%%%%%%%%%%%%%%%%%%%%

# Estimate a simple OLS model
m8 = lm(y~x,data=tst)
m8
# Plot the line
plot(x=tst$x,y=tst$y,type='p',pch=19,xlab='x',ylab='y',xlim=c(min(sms$x),max(sms$x)),ylim=c(min(sms$y),max(sms$y)))
lines(x=sms$x,y=sms$yt,col='red',lwd=2)
abline(m8,col='blue',lwd=3)
legend('topleft',legend=c('Testing sample','True relationship','Linear model'),pch=19,bty='n',col=c('black','red','blue'))

# Now let's estimate a polynomial model via OLS (increasing complexity)
m9 = lm(y~x+I(x^2)+I(x^3)+I(x^4)+I(x^5),data=tst)
m9
# Plot the curve
poly = data.frame(tst$x,fitted(m9)); names(poly) = c("x","ytp")
poly = poly[order(poly$x),]
plot(x=tst$x,y=tst$y,type='p',pch=19,xlab='x',ylab='y',xlim=c(min(sms$x),max(sms$x)),ylim=c(min(sms$y),max(sms$y)))
abline(m8,col='blue',lwd=3)
lines(x=poly$x,y=poly$ytp,col='purple',lwd=2)
legend('topleft',legend=c('Testing sample','Linear model','Polynomial'),pch=19,bty='n',col=c('black','blue','purple'))
# The purple curve fits the data much better... on the testing sample.

# Let's compare the fit using mean squared errors.
# First the line:
MSE_m8=mean((tst$y-fitted(m8))^2)
MSE_m8
# Then the polynomial:
MSE_m9=mean((tst$y-fitted(m9))^2)
MSE_m9
# The improvement?
100*(MSE_m8-MSE_m9)/MSE_m8

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Predictions
# %%%%%%%%%%%%%%%%%%%%%%%%%

# Is the polynomial going to out-perform the simple linear model?
# We now predict 'y' on the validation sample.
# First using the linear model
fm8 = predict(m8,new=val)
# Second using the polynomial model
fm9 = predict(m9,new=val)

# Let's plot the 'new' points and the 'old' curves
plot(x=val$x,y=val$y,pch=19,col='green',
     xlim=c(min(sms$x),max(sms$x)),
     ylim=c(min(c(sms$y,fm8,fm9)),max(c(sms$y,fm8,fm9))),
     xlab='x',ylab='y')
abline(m8,col='blue',lwd=3)
lines(x=poly$x,y=poly$ytp,col='purple',lwd=2)
points(x=val$x,y=val$y,pch=19,col='green')
legend('topleft',legend=c('Testing sample','Linear model','Polynomial','Validation sample'),pch=19,bty='n',col=c('black','blue','purple','green'))

# We now plot the forecasts
plot(x=val$x,y=val$y,pch=19,col='green',
     xlim=c(min(sms$x),max(sms$x)),
     ylim=c(min(c(sms$y,fm8,fm9)),max(c(sms$y,fm8,fm9))),
     xlab='x',ylab='y')
points(x=val$x,y=fm8,col='blue',pch=19)
points(x=val$x,y=fm9,col='purple',pch=19)
legend('topleft',bty='n',pch=19,legend=c("True values","Line","Curve"),col=c('green','blue','purple'))

# Now let's check the MSF(orecast)E
MSFE_m8 = mean((val$y-fm8)^2)
MSFE_m8
MSFE_m9 = mean((val$y-fm9)^2)
MSFE_m9
# The improvement?
100*(MSFE_m8-MSFE_m9)/MSFE_m8
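# A short sketch (using the tst/val objects from above) that traces how the
# out-of-sample MSFE changes with the polynomial degree - the classic
# bias-variance trade-off in one loop:
for (p in 1:8) {
  mp = lm(y~poly(x,p),data=tst)
  cat('degree',p,'MSFE:',round(mean((val$y-predict(mp,new=val))^2),4),'\n')
}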
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
################################################
# Case study 4: Ridge regression
################################################

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Sample splitting
# %%%%%%%%%%%%%%%%%%%%%%%%%

# We split the sample as before.
# Number of observations we use for the out-of-sample study
NF = 100
N = dim(DT)[1]
# Now we create the two samples
tst = DT[1:(N-NF),]       # in-sample (testing sample)
val = DT[((N-NF)+1):N,]   # out-of-sample (validation sample)

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Model estimation
# %%%%%%%%%%%%%%%%%%%%%%%%%

# We again estimate the OLS model
m7 = lm(RR2~new+ver3+ver4+lfi+lee+luk+lrs+lsk+age+undG+
          female+lamt+int+durm+educprim+educbasic+
          educvocat+educsec+msmar+msco+mssi+msdi+nrodep+
          espem+esfue+essem+esent+esret+dures+exper+
          linctot+noliab+lliatot+norli+noplo+lamountplo+
          lamntplr+lamteprl+nopearlyrep,data=tst)

# Now the ridge regression.
# We need to install the library - do it only once:
# install.packages('glmnet')
library(glmnet)

# We first need to prepare the data in a format that the function glmnet()
# understands: a matrix of independent variables
indep = as.matrix(tst[,c("new","ver3","ver4","lfi","lee","luk","lrs","lsk","age",
                         "undG","female","lamt","int","durm","educprim","educbasic",
                         "educvocat","educsec","msmar","msco","mssi","msdi","nrodep",
                         "espem","esfue","essem","esent","esret","dures","exper",
                         "linctot","noliab","lliatot","norli","noplo","lamountplo",
                         "lamntplr","lamteprl","nopearlyrep")])
dep = tst[,"RR2"]
# Select variables for prediction purposes - to be used later
pred = as.matrix(val[,c("new","ver3","ver4","lfi","lee","luk","lrs","lsk","age",
                        "undG","female","lamt","int","durm","educprim","educbasic",
                        "educvocat","educsec","msmar","msco","mssi","msdi","nrodep",
                        "espem","esfue","essem","esent","esret","dures","exper",
                        "linctot","noliab","lliatot","norli","noplo","lamountplo",
                        "lamntplr","lamteprl","nopearlyrep")])

# We set the number of folds for cross-validation purposes to 30
CV = cv.glmnet(x=indep,y=dep,nfolds=30,alpha=0)
# We plot the relationship between lambda and the MSE
plot(CV)
# Which lambda minimizes the MSE?
CV$lambda.min
# Which lambda corresponds to min(MSE) + 1 standard error of the MSE?
CV$lambda.1se

# You can take a look at the coefficients of the ridge regression and compare them with OLS
round(cbind(coefficients(m7),coef(CV,s='lambda.min'),coef(CV,s='lambda.1se')),4)
# This is interesting, as it shows how the coefficients are shrunk. Some are
# shrunk towards zero, while others are allowed to remain further away from 0,
# suggesting that these coefficients play an important role in predictions.

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Predictions
# %%%%%%%%%%%%%%%%%%%%%%%%%

# We perform the prediction using coefficients from 'm7' and new data from the validation sample
yOLS = predict(m7,new=val)
# We save the realized (true) loan returns
ytrue = val$RR2
# We calculate the MSFE
MSEOLS = mean((yOLS-ytrue)^2)

###############
# Now the ridge predictions using the glmnet package
yRIDGEmin=predict(CV,newx=pred,s=CV$lambda.min)
MSER1 = mean((ytrue-yRIDGEmin)^2)
yRIDGE1se=predict(CV,newx=pred,s=CV$lambda.1se)
MSER2 = mean((ytrue-yRIDGE1se)^2)
# Compare errors
cbind(MSEOLS, MSER1, MSER2)
# Both ridge regression predictions have slightly out-performed the OLS!
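# An optional sketch: the full ridge coefficient path as a function of lambda.
# glmnet() (without cv.) fits the entire path; the plot shows all coefficients
# shrinking smoothly towards zero as the penalty grows.
ridge_path = glmnet(x=indep,y=dep,alpha=0)
plot(ridge_path,xvar='lambda',label=TRUE)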
################################################
# Case study 5: LASSO regression
################################################

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Model estimation: LASSO
# %%%%%%%%%%%%%%%%%%%%%%%%%

# We use the objects from the previous case study.
# Perform the CV (note: the only difference is that now alpha=1 instead of 0)
CV = cv.glmnet(x=indep,y=dep,nfolds=30,alpha=1)
# We plot the relationship between lambda and the MSE
plot(CV)
# Which lambda minimizes the MSE?
CV$lambda.min
# Which lambda corresponds to min(MSE) + 1 standard error of the MSE?
CV$lambda.1se

# You can take a look at the coefficients of the LASSO regression and compare them with OLS
round(cbind(coefficients(m7),coef(CV,s='lambda.min'),coef(CV,s='lambda.1se')),4)
# Several coefficients are reduced to 0! These variables are interpreted as
# not being very useful in predicting the dependent variable.

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Predictions
# %%%%%%%%%%%%%%%%%%%%%%%%%
yLASSOmin=predict(CV,newx=pred,s=CV$lambda.min)
MSEL1 = mean((ytrue-yLASSOmin)^2)
yLASSO1se=predict(CV,newx=pred,s=CV$lambda.1se)
MSEL2 = mean((ytrue-yLASSO1se)^2)
# Compare errors
cbind(MSEOLS, MSER1, MSER2, MSEL1, MSEL2)
# The LASSO with lambda.1se out-performed the OLS and the ridge!

################################################
# Case study 6: Elastic net
################################################

# %%%%%%%%%%%%%%%%%%%%%%%%%%
# Predictions: EN alpha=0.25
# %%%%%%%%%%%%%%%%%%%%%%%%%%
CV = cv.glmnet(x=indep,y=dep,nfolds=30,alpha=0.25)
CV$lambda.min
CV$lambda.1se
round(cbind(coefficients(m7),coef(CV,s='lambda.min'),coef(CV,s='lambda.1se')),4)
yNET025min=predict(CV,newx=pred,s=CV$lambda.min)
MSEEN1.1 = mean((ytrue-yNET025min)^2)
yNET0251se=predict(CV,newx=pred,s=CV$lambda.1se)
MSEEN1.2 = mean((ytrue-yNET0251se)^2)

# %%%%%%%%%%%%%%%%%%%%%%%%%%
# Predictions: EN alpha=0.50
# %%%%%%%%%%%%%%%%%%%%%%%%%%
CV = cv.glmnet(x=indep,y=dep,nfolds=30,alpha=0.50)
CV$lambda.min
CV$lambda.1se
round(cbind(coefficients(m7),coef(CV,s='lambda.min'),coef(CV,s='lambda.1se')),4)
yNET050min=predict(CV,newx=pred,s=CV$lambda.min)
MSEEN2.1 = mean((ytrue-yNET050min)^2)
yNET0501se=predict(CV,newx=pred,s=CV$lambda.1se)
MSEEN2.2 = mean((ytrue-yNET0501se)^2)

# %%%%%%%%%%%%%%%%%%%%%%%%%%
# Predictions: EN alpha=0.75
# %%%%%%%%%%%%%%%%%%%%%%%%%%
CV = cv.glmnet(x=indep,y=dep,nfolds=30,alpha=0.75)
CV$lambda.min
CV$lambda.1se
round(cbind(coefficients(m7),coef(CV,s='lambda.min'),coef(CV,s='lambda.1se')),4)
yNET075min=predict(CV,newx=pred,s=CV$lambda.min)
MSEEN3.1 = mean((ytrue-yNET075min)^2)
yNET0751se=predict(CV,newx=pred,s=CV$lambda.1se)
MSEEN3.2 = mean((ytrue-yNET0751se)^2)

# %%%%%%%%%%%%%%%%%%%%%%%%%%
# Compare all results
# %%%%%%%%%%%%%%%%%%%%%%%%%%
MSEs = c(MSEOLS,MSER1,MSER2,MSEL1,MSEL2,MSEEN1.1,MSEEN1.2,MSEEN2.1,MSEEN2.2,MSEEN3.1,MSEEN3.2)
names(MSEs) = c("OLS","Ridge_M","Ridge_1","LASSO_M","LASSO_1",
                "EN25_M","EN25_1","EN50_M","EN50_1","EN75_M","EN75_1")
MSEs = sort(MSEs)
cbind(MSEs)
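# A compact sketch of the same alpha search as a loop (assumes the indep,
# dep, pred and ytrue objects from above). Note that cv.glmnet() assigns
# folds randomly, so results vary between runs unless a seed is set first.
for (a in seq(0,1,by=0.25)) {
  CVa = cv.glmnet(x=indep,y=dep,nfolds=30,alpha=a)
  yh = predict(CVa,newx=pred,s=CVa$lambda.min)
  cat('alpha =',a,' MSFE:',round(mean((ytrue-yh)^2),2),'\n')
}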
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
####################################################
# FIGURES & CODES FOR EXAMPLE ON LOGISTIC REGRESSION
####################################################

# The dependent variable: 1 if the return on investment was less than 0, 0 otherwise
DEF = (DT$RR2<0)*1
# The independent variable is the annualized interest rate.
# First, fit the OLS model (not a good option - the linear probability model
# can predict probabilities outside [0,1])
m10 = lm(DEF~int,data=DT)
summary(m10)
# Visualize the relationship
plot(y=DEF,x=DT$int,pch=19,cex=0.50,xlab='Annualized interest rate [%]',ylab='(Probability of) default')
# Plot the regression line
abline(m10,col='red',lwd=2)

# Now estimate a logit model
m11 = glm(formula = DEF ~ int, family = binomial(link = "logit"),data = DT)
summary(m11)
# Plotting the S-curve
yfit=fitted(m11)
points(y=yfit,x=DT$int,col='blue',pch=19,cex=0.25)

# Visualizing how a unit change in the interest rate influences the probability
interest = seq(from=0,to=250,by=1)
B0=coefficients(m11)[1]
B1=coefficients(m11)[2]
fpi = (exp(B0+B1*interest))/(1+exp(B0+B1*interest))
dat = data.frame(fpi,interest); names(dat) = c("FittedProb","Interest")
dat$ME = c(NA,diff(dat$FittedProb))
plot(x=dat$Interest[-1],y=dat$ME[-1],pch=19,cex=0.25,xlab='Interest',ylab="Marginal effect")

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
################################################
# Case study 7: Logistic regression
################################################

# %%%%%%%%%%%%%%%%%%%%%%%%%%
# Model estimation: Logit
# %%%%%%%%%%%%%%%%%%%%%%%%%%

# Storing the dependent variable for each of the samples
tst$DEF = (tst$RR2<0)*1
val$DEF = (val$RR2<0)*1

# Estimating the model
m12 = glm(formula = DEF ~ new+ver3+ver4+lfi+lee+luk+lrs+lsk+age+undG+
            female+lamt+int+durm+educprim+educbasic+
            educvocat+educsec+msmar+msco+mssi+msdi+nrodep+
            espem+esfue+essem+esent+esret+dures+exper+
            linctot+noliab+lliatot+norli+noplo+lamountplo+
            lamntplr+lamteprl+nopearlyrep,
          family = binomial(link = "logit"),data = tst)
summary(m12)

# %%%%%%%%%%%%%%%%%%%%%%%%%%
# Prediction of defaults
# %%%%%%%%%%%%%%%%%%%%%%%%%%

# Now we perform the prediction on the validation sample.
# Predicted log odds
ypred=predict(m12,new=val)
# Predicted probabilities
ppred=exp(ypred)/(1+exp(ypred))

# Comparing model predictions with the truth
true = val$DEF
# Let's code the response as follows: if the predicted probability is above 0.5,
# we assume that the loan is heading for a default (1), 0 otherwise.
predicted = as.numeric((ppred>0.5)*1)
# We create a contingency table (predictions in rows, truth in columns)
TBL = table(predicted,true)
TBL

# We can calculate the:
# Accuracy of predictions (share on the diagonal)
sum(diag(TBL))/sum(TBL)
# Sensitivity (true positive rate: correctly predicted defaults)
TBL[2,2]/sum(TBL[,2])
# Specificity (true negative rate: correctly predicted non-defaults)
TBL[1,1]/sum(TBL[,1])
# High accuracy driven by high specificity but low sensitivity: the model
# predicts non-defaults well, but defaults poorly. What to do?

# We could change the threshold (in-sample approach now)
ths = seq(from=0.01,to=0.99,by=0.01)
logit_eval = matrix(NA,nrow=length(ths),ncol=4)
logit_eval[,4] = ths
for (i in 1:length(ths)) {
  predicted = as.numeric((ppred>ths[i])*1)
  # We create a table for that
  TBL = table(predicted,true)
  if (dim(TBL)[1]==1 | dim(TBL)[2]==1) next
  # Accuracy of predictions (on the diagonal), in %
  logit_eval[i,1]=100*sum(diag(TBL))/sum(TBL)
  # Sensitivity, in %
  logit_eval[i,2]=100*TBL[2,2]/sum(TBL[,2])
  # Specificity, in %
  logit_eval[i,3]=100*TBL[1,1]/sum(TBL[,1])
}

# And visualize how accuracy, sensitivity & specificity change with the threshold
plot(x=ths,y=logit_eval[,1],pch=19,cex=0.25,ylim=c(0,100),xlim=c(-0.2,1),xlab='Threshold',ylab='Forecast evaluation')
points(x=ths,y=logit_eval[,2],pch=19,cex=0.25,col='red')
points(x=ths,y=logit_eval[,3],pch=19,cex=0.25,col='blue')
legend('bottomleft',legend=c("Accuracy","Sensitivity","Specificity"),
       col=c('black','red','blue'),pch=19,bty='n')
abline(v=0.5,col='black',lty=2)
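# A closing sketch: the same threshold trade-off summarized by an ROC curve,
# computed with base R only (no extra package assumed). TPR = sensitivity and
# FPR = 1 - specificity, traced over all thresholds in 'ths'; the area under
# this curve (AUC) is a common single-number summary of the classifier.
roc_pts = t(sapply(ths, function(th) {
  pred_th = (ppred > th)*1
  c(FPR = sum(pred_th==1 & true==0)/sum(true==0),
    TPR = sum(pred_th==1 & true==1)/sum(true==1))
}))
plot(roc_pts[,'FPR'],roc_pts[,'TPR'],type='b',pch=19,cex=0.5,
     xlab='False positive rate',ylab='True positive rate',main='ROC curve')
abline(a=0,b=1,lty=2)   # the 45-degree line of a no-skill classifier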