# Start from a clean session.
# WARNING: the following command clears the whole workspace!
rm(list = ls())
# Close the current plot device, but only if one is actually open: calling
# dev.off() with no open device raises an error and would stop the script
# here when it is run in a fresh session.
if (!is.null(dev.list())) {
  dev.off()
}
# We load the data-file. FinTechP2P is an R object already.
load(file = 'C:\\Users\\EU\\Dropbox\\CNB FIN-TECH\\Prague\\FinTechP2P')
# Alternatively, we could load the *.csv file.
# NOTE(review): both loaders are executed; after this line DT refers to the
# data read from the *.csv file.
DT <- read.csv(file = 'C:\\Users\\EU\\Dropbox\\CNB FIN-TECH\\Prague\\FinTechP2P.csv')
# The name of the dataset is DT. There are 4157 observations and 43 variables
# Each row is a loan
# Each column is a variable
# Variable names are just abbreviations. We can take a look at them.
names(DT)
# For more details about the variables, see Case study 3 from the previous session

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# The goal is to verify whether complex network-wise relationships are useful in
# predicting loan returns.
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# To evaluate the benefits of using network-wise variables, we need to re-estimate
# models from previous session. We first estimate:
# 1) OLS model (1a - estimate model, 1b - forecast)
# 2) LASSO (2a - estimate model, 2b - forecast)
# 3) RIDGE (3a - estimate model, 3b - forecast)
# 4) Elastic net (EN) with alpha = 0.25, 0.50, 0.75 (4a - estimate model, 4b - forecast)
# 5) We create a network and network-wise variables
# 6) We estimate LASSO, RIDGE, EN models with network-wise variables (6a - estimate model, 6b - forecast)
# 7) We compare the forecasting accuracy using mean squared error (MSE)
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Sample splitting
# %%%%%%%%%%%%%%%%%%%%%%%%%
# Size of the hold-out set: the loans whose return we want to predict
NF <- 100
# Total number of loans (rows) in the dataset
N <- nrow(DT)
# Estimation sample: every loan except the last NF
S1 <- DT[1:(N - NF), ]
# Out-of-sample (prediction) sample: the last NF loans
S2 <- DT[(N - NF + 1):N, ]

# %%%%%%%%%%%%%%%%%%%%%%%%%
# 1) OLS model
# %%%%%%%%%%%%%%%%%%%%%%%%%
# 1a) Estimate the model on the estimation sample S1
m1 <- lm(RR2 ~ new + ver3 + ver4 + lfi + lee + luk + lrs + lsk + age + undG +
           female + lamt + int + durm + educprim + educbasic +
           educvocat + educsec + msmar + msco + mssi + msdi + nrodep +
           espem + esfue + essem + esent + esret + dures + exper +
           linctot + noliab + lliatot + norli + noplo + lamountplo +
           lamntplr + lamteprl + nopearlyrep, data = S1)
# 1b) Forecast loan returns for the hold-out sample S2.
# The argument is spelled out as `newdata`: the abbreviated `new=` only works
# through partial argument matching and is easily confused with the
# predictor that is also called `new`.
yhat <- predict(m1, newdata = S2)
# Calculate the mean squared error of the forecasts
ytrue <- S2[, "RR2"]
OLS <- mean((yhat - ytrue)^2)
# Scatter plot of realized vs. predicted returns (OLS in red); both axes share
# the same range so that the 45-degree line below is meaningful.
plot(y = ytrue, x = yhat, pch = 19, cex = 0.5,
     ylim = c(min(yhat, ytrue), max(yhat, ytrue)), col = 'red',
     xlim = c(min(yhat, ytrue), max(yhat, ytrue)),
     xlab = 'Predicted returns', ylab = 'Realized returns')
# 45-degree reference line: points on it are perfectly predicted
lines(x = c(-100, 100), y = c(-100, 100), lty = 1, lwd = 2, col = 'black')

# %%%%%%%%%%%%%%%%%%%%%%%%%
# 2) LASSO model
# %%%%%%%%%%%%%%%%%%%%%%%%%
# install.packages('glmnet') # Install only if you have not installed the package before
library(glmnet)
# 2a) Estimate model
# glmnet works on a numeric matrix of regressors, not on a formula
indep <- as.matrix(
  S1[, c("new", "ver3", "ver4", "lfi", "lee", "luk", "lrs", "lsk", "age",
         "undG", "female", "lamt", "int", "durm", "educprim", "educbasic",
         "educvocat", "educsec", "msmar", "msco", "mssi", "msdi", "nrodep",
         "espem", "esfue", "essem", "esent", "esret", "dures", "exper",
         "linctot", "noliab", "lliatot", "norli", "noplo", "lamountplo",
         "lamntplr", "lamteprl", "nopearlyrep")]
)
dep <- S1$RR2
# Cross-validated LASSO (alpha = 1) with 30 folds
m2 <- cv.glmnet(x = indep, y = dep, alpha = 1, nfolds = 30)
# 2b) Forecast loan returns
# Regressor matrix for the hold-out loans - reused by the later models.
# The same columns, in the same order, as in the estimation matrix.
pred <- as.matrix(S2[, colnames(indep)])
yhat <- predict(m2, newx = pred, s = m2$lambda.1se)
# Calculate the mean squared error
ytrue <- S2$RR2
LASSO <- mean((yhat - ytrue)^2)
# Add the LASSO forecasts (blue) to the existing scatter plot
points(x = yhat, y = ytrue, col = 'blue', pch = 19, cex = 0.5)

# %%%%%%%%%%%%%%%%%%%%%%%%%
# 3) RIDGE model
# %%%%%%%%%%%%%%%%%%%%%%%%%
# 3a) Estimate model: cross-validated ridge regression (alpha = 0), 30 folds
m3 <- cv.glmnet(x = indep, y = dep, alpha = 0, nfolds = 30)
# 3b) Forecast loan returns for the hold-out loans
yhat <- predict(m3, newx = pred, s = m3$lambda.1se)
# Calculate the mean squared error
ytrue <- S2$RR2
RIDGE <- mean((yhat - ytrue)^2)
# Add the RIDGE forecasts (brown) to the existing scatter plot
points(x = yhat, y = ytrue, col = 'brown', pch = 19, cex = 0.5)

# %%%%%%%%%%%%%%%%%%%%%%%%%
# 4) Elastic net model
# %%%%%%%%%%%%%%%%%%%%%%%%%
# 4a) Estimate model: one cross-validated fit per mixing weight alpha
m4_25 <- cv.glmnet(x = indep, y = dep, alpha = 0.25, nfolds = 30)
m4_50 <- cv.glmnet(x = indep, y = dep, alpha = 0.50, nfolds = 30)
m4_75 <- cv.glmnet(x = indep, y = dep, alpha = 0.75, nfolds = 30)
# 4b) Forecast loan returns and compute the mean squared error of each fit
ytrue <- S2$RR2
# alpha = 0.25
yhat <- predict(m4_25, newx = pred, s = m4_25$lambda.1se)
EN25 <- mean((yhat - ytrue)^2)
# alpha = 0.50
yhat <- predict(m4_50, newx = pred, s = m4_50$lambda.1se)
EN50 <- mean((yhat - ytrue)^2)
# alpha = 0.75
yhat <- predict(m4_75, newx = pred, s = m4_75$lambda.1se)
EN75 <- mean((yhat - ytrue)^2)
# Only the last set of forecasts (alpha = 0.75) is added to the plot, in green
points(x = yhat, y = ytrue, col = 'green', pch = 19, cex = 0.5)

# Collect the MSEs of all models so far, smallest (best) first,
# and print them as a one-column table
MSEs <- sort(c(OLS = OLS, LASSO = LASSO, RIDGE = RIDGE,
               EN25 = EN25, EN50 = EN50, EN75 = EN75))
cbind(MSEs)

# %%%%%%%%%%%%%%%%%%%%%%%%%
# 5) Network & variables
# %%%%%%%%%%%%%%%%%%%%%%%%%

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# 5a) We select variables that we want to use to create a distance matrix
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
DMV <- DT[, c("int", "durm", "linctot", "noliab")]
# Pairwise Euclidean distances between loans, computed on standardized
# variables so that no single variable dominates because of its scale
DM <- as.matrix(dist(scale(DMV)))
DM[1:5, 1:5]
hist(DM, breaks = 200, probability = TRUE, main = 'Distribution of distances')

# 5b) We first create a network for the whole sample
# install.packages('igraph') # Install only if you have not installed it before
library(igraph)
# The distance matrix is used as a weighted, undirected adjacency matrix
g <- graph_from_adjacency_matrix(DM, mode = "undirected", weighted = TRUE) # we define the graph
# As this is a complete graph, we'll extract a sub-graph
# Minimum spanning tree is a possibility
# This might take a while - do not worry
g_mst <- mst(g)

# Visualize the 'Minimum Spanning Tree'
# Now let's plot the graph, highlighting loans where the return was negative (default)
# The graph was built from all rows of DT, so the status vector must also
# come from DT (one value per vertex). Taking it from the estimation sample
# S1 would give a vector that is NF elements too short for the graph.
status <- (DT$RR2 < 0) * 1
V(g_mst)$status <- status
V(g_mst)[status == 1]$color <- "firebrick1" # color loans with negative return red
V(g_mst)[status == 0]$color <- "lightgreen" # color the remaining loans green
# We create a plot
# NOTE(review): `graph = "MST"` is kept exactly as in the original call;
# confirm that plot.igraph actually uses this argument.
plot(g_mst, graph = "MST",
     vertex.label = NA,
     vertex.size = 3,
     main = "MST of the P2P applicants networks")

# Generate vertex level variables (one value per loan, i.e. per row of DT)
DT$Deg <- igraph::degree(g_mst) # degree centrality
DT$Str <- igraph::strength(g_mst) # strength centrality
DT$Clos <- igraph::closeness(g_mst) * 10^4 # closeness centrality, rescaled by 10^4
DT$Bet <- igraph::betweenness(g_mst) # betweenness centrality
com <- cluster_louvain(g_mst) # community detection via 'Louvain method'
# How many communities do we have?
length(unique(com$membership))
# We know which loan belongs to which community - now we need to define
# community specific variables - dummies, one new variable per community
# (the count is printed above; the original run of this script reported 124)
# install.packages('dummies') # if not installed - install
library(dummies)
CD <- dummy(com$membership)
DT <- data.frame(DT, CD)

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# 6) Estimate LASSO, RIDGE, EN models with graph-level variables
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Names of the community dummies are taken from the dummy matrix CD itself
# instead of being hard-coded as membership1..membership124: the number of
# detected communities is data dependent, and a hard-coded count fails with
# "undefined columns selected" as soon as it differs from 124.
community_vars <- colnames(CD)
stopifnot(all(community_vars %in% names(DT)))
# Full regressor set: loan/borrower variables, vertex-level centralities
# and the community dummies
model_vars <- c("new", "ver3", "ver4", "lfi", "lee", "luk", "lrs", "lsk", "age",
                "undG", "female", "lamt", "int", "durm", "educprim", "educbasic",
                "educvocat", "educsec", "msmar", "msco", "mssi", "msdi", "nrodep",
                "espem", "esfue", "essem", "esent", "esret", "dures", "exper",
                "linctot", "noliab", "lliatot", "norli", "noplo", "lamountplo",
                "lamntplr", "lamteprl", "nopearlyrep", "Deg", "Str", "Clos", "Bet",
                community_vars)
# Matrix of independent variables for the estimation sample (first N-NF loans)
indep <- as.matrix(DT[1:(N - NF), model_vars])
dep <- DT[1:(N - NF), "RR2"]
# The same variables for the predicted loans (last NF loans)
pred <- as.matrix(DT[(N - NF + 1):N, model_vars])
ytrue <- S2$RR2

# 6a) Estimate LASSO model (alpha = 1)
m5_L <- cv.glmnet(x = indep, y = dep, nfolds = 30, alpha = 1)
coef(m5_L, s = 'lambda.1se')
# 6b) Forecast loan returns
yhat <- predict(m5_L, newx = pred, s = m5_L$lambda.1se)
# Calculate the mean squared error
LASSO_N <- mean((yhat - ytrue)^2)

# 6a) Estimate RIDGE model
# alpha = 0 selects the ridge penalty, as in step 3. With alpha = 1 this
# model would be a second LASSO fit and RIDGE_N would not measure ridge
# regression at all.
m5_R <- cv.glmnet(x = indep, y = dep, nfolds = 30, alpha = 0)
coef(m5_R, s = 'lambda.1se')
# 6b) Forecast loan returns
yhat <- predict(m5_R, newx = pred, s = m5_R$lambda.1se)
# Calculate the mean squared error
RIDGE_N <- mean((yhat - ytrue)^2)

# 6a) Estimate EN alpha = 0.25 model
m5_E25 <- cv.glmnet(x = indep, y = dep, nfolds = 30, alpha = 0.25)
coef(m5_E25, s = 'lambda.1se')
# 6b) Forecast loan returns
yhat <- predict(m5_E25, newx = pred, s = m5_E25$lambda.1se)
# Calculate the mean squared error
EN25N <- mean((yhat - ytrue)^2)

# 6a) Estimate EN alpha = 0.50 model
m5_E50 <- cv.glmnet(x = indep, y = dep, nfolds = 30, alpha = 0.50)
coef(m5_E50, s = 'lambda.1se')
# 6b) Forecast loan returns
yhat <- predict(m5_E50, newx = pred, s = m5_E50$lambda.1se)
# Calculate the mean squared error
EN50N <- mean((yhat - ytrue)^2)

# 6a) Estimate EN alpha = 0.75 model
m5_E75 <- cv.glmnet(x = indep, y = dep, nfolds = 30, alpha = 0.75)
coef(m5_E75, s = 'lambda.1se')
# 6b) Forecast loan returns
yhat <- predict(m5_E75, newx = pred, s = m5_E75$lambda.1se)
# Calculate the mean squared error
EN75N <- mean((yhat - ytrue)^2)

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# 7) Compare forecasting accuracy of competing models
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Out-of-sample MSE of every model; the "_N" / "N" suffix marks the variants
# that also use the network-wise variables. Sorted so the best model is first
# and printed as a one-column table.
MSEs <- sort(c(OLS = OLS, LASSO = LASSO, RIDGE = RIDGE,
               EN25 = EN25, EN50 = EN50, EN75 = EN75,
               LASSO_N = LASSO_N, RIDGE_N = RIDGE_N,
               EN25N = EN25N, EN50N = EN50N, EN75N = EN75N))
cbind(MSEs)