DT = read.csv(file='C:\\Users\\EU\\Dropbox\\CNB FIN-TECH\\Prague\\FinTechP2P.csv')
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# The goal is to verify, whether complex network-wise relationships are useful in
# predicting loan defaults
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# To evaluate the benefits of using network-wise variables, we need to re-estimate
# logistic regression model from previous session. We first estimate:
# 1) Logistic model (1a - estimate model, 1b - forecast)
# 2) LASSO Logistic model (2a - estimate model, 2b - forecast)
# 3) We create 'factor network' and extract variables
# 4) We estimate Logistic and LASSO with network-wise variables (4a - estimate model, 4b - forecast)
# 5) We compare the forecasting accuracy using auc (and plots)
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Sample spliting
# %%%%%%%%%%%%%%%%%%%%%%%%%
# Number of forecasted loans: Loans where we want to predict whether they default.
NF = 500
# Overall number of observations
N = dim(DT)[1]
# Storing the dependent variables for each of the samples
DT$DEF = (DT$RR2<0)*1
# The sample to use to estimate the model
S1 = DT[1:(N-NF),]
# The sample to use to predict (out-of-sample) loan return
S2 = DT[(N-NF+1):N,]

# %%%%%%%%%%%%%%%%%%%%%%%%%
# 1) Standard Logit model
# %%%%%%%%%%%%%%%%%%%%%%%%%

# 1a) Estimating the standard model
m4 = glm(formula = DEF ~ new+ver3+ver4+lfi+lee+luk+lrs+lsk+age+undG+
            female+lamt+int+durm+educprim+educbasic+
            educvocat+educsec+msmar+msco+mssi+msdi+nrodep+
            espem+esfue+essem+esent+esret+dures+exper+
            linctot+noliab+lliatot+norli+noplo+lamountplo+
            lamntplr+lamteprl+nopearlyrep,
          family = binomial(link = "logit"),data = S1)
summary(m4)

# 1b) Predicting defaults
# TPR - true positive rate; number of correct predicitons of default divided by total number of defaults
# FPR - false positive rate; number of false predictions of default divided by total number of nondefaults
ypred = predict(m4,new=S2)
ypred = exp(ypred)/(1+exp(ypred))
ytrue = S2$DEF
# install.packages("pROC") # If not already installed
library(pROC)
roc_obj <- roc(ytrue, ypred)
plot(roc_obj, xlab = "False positive rate", ylab = "True positive rate", legacy.axes=TRUE)
LOGIT = auc(roc_obj)

# %%%%%%%%%%%%%%%%%%%%%%%%%
# 2) LASSO Logit model
# %%%%%%%%%%%%%%%%%%%%%%%%%
library(glmnet)
# We need a matrix of independent variables: for-testing sample
indep = as.matrix(DT[1:(N-NF),c("new","ver3","ver4","lfi","lee","luk","lrs","lsk","age",
                                "undG","female","lamt","int","durm","educprim","educbasic",
                                "educvocat","educsec","msmar","msco","mssi","msdi","nrodep",
                                "espem","esfue","essem","esent","esret","dures","exper",
                                "linctot","noliab","lliatot","norli","noplo","lamountplo",
                                "lamntplr","lamteprl","nopearlyrep")])
dep = DT[1:(N-NF),"DEF"]
# Variables for the predicted loans
pred = as.matrix(DT[(N-NF+1):N,c("new","ver3","ver4","lfi","lee","luk","lrs","lsk","age",
                                 "undG","female","lamt","int","durm","educprim","educbasic",
                                 "educvocat","educsec","msmar","msco","mssi","msdi","nrodep",
                                 "espem","esfue","essem","esent","esret","dures","exper",
                                 "linctot","noliab","lliatot","norli","noplo","lamountplo",
                                 "lamntplr","lamteprl","nopearlyrep")])
ytrue = DT[(N-NF+1):N,"DEF"]

# 2a) Estimating the model
m5 = cv.glmnet(x=indep, y=dep, type.measure="auc",alpha=1,family='binomial') 
coef(m5, s = "lambda.1se")
# 2b) Predicting defaults
ypred = predict(m5,newx=pred,s=m5$lambda.1se)
ypred = exp(ypred)/(1+exp(ypred))
roc_obj <- roc(ytrue, ypred)
lines(roc_obj,col='red')
LOGIT_LASSO = auc(roc_obj)

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# 3) Create factor network and its variables
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

X = DT[,c("int","durm","linctot","noliab")]
# The following fuction performs the:
# a) decomposition of the set of variables into factors
# b) select's the number of factors
# c) creates adjacency matrix
FN_SVD = function(X,p=0.95,gam=0.01) {
  # Standardize Dataset
  X <- sweep(X, 2, colMeans(X), "-")
  X <- sweep(X, 2, apply(X, 2 , sd), "/")
  
  nvar <- ncol(X)
  nobs <- nrow(X)
  cx <- sweep(X, 2, colMeans(X), "-")
  sv <- svd(cx)
  
  U <- sv$u
  D <- sv$d
  
  eig_values <- sv$d^2/(nobs - 1)
  variance.explained <- 100*prop.table(sv$d^2)
  cumperc <- cumsum(variance.explained)
  EigenTab <- round(cbind(eig_values, variance.explained, cumperc), 2)
  idk <- which(cumperc >= p*100)
  k <- idk[1]
  # Obtain Factors
  Fx <- U[,1:k]%*%diag(D[1:k])/sqrt(nvar)
  
  # Now working on the adjacency matrix
  theta <- qnorm(2/(nobs-1))
  #theta <- 0
  EZ <- Fx %*% t(Fx) + theta
  # Link prediction
  P_G <- pnorm(EZ, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
  
  # set the diagonal do zero
  diag(P_G) <- 0
  
  # Where-ever the condition applies give 1, 0 otherwise
  G <- 1*(P_G > gam)
  
  return(G)
}
AM = FN_SVD(X,p=0.75,gam=0.10)
g = graph_from_adjacency_matrix(AM, mode = "undirected", weighted = TRUE) # we define the graph 
BVC <- function(g) {
  library(igraph)
  tmp = 1/shortest.paths(g, mode = "all", weights = NULL)
  diag(tmp) = rep(0, dim(tmp)[1])
  vertex_centralities = rowSums(tmp)
  max_centrality = max(vertex_centralities)
  results = list()
  centrality = cbind(degree(g),vertex_centralities)
  results[["VCentrality"]] = centrality
  com = cluster_louvain(g) #community detection via 'Louvain method'
  # Number of communities
  length(unique(com$membership))
  # Cummunities with more than one member
  idx = as.numeric(which(table(com$membership)>1))
  cidx = rep(1,length(com$membership))
  for (i in 1:length(idx)) cidx[which(com$membership==idx[i])] = i + 1
  # kazdej skupine priradime indikatorovu premennu (bude ich vela)
  CD=dummy(cidx)
  results[["Community"]] = CD
  return(results)
}
NetDscr=BVC(g)
# Now add variable into the model:
DT$Deg = NetDscr$VCentrality[,1] #vertex degree
DT$Hac = NetDscr$VCentrality[,2] #harmonic centrality
DT = data.frame(DT,NetDscr$Community)

# The sample to use to estimate the model
S1 = DT[1:(N-NF),]
# The sample to use to predict (out-of-sample) loan return
S2 = DT[(N-NF+1):N,]

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# 4) Estimate Logistic & Lasso Logistic models with network variables
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# 4a) Estimating the standard model with network variables
m6 = glm(formula = DEF ~ new+ver3+ver4+lfi+lee+luk+lrs+lsk+age+undG+
           female+lamt+int+durm+educprim+educbasic+
           educvocat+educsec+msmar+msco+mssi+msdi+nrodep+
           espem+esfue+essem+esent+esret+dures+exper+
           linctot+noliab+lliatot+norli+noplo+lamountplo+
           lamntplr+lamteprl+nopearlyrep+Deg+Hac+g1+g2+g3+g4,
         family = binomial(link = "logit"),data = S1)
summary(m6)

# 4b) Predicting defaults
ypred = predict(m6,new=S2)
ypred = exp(ypred)/(1+exp(ypred))
ytrue = S2$DEF
roc_obj <- roc(ytrue, ypred)
lines(roc_obj,col='green')
LOGIT_N = auc(roc_obj)

# Estimating the LASSO Logistic model with network variables

# We need a matrix of independent variables: for-testing sample
indep = as.matrix(DT[1:(N-NF),c("new","ver3","ver4","lfi","lee","luk","lrs","lsk","age",
                                "undG","female","lamt","int","durm","educprim","educbasic",
                                "educvocat","educsec","msmar","msco","mssi","msdi","nrodep",
                                "espem","esfue","essem","esent","esret","dures","exper",
                                "linctot","noliab","lliatot","norli","noplo","lamountplo",
                                "lamntplr","lamteprl","nopearlyrep","Deg","Hac",
                                paste("g",1:4,sep=''))])
dep = DT[1:(N-NF),"DEF"]
# Variables for the predicted loans
pred = as.matrix(DT[(N-NF+1):N,c("new","ver3","ver4","lfi","lee","luk","lrs","lsk","age",
                                 "undG","female","lamt","int","durm","educprim","educbasic",
                                 "educvocat","educsec","msmar","msco","mssi","msdi","nrodep",
                                 "espem","esfue","essem","esent","esret","dures","exper",
                                 "linctot","noliab","lliatot","norli","noplo","lamountplo",
                                 "lamntplr","lamteprl","nopearlyrep","Deg","Hac",
                                 paste("g",1:4,sep=''))])
ytrue = S2$DEF

# 4a) Estimating the model
m7 = cv.glmnet(x=indep, y=dep, type.measure="auc",alpha=1,family='binomial') 
coef(m7, s = "lambda.1se")
# 4b) Predicting defaults
ypred = predict(m7,newx=pred,s=m7$lambda.1se)
ypred = exp(ypred)/(1+exp(ypred))
roc_obj <- roc(ytrue, ypred)
lines(roc_obj,col='blue')
LOGIT_N_LASSO = auc(roc_obj)

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# 5) Compare forecasting accuracy of competing models
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
AUC = c(LOGIT,LOGIT_LASSO,LOGIT_N,LOGIT_N_LASSO)
names(AUC) = c("Logit","Logit-L","Logit-N","Logit-NL")
AUC = sort(AUC,decreasing=T)
cbind(AUC)