# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# The goal is to verify, whether complex network-wise relationships are useful in
# predicting loan returns.
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# We continue with the previous method: adding factor network model variables
# We first estimate:
# 1) We create a network and network-wise variables: Factor network model
# 2) We estimate LASSO, RIDGE, EN models with network-wise variables (6a - estimate model, 6b - forecast)
# 3) We compare the forecasting accuracy using mean squared error (MSE)
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
DT = read.csv(file='C:\\Users\\EU\\Dropbox\\CNB FIN-TECH\\Prague\\FinTechP2P.csv')

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Sample parameters
# %%%%%%%%%%%%%%%%%%%%%%%%%
# Number of forecasted loans: Loans where we want to predict the return
NF = 100
# Overall number of observations
N = dim(DT)[1]

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# 1) Create factor network and its variables
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

X = DT[,c("int","durm","linctot","noliab")]
# The following fuction performs the:
# a) decomposition of the set of variables into factors
# b) select's the number of factors
# c) creates adjacency matrix
FN_SVD = function(X,p=0.95,gam=0.01) {
  # Standardize Dataset
  X <- sweep(X, 2, colMeans(X), "-")
  X <- sweep(X, 2, apply(X, 2 , sd), "/")
  
  nvar <- ncol(X)
  nobs <- nrow(X)
  cx <- sweep(X, 2, colMeans(X), "-")
  sv <- svd(cx)
  
  U <- sv$u
  D <- sv$d
  
  eig_values <- sv$d^2/(nobs - 1)
  variance.explained <- 100*prop.table(sv$d^2)
  cumperc <- cumsum(variance.explained)
  EigenTab <- round(cbind(eig_values, variance.explained, cumperc), 2)
  idk <- which(cumperc >= p*100)
  k <- idk[1]
  # Obtain Factors
  Fx <- U[,1:k]%*%diag(D[1:k])/sqrt(nvar)
  
  # Now working on the adjacency matrix
  theta <- qnorm(2/(nobs-1))
  #theta <- 0
  EZ <- Fx %*% t(Fx) + theta
  # Link prediction
  P_G <- pnorm(EZ, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
  
  # set the diagonal do zero
  diag(P_G) <- 0
  
  # Where-ever the condition applies give 1, 0 otherwise
  G <- 1*(P_G > gam)
  
  return(G)
}
AM = FN_SVD(X,p=0.75,gam=0.10)
# Create a graph
g = graph_from_adjacency_matrix(AM, mode = "undirected", weighted = TRUE) # we define the graph 
# We can visualize the Network Factor Model
# This graph takes some time to create and and is not very nice
plot(g, graph = "NFM",
     vertex.label=NA, 
     vertex.size = 3, 
     main = "Network factor model of the P2P applicants networks")
# Not very nice - too complex

# The following fuction takes a graph 'g' and performs:
# a) calculation of the vertex degree and harmonic centrality
# b) community detection via the Louvain method

BVC <- function(g) {
  library(igraph)
  tmp = 1/shortest.paths(g, mode = "all", weights = NULL)
  diag(tmp) = rep(0, dim(tmp)[1])
  vertex_centralities = rowSums(tmp)
  max_centrality = max(vertex_centralities)
  results = list()
  centrality = cbind(degree(g),vertex_centralities)
  results[["VCentrality"]] = centrality
  com = cluster_louvain(g) #community detection via 'Louvain method'
  # Number of communities
  length(unique(com$membership))
  # Cummunities with more than one member
  idx = as.numeric(which(table(com$membership)>1))
  cidx = rep(1,length(com$membership))
  for (i in 1:length(idx)) cidx[which(com$membership==idx[i])] = i + 1
  # kazdej skupine priradime indikatorovu premennu (bude ich vela)
  CD=dummy(cidx)
  results[["Community"]] = CD
  return(results)
}
NetDscr=BVC(g)

# Now add variable into the model:
DT$Deg = NetDscr$VCentrality[,1] #vertex degree
DT$Hac = NetDscr$VCentrality[,2] #harmonic centrality
DT = data.frame(DT,NetDscr$Community)

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# 2) Estimate LASSO, RIDGE, EN models with graph-level variables
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# We need a matrix of independent variables: for-testing sample
indep = as.matrix(DT[1:(N-NF),c("new","ver3","ver4","lfi","lee","luk","lrs","lsk","age",
                                "undG","female","lamt","int","durm","educprim","educbasic",
                                "educvocat","educsec","msmar","msco","mssi","msdi","nrodep",
                                "espem","esfue","essem","esent","esret","dures","exper",
                                "linctot","noliab","lliatot","norli","noplo","lamountplo",
                                "lamntplr","lamteprl","nopearlyrep","Deg","Hac",
                                paste("g",1:5,sep=''))])
dep = DT[1:(N-NF),"RR2"]
# Variables for the predicted loans
pred = as.matrix(DT[(N-NF+1):N,c("new","ver3","ver4","lfi","lee","luk","lrs","lsk","age",
                                 "undG","female","lamt","int","durm","educprim","educbasic",
                                 "educvocat","educsec","msmar","msco","mssi","msdi","nrodep",
                                 "espem","esfue","essem","esent","esret","dures","exper",
                                 "linctot","noliab","lliatot","norli","noplo","lamountplo",
                                 "lamntplr","lamteprl","nopearlyrep","Deg","Hac",
                                 paste("g",1:5,sep=''))])
ytrue = DT[(N-NF+1):N,'RR2']

# 3a) Estimate LASSO model
m3_L = cv.glmnet(x=indep,y=dep,nfolds=30,alpha=1)
coef(m3_L,s='lambda.1se')
# 3b) Forecast loan returns
yhat = predict(m3_L,newx=pred,s=m3_L$lambda.1se)
# Calculate Means squared error
LASSO_FN = mean((yhat-ytrue)^2)

# 6a) Estimate RIDGE model
m3_R = cv.glmnet(x=indep,y=dep,nfolds=30,alpha=1)
coef(m3_R,s='lambda.1se')
# 6b) Forecast loan returns
yhat = predict(m3_R,newx=pred,s=m3_R$lambda.1se)
# Calculate Means squared error
RIDGE_FN = mean((yhat-ytrue)^2)

# 6a) Estimate EN alpha = 0.25 model
m3_E25 = cv.glmnet(x=indep,y=dep,nfolds=30,alpha=0.25)
coef(m3_E25,s='lambda.1se')
# 6b) Forecast loan returns
yhat = predict(m3_E25,newx=pred,s=m3_E25$lambda.1se)
# Calculate Means squared error
EN25FN = mean((yhat-ytrue)^2)

# 6a) Estimate EN alpha = 0.50 model
m3_E50 = cv.glmnet(x=indep,y=dep,nfolds=30,alpha=0.50)
coef(m3_E50,s='lambda.1se')
# 6b) Forecast loan returns
yhat = predict(m3_E50,newx=pred,s=m3_E50$lambda.1se)
# Calculate Means squared error
EN50FN = mean((yhat-ytrue)^2)

# 6a) Estimate EN alpha = 0.50 model
m3_E75 = cv.glmnet(x=indep,y=dep,nfolds=30,alpha=0.75)
coef(m3_E75,s='lambda.1se')
# 6b) Forecast loan returns
yhat = predict(m3_E75,newx=pred,s=m3_E75$lambda.1se)
# Calculate Means squared error
EN75FN = mean((yhat-ytrue)^2)

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# 3) Compare forecasting accuracy of competing models
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
MSEs = c(OLS,LASSO,RIDGE,EN25,EN50,EN75,LASSO_N,RIDGE_N,EN25N,EN50N,EN75N,LASSO_FN,RIDGE_FN,EN25FN,EN50FN,EN75FN)
names(MSEs) = c("OLS","LASSO","RIDGE","EN25","EN50","EN75","LASSO_N","RIDGE_N","EN25N","EN50N","EN75N","LASSO_FN","RIDGE_FN","EN25FN","EN50FN","EN75FN")
MSEs = sort(MSEs)
cbind(MSEs)