# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # The goal is to verify, whether complex network-wise relationships are useful in # predicting loan returns. # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # We continue with the previous method: adding factor network model variables # We first estimate: # 1) We create a network and network-wise variables: Factor network model # 2) We estimate LASSO, RIDGE, EN models with network-wise variables (6a - estimate model, 6b - forecast) # 3) We compare the forecasting accuracy using mean squared error (MSE) # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% DT = read.csv(file='C:\\Users\\EU\\Dropbox\\CNB FIN-TECH\\Prague\\FinTechP2P.csv') # %%%%%%%%%%%%%%%%%%%%%%%%% # Sample parameters # %%%%%%%%%%%%%%%%%%%%%%%%% # Number of forecasted loans: Loans where we want to predict the return NF = 100 # Overall number of observations N = dim(DT)[1] # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # 1) Create factor network and its variables # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% X = DT[,c("int","durm","linctot","noliab")] # The following fuction performs the: # a) decomposition of the set of variables into factors # b) select's the number of factors # c) creates adjacency matrix FN_SVD = function(X,p=0.95,gam=0.01) { # Standardize Dataset X <- sweep(X, 2, colMeans(X), "-") X <- sweep(X, 2, apply(X, 2 , sd), "/") nvar <- ncol(X) nobs <- nrow(X) cx <- sweep(X, 2, colMeans(X), "-") sv <- svd(cx) U <- sv$u D <- sv$d eig_values <- sv$d^2/(nobs - 1) variance.explained <- 100*prop.table(sv$d^2) cumperc <- cumsum(variance.explained) EigenTab <- round(cbind(eig_values, variance.explained, cumperc), 2) idk <- which(cumperc >= p*100) k <- idk[1] # Obtain Factors Fx <- U[,1:k]%*%diag(D[1:k])/sqrt(nvar) # Now working on the adjacency matrix theta <- qnorm(2/(nobs-1)) #theta <- 0 EZ <- Fx %*% t(Fx) + theta # Link prediction P_G <- pnorm(EZ, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE) # set the diagonal do zero diag(P_G) <- 0 # Where-ever the condition applies give 1, 0 otherwise G <- 1*(P_G > gam) return(G) } AM = FN_SVD(X,p=0.75,gam=0.10) # Create a graph g = graph_from_adjacency_matrix(AM, mode = "undirected", weighted = TRUE) # we define the graph # We can visualize the Network Factor Model # This graph takes some time to create and and is not very nice plot(g, graph = "NFM", vertex.label=NA, vertex.size = 3, main = "Network factor model of the P2P applicants networks") # Not very nice - too complex # The following fuction takes a graph 'g' and performs: # a) calculation of the vertex degree and harmonic centrality # b) community detection via the Louvain method BVC <- function(g) { library(igraph) tmp = 1/shortest.paths(g, mode = "all", weights = NULL) diag(tmp) = rep(0, dim(tmp)[1]) vertex_centralities = rowSums(tmp) max_centrality = max(vertex_centralities) results = list() centrality = cbind(degree(g),vertex_centralities) results[["VCentrality"]] = centrality com = cluster_louvain(g) #community detection via 'Louvain method' # Number of communities length(unique(com$membership)) # Cummunities with more than one member idx = as.numeric(which(table(com$membership)>1)) cidx = rep(1,length(com$membership)) for (i in 1:length(idx)) cidx[which(com$membership==idx[i])] = i + 1 # kazdej skupine priradime indikatorovu premennu (bude ich vela) CD=dummy(cidx) results[["Community"]] = CD return(results) } NetDscr=BVC(g) # Now add variable into the model: DT$Deg = NetDscr$VCentrality[,1] #vertex degree DT$Hac = NetDscr$VCentrality[,2] #harmonic centrality DT = data.frame(DT,NetDscr$Community) # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # 2) Estimate LASSO, RIDGE, EN models with graph-level variables # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # We need a matrix of independent variables: for-testing sample indep = as.matrix(DT[1:(N-NF),c("new","ver3","ver4","lfi","lee","luk","lrs","lsk","age", "undG","female","lamt","int","durm","educprim","educbasic", "educvocat","educsec","msmar","msco","mssi","msdi","nrodep", "espem","esfue","essem","esent","esret","dures","exper", "linctot","noliab","lliatot","norli","noplo","lamountplo", "lamntplr","lamteprl","nopearlyrep","Deg","Hac", paste("g",1:5,sep=''))]) dep = DT[1:(N-NF),"RR2"] # Variables for the predicted loans pred = as.matrix(DT[(N-NF+1):N,c("new","ver3","ver4","lfi","lee","luk","lrs","lsk","age", "undG","female","lamt","int","durm","educprim","educbasic", "educvocat","educsec","msmar","msco","mssi","msdi","nrodep", "espem","esfue","essem","esent","esret","dures","exper", "linctot","noliab","lliatot","norli","noplo","lamountplo", "lamntplr","lamteprl","nopearlyrep","Deg","Hac", paste("g",1:5,sep=''))]) ytrue = DT[(N-NF+1):N,'RR2'] # 3a) Estimate LASSO model m3_L = cv.glmnet(x=indep,y=dep,nfolds=30,alpha=1) coef(m3_L,s='lambda.1se') # 3b) Forecast loan returns yhat = predict(m3_L,newx=pred,s=m3_L$lambda.1se) # Calculate Means squared error LASSO_FN = mean((yhat-ytrue)^2) # 6a) Estimate RIDGE model m3_R = cv.glmnet(x=indep,y=dep,nfolds=30,alpha=1) coef(m3_R,s='lambda.1se') # 6b) Forecast loan returns yhat = predict(m3_R,newx=pred,s=m3_R$lambda.1se) # Calculate Means squared error RIDGE_FN = mean((yhat-ytrue)^2) # 6a) Estimate EN alpha = 0.25 model m3_E25 = cv.glmnet(x=indep,y=dep,nfolds=30,alpha=0.25) coef(m3_E25,s='lambda.1se') # 6b) Forecast loan returns yhat = predict(m3_E25,newx=pred,s=m3_E25$lambda.1se) # Calculate Means squared error EN25FN = mean((yhat-ytrue)^2) # 6a) Estimate EN alpha = 0.50 model m3_E50 = cv.glmnet(x=indep,y=dep,nfolds=30,alpha=0.50) coef(m3_E50,s='lambda.1se') # 6b) Forecast loan returns yhat = predict(m3_E50,newx=pred,s=m3_E50$lambda.1se) # Calculate Means squared error EN50FN = mean((yhat-ytrue)^2) # 6a) Estimate EN alpha = 0.50 model m3_E75 = cv.glmnet(x=indep,y=dep,nfolds=30,alpha=0.75) coef(m3_E75,s='lambda.1se') # 6b) Forecast loan returns yhat = predict(m3_E75,newx=pred,s=m3_E75$lambda.1se) # Calculate Means squared error EN75FN = mean((yhat-ytrue)^2) # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # 3) Compare forecasting accuracy of competing models # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% MSEs = c(OLS,LASSO,RIDGE,EN25,EN50,EN75,LASSO_N,RIDGE_N,EN25N,EN50N,EN75N,LASSO_FN,RIDGE_FN,EN25FN,EN50FN,EN75FN) names(MSEs) = c("OLS","LASSO","RIDGE","EN25","EN50","EN75","LASSO_N","RIDGE_N","EN25N","EN50N","EN75N","LASSO_FN","RIDGE_FN","EN25FN","EN50FN","EN75FN") MSEs = sort(MSEs) cbind(MSEs)