DT = read.csv(file='C:\\Users\\EU\\Dropbox\\CNB FIN-TECH\\Prague\\FinTechP2P.csv') # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # The goal is to verify, whether complex network-wise relationships are useful in # predicting loan defaults # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # To evaluate the benefits of using network-wise variables, we need to re-estimate # logistic regression model from previous session. We first estimate: # 1) Logistic model (1a - estimate model, 1b - forecast) # 2) LASSO Logistic model (2a - estimate model, 2b - forecast) # 3) We create 'factor network' and extract variables # 4) We estimate Logistic and LASSO with network-wise variables (4a - estimate model, 4b - forecast) # 5) We compare the forecasting accuracy using auc (and plots) # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # %%%%%%%%%%%%%%%%%%%%%%%%% # Sample spliting # %%%%%%%%%%%%%%%%%%%%%%%%% # Number of forecasted loans: Loans where we want to predict whether they default. NF = 500 # Overall number of observations N = dim(DT)[1] # Storing the dependent variables for each of the samples DT$DEF = (DT$RR2<0)*1 # The sample to use to estimate the model S1 = DT[1:(N-NF),] # The sample to use to predict (out-of-sample) loan return S2 = DT[(N-NF+1):N,] # %%%%%%%%%%%%%%%%%%%%%%%%% # 1) Standard Logit model # %%%%%%%%%%%%%%%%%%%%%%%%% # 1a) Estimating the standard model m4 = glm(formula = DEF ~ new+ver3+ver4+lfi+lee+luk+lrs+lsk+age+undG+ female+lamt+int+durm+educprim+educbasic+ educvocat+educsec+msmar+msco+mssi+msdi+nrodep+ espem+esfue+essem+esent+esret+dures+exper+ linctot+noliab+lliatot+norli+noplo+lamountplo+ lamntplr+lamteprl+nopearlyrep, family = binomial(link = "logit"),data = S1) summary(m4) # 1b) Predicting defaults # TPR - true positive rate; number of correct predicitons of default divided by total number of defaults # FPR - false positive rate; number of false predictions of default divided by total number of nondefaults ypred = predict(m4,new=S2) ypred = exp(ypred)/(1+exp(ypred)) ytrue = S2$DEF # install.packages("pROC") # If not already installed library(pROC) roc_obj <- roc(ytrue, ypred) plot(roc_obj, xlab = "False positive rate", ylab = "True positive rate", legacy.axes=TRUE) LOGIT = auc(roc_obj) # %%%%%%%%%%%%%%%%%%%%%%%%% # 2) LASSO Logit model # %%%%%%%%%%%%%%%%%%%%%%%%% library(glmnet) # We need a matrix of independent variables: for-testing sample indep = as.matrix(DT[1:(N-NF),c("new","ver3","ver4","lfi","lee","luk","lrs","lsk","age", "undG","female","lamt","int","durm","educprim","educbasic", "educvocat","educsec","msmar","msco","mssi","msdi","nrodep", "espem","esfue","essem","esent","esret","dures","exper", "linctot","noliab","lliatot","norli","noplo","lamountplo", "lamntplr","lamteprl","nopearlyrep")]) dep = DT[1:(N-NF),"DEF"] # Variables for the predicted loans pred = as.matrix(DT[(N-NF+1):N,c("new","ver3","ver4","lfi","lee","luk","lrs","lsk","age", "undG","female","lamt","int","durm","educprim","educbasic", "educvocat","educsec","msmar","msco","mssi","msdi","nrodep", "espem","esfue","essem","esent","esret","dures","exper", "linctot","noliab","lliatot","norli","noplo","lamountplo", "lamntplr","lamteprl","nopearlyrep")]) ytrue = DT[(N-NF+1):N,"DEF"] # 2a) Estimating the model m5 = cv.glmnet(x=indep, y=dep, type.measure="auc",alpha=1,family='binomial') coef(m5, s = "lambda.1se") # 2b) Predicting defaults ypred = predict(m5,newx=pred,s=m5$lambda.1se) ypred = exp(ypred)/(1+exp(ypred)) roc_obj <- roc(ytrue, ypred) lines(roc_obj,col='red') LOGIT_LASSO = auc(roc_obj) # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # 3) Create factor network and its variables # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% X = DT[,c("int","durm","linctot","noliab")] # The following fuction performs the: # a) decomposition of the set of variables into factors # b) select's the number of factors # c) creates adjacency matrix FN_SVD = function(X,p=0.95,gam=0.01) { # Standardize Dataset X <- sweep(X, 2, colMeans(X), "-") X <- sweep(X, 2, apply(X, 2 , sd), "/") nvar <- ncol(X) nobs <- nrow(X) cx <- sweep(X, 2, colMeans(X), "-") sv <- svd(cx) U <- sv$u D <- sv$d eig_values <- sv$d^2/(nobs - 1) variance.explained <- 100*prop.table(sv$d^2) cumperc <- cumsum(variance.explained) EigenTab <- round(cbind(eig_values, variance.explained, cumperc), 2) idk <- which(cumperc >= p*100) k <- idk[1] # Obtain Factors Fx <- U[,1:k]%*%diag(D[1:k])/sqrt(nvar) # Now working on the adjacency matrix theta <- qnorm(2/(nobs-1)) #theta <- 0 EZ <- Fx %*% t(Fx) + theta # Link prediction P_G <- pnorm(EZ, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE) # set the diagonal do zero diag(P_G) <- 0 # Where-ever the condition applies give 1, 0 otherwise G <- 1*(P_G > gam) return(G) } AM = FN_SVD(X,p=0.75,gam=0.10) g = graph_from_adjacency_matrix(AM, mode = "undirected", weighted = TRUE) # we define the graph BVC <- function(g) { library(igraph) tmp = 1/shortest.paths(g, mode = "all", weights = NULL) diag(tmp) = rep(0, dim(tmp)[1]) vertex_centralities = rowSums(tmp) max_centrality = max(vertex_centralities) results = list() centrality = cbind(degree(g),vertex_centralities) results[["VCentrality"]] = centrality com = cluster_louvain(g) #community detection via 'Louvain method' # Number of communities length(unique(com$membership)) # Cummunities with more than one member idx = as.numeric(which(table(com$membership)>1)) cidx = rep(1,length(com$membership)) for (i in 1:length(idx)) cidx[which(com$membership==idx[i])] = i + 1 # kazdej skupine priradime indikatorovu premennu (bude ich vela) CD=dummy(cidx) results[["Community"]] = CD return(results) } NetDscr=BVC(g) # Now add variable into the model: DT$Deg = NetDscr$VCentrality[,1] #vertex degree DT$Hac = NetDscr$VCentrality[,2] #harmonic centrality DT = data.frame(DT,NetDscr$Community) # The sample to use to estimate the model S1 = DT[1:(N-NF),] # The sample to use to predict (out-of-sample) loan return S2 = DT[(N-NF+1):N,] # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # 4) Estimate Logistic & Lasso Logistic models with network variables # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # 4a) Estimating the standard model with network variables m6 = glm(formula = DEF ~ new+ver3+ver4+lfi+lee+luk+lrs+lsk+age+undG+ female+lamt+int+durm+educprim+educbasic+ educvocat+educsec+msmar+msco+mssi+msdi+nrodep+ espem+esfue+essem+esent+esret+dures+exper+ linctot+noliab+lliatot+norli+noplo+lamountplo+ lamntplr+lamteprl+nopearlyrep+Deg+Hac+g1+g2+g3+g4, family = binomial(link = "logit"),data = S1) summary(m6) # 4b) Predicting defaults ypred = predict(m6,new=S2) ypred = exp(ypred)/(1+exp(ypred)) ytrue = S2$DEF roc_obj <- roc(ytrue, ypred) lines(roc_obj,col='green') LOGIT_N = auc(roc_obj) # Estimating the LASSO Logistic model with network variables # We need a matrix of independent variables: for-testing sample indep = as.matrix(DT[1:(N-NF),c("new","ver3","ver4","lfi","lee","luk","lrs","lsk","age", "undG","female","lamt","int","durm","educprim","educbasic", "educvocat","educsec","msmar","msco","mssi","msdi","nrodep", "espem","esfue","essem","esent","esret","dures","exper", "linctot","noliab","lliatot","norli","noplo","lamountplo", "lamntplr","lamteprl","nopearlyrep","Deg","Hac", paste("g",1:4,sep=''))]) dep = DT[1:(N-NF),"DEF"] # Variables for the predicted loans pred = as.matrix(DT[(N-NF+1):N,c("new","ver3","ver4","lfi","lee","luk","lrs","lsk","age", "undG","female","lamt","int","durm","educprim","educbasic", "educvocat","educsec","msmar","msco","mssi","msdi","nrodep", "espem","esfue","essem","esent","esret","dures","exper", "linctot","noliab","lliatot","norli","noplo","lamountplo", "lamntplr","lamteprl","nopearlyrep","Deg","Hac", paste("g",1:4,sep=''))]) ytrue = S2$DEF # 4a) Estimating the model m7 = cv.glmnet(x=indep, y=dep, type.measure="auc",alpha=1,family='binomial') coef(m7, s = "lambda.1se") # 4b) Predicting defaults ypred = predict(m7,newx=pred,s=m7$lambda.1se) ypred = exp(ypred)/(1+exp(ypred)) roc_obj <- roc(ytrue, ypred) lines(roc_obj,col='blue') LOGIT_N_LASSO = auc(roc_obj) # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # 5) Compare forecasting accuracy of competing models # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% AUC = c(LOGIT,LOGIT_LASSO,LOGIT_N,LOGIT_N_LASSO) names(AUC) = c("Logit","Logit-L","Logit-N","Logit-NL") AUC = sort(AUC,decreasing=T) cbind(AUC)