# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# P2P loan returns: do network-wise variables improve forecasts?
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# The following command clears the workspace!
rm(list = ls())

# This will clear the plot device.
# dev.off() raises an error when no device is open, so only call it if needed.
if (!is.null(dev.list())) {
  dev.off()
}

# We load the data-file. FinTechP2P is an R object already.
load(file = "C:\\Users\\EU\\Dropbox\\CNB FIN-TECH\\Prague\\FinTechP2P")

# Alternatively, we could load the *.csv file
DT <- read.csv(file = "C:\\Users\\EU\\Dropbox\\CNB FIN-TECH\\Prague\\FinTechP2P.csv")

# The name of the dataset is DT. There are 4157 observations and 43 variables
#   Row is a loan
#   Column is a variable
# Variable names are just abbreviations. We can take a look at them.
names(DT)
# For more details about the variables see Case study 3 from the previous session

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# The goal is to verify whether complex network-wise relationships are useful in
# predicting loan returns.
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# To evaluate the benefits of using network-wise variables, we need to
# re-estimate models from the previous session. We first estimate:
#   1) OLS model (1a - estimate model, 1b - forecast)
#   2) LASSO (2a - estimate model, 2b - forecast)
#   3) RIDGE (3a - estimate model, 3b - forecast)
#   4) Elastic net (EN) with alpha = 0.25, 0.50, 0.75
#      (4a - estimate model, 4b - forecast)
#   5) We create a network and network-wise variables
#   6) We estimate LASSO, RIDGE, EN models with network-wise variables
#      (6a - estimate model, 6b - forecast)
#   7) We compare the forecasting accuracy using mean squared error (MSE)
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# Explanatory variables shared by every model below. Keeping the list in one
# place guarantees that OLS, LASSO, RIDGE and EN all use the same predictors.
base_vars <- c(
  "new", "ver3", "ver4", "lfi", "lee", "luk", "lrs", "lsk", "age",
  "undG", "female", "lamt", "int", "durm", "educprim", "educbasic",
  "educvocat", "educsec", "msmar", "msco", "mssi", "msdi", "nrodep",
  "espem", "esfue", "essem", "esent", "esret", "dures", "exper",
  "linctot", "noliab", "lliatot", "norli", "noplo", "lamountplo",
  "lamntplr", "lamteprl", "nopearlyrep"
)

# Mean squared error between predicted and realized returns.
mse <- function(yhat, ytrue) {
  mean((yhat - ytrue)^2)
}

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Sample splitting
# %%%%%%%%%%%%%%%%%%%%%%%%%
# Number of forecasted loans: loans where we want to predict the return
NF <- 100
# Overall number of observations
N <- nrow(DT)
# The split below only makes sense when some loans are left for estimation.
stopifnot(N > NF)
# The sample used to estimate the model
S1 <- DT[1:(N - NF), ]
# The sample used to predict (out-of-sample) loan returns
S2 <- DT[(N - NF + 1):N, ]

# %%%%%%%%%%%%%%%%%%%%%%%%%
# 1) OLS model
# %%%%%%%%%%%%%%%%%%%%%%%%%
# 1a) Estimate model: RR2 regressed on all variables in base_vars
m1 <- lm(reformulate(base_vars, response = "RR2"), data = S1)

# 1b) Forecast loan returns ('newdata' spelled out; no partial matching)
yhat <- predict(m1, newdata = S2)

# Calculate mean squared error
ytrue <- S2[, "RR2"]
OLS <- mse(yhat, ytrue)

# Predicted vs. realized returns; both axes share the same limits so that the
# 45-degree line drawn below marks a perfect forecast.
axis_lim <- c(min(yhat, ytrue), max(yhat, ytrue))
plot(
  y = ytrue, x = yhat, pch = 19, cex = 0.5, ylim = axis_lim, col = "red",
  xlim = axis_lim, xlab = "Predicted returns", ylab = "Realized returns"
)
lines(x = c(-100, 100), y = c(-100, 100), lty = 1, lwd = 2, col = "black")

# %%%%%%%%%%%%%%%%%%%%%%%%%
# 2) LASSO model
# %%%%%%%%%%%%%%%%%%%%%%%%%
# install.packages('glmnet') # Install only if you have not installed the package before
library(glmnet)

# 2a) Estimate model
# We need a matrix of independent variables
indep <- as.matrix(S1[, base_vars])
dep <- S1$RR2
m2 <- cv.glmnet(x = indep, y = dep, nfolds = 30, alpha = 1)

# 2b) Forecast loan returns
# Select variables for prediction purposes - to be used later.
pred <- as.matrix(S2[, base_vars])
yhat <- predict(m2, newx = pred, s = m2$lambda.1se)

# Calculate mean squared error
ytrue <- S2$RR2
LASSO <- mse(yhat, ytrue)
points(y = ytrue, x = yhat, pch = 19, cex = 0.5, col = "blue")

# %%%%%%%%%%%%%%%%%%%%%%%%%
# 3) RIDGE model
# %%%%%%%%%%%%%%%%%%%%%%%%%
# 3a) Estimate model
m3 <- cv.glmnet(x = indep, y = dep, nfolds = 30, alpha = 0)

# 3b) Forecast loan returns
yhat <- predict(m3, newx = pred, s = m3$lambda.1se)

# Calculate mean squared error
ytrue <- S2$RR2
RIDGE <- mse(yhat, ytrue)
points(y = ytrue, x = yhat, pch = 19, cex = 0.5, col = "brown")

# %%%%%%%%%%%%%%%%%%%%%%%%%
# 4) Elastic net model
# %%%%%%%%%%%%%%%%%%%%%%%%%
# 4a) Estimate model
m4_25 <- cv.glmnet(x = indep, y = dep, nfolds = 30, alpha = 0.25)
m4_50 <- cv.glmnet(x = indep, y = dep, nfolds = 30, alpha = 0.50)
m4_75 <- cv.glmnet(x = indep, y = dep, nfolds = 30, alpha = 0.75)

# 4b) Forecast loan returns
ytrue <- S2$RR2
# alpha = 0.25
yhat <- predict(m4_25, newx = pred, s = m4_25$lambda.1se)
EN25 <- mse(yhat, ytrue)
# alpha = 0.50
yhat <- predict(m4_50, newx = pred, s = m4_50$lambda.1se)
EN50 <- mse(yhat, ytrue)
# alpha = 0.75 (only this elastic-net forecast is added to the plot)
yhat <- predict(m4_75, newx = pred, s = m4_75$lambda.1se)
EN75 <- mse(yhat, ytrue)
points(y = ytrue, x = yhat, pch = 19, cex = 0.5, col = "green")

MSEs <- c(OLS, LASSO, RIDGE, EN25, EN50, EN75)
names(MSEs) <- c("OLS", "LASSO", "RIDGE", "EN25", "EN50", "EN75")
MSEs <- sort(MSEs)
cbind(MSEs)

# %%%%%%%%%%%%%%%%%%%%%%%%%
# 5) Network & variables
# %%%%%%%%%%%%%%%%%%%%%%%%%

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# 5a) We select variables that we want to use to create a distance matrix
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
DMV <- DT[, c("int", "durm", "linctot", "noliab")]
# Using the Euclidean distance metric on standardized variables, we create the
# distance matrix
DM <- as.matrix(dist(scale(DMV)))
DM[1:5, 1:5]
hist(DM, breaks = 200, prob = TRUE, main = "Distribution of distances")

# 5b) We first create a network for the whole sample
# install.packages('igraph') # Install only if you have not installed it before
library(igraph)
# The distance matrix is used as a weighted, undirected adjacency matrix
g <- graph_from_adjacency_matrix(DM, mode = "undirected", weighted = TRUE)

# As this is a complete graph, we'll extract a sub-graph.
# Minimum spanning tree is a possibility.
# This might take a while - do not worry
g_mst <- mst(g)

# Visualize the 'Minimum Spanning Tree'
# Now let's plot the graph, highlighting loans where the return was negative
# (default). The graph has one vertex per row of DT (the whole sample), so the
# status flag must be computed from DT, not from the estimation sample S1.
status <- (DT$RR2 < 0) * 1
V(g_mst)$status <- status
V(g_mst)[status == 1]$color <- "firebrick1" # loans with negative return: red
V(g_mst)[status == 0]$color <- "lightgreen" # loans with non-negative return: green
# We create a plot
plot(
  g_mst, graph = "MST", vertex.label = NA, vertex.size = 3,
  main = "MST of the P2P applicants networks"
)

# Generate vertex level variables
DT$Deg <- igraph::degree(g_mst)            # degree centrality
DT$Str <- igraph::strength(g_mst)          # strength centrality
DT$Clos <- igraph::closeness(g_mst) * 10^4 # closeness centrality (rescaled)
DT$Bet <- igraph::betweenness(g_mst)       # betweenness centrality
com <- cluster_louvain(g_mst)              # community detection via 'Louvain method'
# How many communities do we have?
length(unique(com$membership))

# We know which loan belongs to which community - now we need to define
# community specific variables - dummies (one new variable per community).
# install.packages('dummies') # if not installed - install
library(dummies)
CD <- dummy(com$membership)
DT <- data.frame(DT, CD)
# The number of communities is not fixed in advance, so the dummy column names
# are read back from DT (they are the last ncol(CD) columns just appended)
# instead of assuming a particular count.
community_vars <- tail(names(DT), ncol(CD))

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# 6) Estimate LASSO, RIDGE, EN models with graph-level variables
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Baseline predictors + centrality measures + community dummies
net_vars <- c(base_vars, "Deg", "Str", "Clos", "Bet", community_vars)

# We need a matrix of independent variables: estimation sample
indep <- as.matrix(DT[1:(N - NF), net_vars])
dep <- DT[1:(N - NF), "RR2"]
# Variables for the predicted loans
pred <- as.matrix(DT[(N - NF + 1):N, net_vars])
ytrue <- S2$RR2

# 6a) Estimate LASSO model
m5_L <- cv.glmnet(x = indep, y = dep, nfolds = 30, alpha = 1)
coef(m5_L, s = "lambda.1se")
# 6b) Forecast loan returns
yhat <- predict(m5_L, newx = pred, s = m5_L$lambda.1se)
# Calculate mean squared error
LASSO_N <- mse(yhat, ytrue)

# 6a) Estimate RIDGE model (alpha = 0 is the ridge penalty; alpha = 1 would
# simply repeat the LASSO fit above)
m5_R <- cv.glmnet(x = indep, y = dep, nfolds = 30, alpha = 0)
coef(m5_R, s = "lambda.1se")
# 6b) Forecast loan returns
yhat <- predict(m5_R, newx = pred, s = m5_R$lambda.1se)
# Calculate mean squared error
RIDGE_N <- mse(yhat, ytrue)

# 6a) Estimate EN alpha = 0.25 model
m5_E25 <- cv.glmnet(x = indep, y = dep, nfolds = 30, alpha = 0.25)
coef(m5_E25, s = "lambda.1se")
# 6b) Forecast loan returns
yhat <- predict(m5_E25, newx = pred, s = m5_E25$lambda.1se)
# Calculate mean squared error
EN25N <- mse(yhat, ytrue)

# 6a) Estimate EN alpha = 0.50 model
m5_E50 <- cv.glmnet(x = indep, y = dep, nfolds = 30, alpha = 0.50)
coef(m5_E50, s = "lambda.1se")
# 6b) Forecast loan returns
yhat <- predict(m5_E50, newx = pred, s = m5_E50$lambda.1se)
# Calculate mean squared error
EN50N <- mse(yhat, ytrue)

# 6a) Estimate EN alpha = 0.75 model
m5_E75 <- cv.glmnet(x = indep, y = dep, nfolds = 30, alpha = 0.75)
coef(m5_E75, s = "lambda.1se")
# 6b) Forecast loan returns
yhat <- predict(m5_E75, newx = pred, s = m5_E75$lambda.1se)
# Calculate mean squared error
EN75N <- mse(yhat, ytrue)

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# 7) Compare forecasting accuracy of competing models
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
MSEs <- c(
  OLS, LASSO, RIDGE, EN25, EN50, EN75,
  LASSO_N, RIDGE_N, EN25N, EN50N, EN75N
)
names(MSEs) <- c(
  "OLS", "LASSO", "RIDGE", "EN25", "EN50", "EN75",
  "LASSO_N", "RIDGE_N", "EN25N", "EN50N", "EN75N"
)
MSEs <- sort(MSEs)
cbind(MSEs)