# The following command clears the workspace!
rm(list=ls())

# This clears the plot device (note: it throws an error if no device is open)
dev.off()

# We load the data file. FinTechP2P is an R object already.
load(file='C:\\Users\\EU\\Dropbox\\CNB FIN-TECH\\Prague\\FinTechP2P')

# Alternatively, we could load the *.csv file
DT = read.csv(file='C:\\Users\\EU\\Dropbox\\CNB FIN-TECH\\Prague\\FinTechP2P.csv')

# The name of the dataset is DT. There are 4157 observations and 43 variables.
# Each row is a loan; each column is a variable.
# Variable names are abbreviations. We can take a look at them.
names(DT)
# More details about the variables are given in Case study 3.

##############################################
# Case study 1: Simple linear regression model
##############################################

# We are interested in explaining the internal rate of return of micro-loans
# (on a P2P market) via a variable that signals whether the income of the
# borrower was NOT verified.

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Visualization
# %%%%%%%%%%%%%%%%%%%%%%%%%

# Plot the internal rate of return variable. The name of the variable is RR2.
plot(DT$RR2,type='p',pch=19,cex=0.25,xlab='Loans',ylab='Internal rate of return')
abline(h=0,lwd=2,col='red')
hist(DT$RR2,breaks=100,xlab='Return',prob=T,main='Distribution of returns')
abline(v=0,lwd=2,col='red')

# Plot the variable that codes whether the income of the borrower was NOT verified.
# The name of the variable is ver2. Note that table() reports the 0 (verified)
# category first, so the labels are ordered accordingly.
pie(table(DT$ver2),labels=c("Verified","Not verified"),col=c("white","red"))

# We can see the percentages of 1 (not verified) and 0 (verified), rounded to 2 decimal places:
prct = round(100*table(DT$ver2)/sum(table(DT$ver2)),2)
prct

# And a little tweak of the pie chart, with the percentages in it
pie(table(DT$ver2),labels=paste(c("Verified","Not verified")," - ",prct,"%",sep=""),col=c("white","red"))

# We can visualize the rate of return conditional on verification
boxplot(DT$RR2~DT$ver2,pch=19,cex=0.25)

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Descriptive statistics
# %%%%%%%%%%%%%%%%%%%%%%%%%

# For convenience we create a new object out of the dependent variable and name it 'y'
y = DT$RR2

# We need a new package for the skewness and kurtosis functions.
# Run the following line only once:
# install.packages('moments')
# Now we load the package into the workspace
library(moments)

# We calculate the mean, standard deviation, min, median, max, skewness and kurtosis,
# rounded to 2 decimal places to make the results more readable
round(c(mean(y),sd(y),min(y),median(y),max(y),skewness(y),kurtosis(y)),2)

# We can also look at the % of loans that defaulted completely
round(100*sum(y==-100)/length(y),2)
# Or at the % of loans with a return below 0 that did not fully default
round(100*sum(y>-100 & y<0)/length(y),2)

# As the independent variable is a 'dummy' variable, it can be fully summarized by a table
table(DT$ver2)
# Or by percentages
prct
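# A small sketch of what the dummy regression below estimates: with a single
# 0/1 regressor, the OLS intercept equals the mean return of the baseline
# group (ver2 = 0) and the slope equals the difference in group means.
grp_means = tapply(DT$RR2, DT$ver2, mean)
grp_means
diff(grp_means)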
# %%%%%%%%%%%%%%%%%%%%%%%%%
# Model estimation: OLS
# %%%%%%%%%%%%%%%%%%%%%%%%%

# The function lm() estimates the model. Note that a constant is included by
# default: one does not need to write RR2~1+ver2, it is enough to write RR2~ver2.
m1 = lm(RR2~ver2,data=DT)
# Coefficient estimates are
m1
# More detailed results
summary(m1)
# Take a look at the coefficients
# Take a look at the (uncertainty) variance of coefficient estimates (hence the p-value)
# Take a look at the R2 of the model
# Take a look at important (testable) model assumptions
# With cross-sectional data - particularly heteroscedasticity

# We need to download and install a new library. Run this line only once:
# install.packages("lmtest")
# We load the library into our current workspace
library(lmtest)
# Now we run the basic heteroscedasticity test of Breusch and Pagan,
# implemented in the 'lmtest' package as bptest()
bptest(m1)
# Further tests to consider: normality of residuals, model specification

# As heteroscedasticity is present (p-value < alpha = 0.05), inference is made
# using (a) consistent estimator(s) of the standard errors of the coefficients.
# We need a new library:
# install.packages("sandwich")
library(sandwich)
coeftest(m1, vcov=vcovHC(m1,type='HC0'))
# Now interpret the statistical significance of the coefficients.
# Further issues to consider: residual diagnostics (outliers) and dependence
# structures (clustering, spatial, in time).

##############################################
# Case study 1: Alternative version
##############################################

################################
# We are interested in explaining the annualized interest rate on the loan
# (the variable 'int') via the income-verification dummy (ver2).
################################

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Visualization
# %%%%%%%%%%%%%%%%%%%%%%%%%

# Let's take a look at the interest rate variable. The name of the variable is int.
plot(y=DT$int,x=DT$end_date,type='p',pch=19,cex=0.25,xlab='Date',ylab='Annualized interest rate')
hist(DT$int,breaks=100,xlab='Interest rate',prob=T,main='Distribution of interest rates')

# We can visualize the interest rate conditional on verification
boxplot(DT$int~DT$ver2,pch=19,cex=0.25)

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Descriptive statistics
# %%%%%%%%%%%%%%%%%%%%%%%%%
y = DT$int
round(c(mean(y),sd(y),min(y),median(y),max(y),skewness(y),kurtosis(y)),2)

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Model estimation: OLS
# %%%%%%%%%%%%%%%%%%%%%%%%%
m2 = lm(int~ver2,data=DT)
m2
summary(m2)
bptest(m2)
coeftest(m2, vcov=vcovHC(m2,type='HC0'))

##############################################
# Case study 2:
##############################################

# We are interested in explaining the return on the loan (RR2) via the interest rate.

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Visualization
# %%%%%%%%%%%%%%%%%%%%%%%%%
plot(y=DT$int,x=DT$end_date,type='p',pch=19,cex=0.25,xlab='Date',ylab='Annualized interest rate')
hist(DT$int,breaks=100,xlab='Interest rate',prob=T,main='Distribution of interest rates')
plot(y=DT$RR2,x=DT$int,pch=19,cex=0.25,xlab='Interest rate',ylab='Realized return')

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Model estimation: OLS
# %%%%%%%%%%%%%%%%%%%%%%%%%
m3 = lm(RR2~int,data=DT)
m3
summary(m3)
bptest(m3)
coeftest(m3, vcov=vcovHC(m3,type='HC0'))
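# A minimal sketch of what vcovHC(..., type='HC0') computes, done by hand for
# model m3: the White/Eicker sandwich (X'X)^{-1} X' diag(e^2) X (X'X)^{-1}.
X = model.matrix(m3)                 # design matrix (constant included)
e = residuals(m3)                    # OLS residuals
XtXi = solve(crossprod(X))           # (X'X)^{-1}
meat = crossprod(X * e)              # X' diag(e_i^2) X
sqrt(diag(XtXi %*% meat %*% XtXi))   # HC0 standard errors, matching coeftest above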
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#########################################################
# You can check the multivariate regression model example
#########################################################
m1=lm(RR2~ver2,data=DT);summary(m1)
res1 = residuals(m1)
m4=lm(int~ver2,data=DT);summary(m4)
res4 = residuals(m4)
# The coefficient here (this is the Frisch-Waugh-Lovell result) ...
m5=lm(res1~-1+res4); summary(m5)
# ... should be (and is) the same as the coefficient on 'int' here ...
m6 = lm(RR2~int+ver2,data=DT)
summary(m6)

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
################################################
# Case study 3: Multiple linear regression model
################################################

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Data description
# %%%%%%%%%%%%%%%%%%%%%%%%%

###########################################################################
# We are interested in explaining the return on the loan (RR2) via multiple
# independent variables:
# new         - 1 if the customer is new, 0 otherwise
# ver2        - 1 if income is not verified, 0 otherwise
# ver3        - 1 if income is verified, 0 otherwise
# ver4        - 1 if income and expenses are verified, 0 otherwise
# lfi         - 1 if the language of the borrower is Finnish, 0 otherwise
# lee         - 1 if the language of the borrower is Estonian, 0 otherwise
# luk         - 1 if the language of the borrower is English, 0 otherwise
# lrs         - 1 if the language of the borrower is Russian, 0 otherwise
# lsk         - 1 if the language of the borrower is Slovak, 0 otherwise
# German and Spanish are the remaining languages
# age         - age of the borrower
# female      - 1 if the borrower is a female, 0 otherwise
# undG        - 1 if the gender is left undefined, 0 otherwise
# Male is the category missing here
# lamt        - the log (natural, henceforth) of the amount the borrower received
# int         - interest on the loan
# durm        - planned loan duration (in months)
# educprim    - 1 primary education, 0 otherwise
# educbasic   - 1 basic education, 0 otherwise
# educvocat   - 1 vocational education, 0 otherwise
# educsec     - 1 secondary education, 0 otherwise
# Higher education is the category missing
# msmar       - 1 married, 0 otherwise
# msco        - 1 cohabitation, 0 otherwise
# mssi        - 1 single, 0 otherwise
# msdi        - 1 divorced, 0 otherwise
# Widowed is the category missing
# espem       - 1 partially employed, 0 otherwise
# esfue       - 1 fully employed, 0 otherwise
# essem       - 1 self-employed, 0 otherwise
# esent       - 1 entrepreneur, 0 otherwise
# esret       - 1 retiree, 0 otherwise
# Unemployed is the category missing
# dures       - current employment duration, in ordered categories: 0 - nothing, 1 - TrialPeriod, 2 - UpTo1Year, 3 - UpTo2Years, 4 - UpTo3Years, 5 - UpTo4Years, 6 - UpTo5Years, 7 - MoreThan5Years
# exper       - work experience, in ordered categories: 0 - no, 1 - LessThan2Years, 2 - 2To5Years, 3 - 5To10Years, 4 - 10To15Years, 5 - 15To25Years, 6 - MoreThan25Years
# linctot     - log of total income of the borrower
# noliab      - number of existing liabilities of the borrower
# lliatot     - log of the total amount of current liabilities
# norli       - number of refinanced liabilities
# noplo       - number of previous loans
# lamountplo  - log of the total amount of the previous loans
# lamntplr    - log of the total amount of previous loans that were paid off
# lamteprl    - log of the total amount of previous loans that were paid off before the loan's duration
# nopearlyrep - number of previous early re-paid loans
# nrodep      - number of dependants in the household

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Sample splitting
# %%%%%%%%%%%%%%%%%%%%%%%%%

# Number of forecasted loans
NF = 100
# Overall number of observations
N = dim(DT)[1]
# The sample used to estimate the model
Sample1 = DT[1:(N-NF),]
# The sample used to predict (out-of-sample) loan returns
Sample2 = DT[(N-NF+1):N,]
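# A quick sanity-check sketch: the verification dummies should be mutually
# exclusive and exhaustive. If ver2 + ver3 + ver4 always equals 1, one
# category must be left out of the regression below (here ver2 serves as
# the baseline) to avoid the dummy-variable trap.
table(DT$ver2 + DT$ver3 + DT$ver4)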
# %%%%%%%%%%%%%%%%%%%%%%%%%
# Model estimation
# %%%%%%%%%%%%%%%%%%%%%%%%%

# Model estimation
m7 = lm(RR2~new+ver3+ver4+lfi+lee+luk+lrs+lsk+age+undG+
          female+lamt+int+durm+educprim+educbasic+
          educvocat+educsec+msmar+msco+mssi+msdi+nrodep+
          espem+esfue+essem+esent+esret+dures+exper+
          linctot+noliab+lliatot+norli+noplo+lamountplo+
          lamntplr+lamteprl+nopearlyrep,data=Sample1)

# We take a look at the model fit and the coefficients
summary(m7)
# Heteroscedasticity in these types of models is almost guaranteed, as they
# are not well specified (many non-linear effects)
bptest(m7)
# (Usually) more conservative standard errors
coeftest(m7, df = Inf, vcov = vcovHC(m7, type = "HC0"))

# Check for near-perfect collinearity using the variance inflation factor (VIF).
# We need a new library; install it only once:
# install.packages('car')
# Calling the library
library(car)
# Now calculating the VIF.
# If any of the variables has VIF > 10 and its coefficient is non-significant,
# multicollinearity might be an issue.
which(vif(m7)>10)
# Removing 'msco' would lead to a model with no collinearity issues, but it
# would not improve the model. We thus work with the m7 model.

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Return prediction
# %%%%%%%%%%%%%%%%%%%%%%%%%

# Let's predict loan returns using the estimated model
yhat = predict(m7,new=Sample2)
# We save the realized (true) loan returns
ytrue = Sample2$RR2

# We can visualize our accuracy
plot(y=ytrue,x=yhat,pch=19,cex=0.25,ylim=c(min(yhat,ytrue),max(yhat,ytrue)),
     xlim=c(min(yhat,ytrue),max(yhat,ytrue)),xlab='Predicted returns',ylab='Realized returns')

# We can take a look at the true and forecasted returns
cbind(yhat,ytrue)
# On many occasions the errors are small, but particularly for negative
# returns the errors tend to increase a lot.

# There are several metrics to measure the accuracy of a forecast.
# The most standard one is the Mean Squared Error. We can use the Mean Squared
# Forecast Error (MSFE) to compare forecasts between different models and
# choose the one with the lowest MSFE.
mean((yhat-ytrue)^2)
# Mean absolute deviation
mean(abs(yhat-ytrue))
# Histogram of forecast errors
hist(abs(yhat-ytrue),main='Forecast errors')
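# A small hedged addition: the MSFE of a naive benchmark that predicts the
# in-sample mean return for every loan. If m7 cannot beat this, the
# covariates add little predictive value out of sample. (Sketch only.)
ynaive = rep(mean(Sample1$RR2), NF)
mean((ynaive-ytrue)^2)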
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#########################################################
# Examples of overfitting: Model bias & variance
#########################################################

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Generate data
# %%%%%%%%%%%%%%%%%%%%%%%%%

# Number of observations
N = 50
# Random seed number
rs = 10
# Set random seed
set.seed(rs)
# Generate the independent variable
x = abs(rnorm(N))+1
# Set random seed
set.seed(rs+7)
# Generate the error terms
u = rnorm(N,0,0.25)
# Generate the 'true' relationship, not influenced by noise
yt = (2 + log(x))
# Generate the dependent variable
y = (yt + u)
# Stack the data into a data frame
sms = data.frame(x,y,yt,u)
# Sort the data according to the 'x' variable
sms = sms[order(sms$x),]

# Plot the observations, 'y' against 'x'
plot(x=sms$x,y=sms$y,type='p',pch=19,xlab='x',ylab='y')
# Draw the true 'line'
lines(x=sms$x,y=sms$yt,col='red',lwd=2)
legend('bottomright',legend=c('Observed value','True relationship'),
       pch=19,col=c('black','red'),bty='n')

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Sample splitting
# %%%%%%%%%%%%%%%%%%%%%%%%%

# Now we split the sample into 2 parts.
# Testing sample
set.seed(rs)
# Which observations (rows) are we going to select for the testing sample?
# We select N/2 observations for the testing sample
idx = c(sample(1:(N/2),20),sample((N/2+1):N,5))
tst = sms[idx,]
# Validation sample: the remaining observations
val = sms[-idx,]

# Let's plot the data from the testing dataset.
# Plot the observations, 'y' against 'x'
plot(x=tst$x,y=tst$y,type='p',pch=19,xlab='x',ylab='y',xlim=c(min(sms$x),max(sms$x)),ylim=c(min(sms$y),max(sms$y)))
# To keep the true line in perspective, we add it
lines(x=sms$x,y=sms$yt,col='red',lwd=2)
points(x=val$x,y=val$y,col='green',pch=19)
legend('topleft',legend=c('Testing sample','Validation sample','True relationship'),pch=19,bty='n',col=c('black','green','red'))

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Model estimation: OLS
# %%%%%%%%%%%%%%%%%%%%%%%%%

# Estimate a simple OLS model
m8 = lm(y~x,data=tst)
m8
# Plot the line
plot(x=tst$x,y=tst$y,type='p',pch=19,xlab='x',ylab='y',xlim=c(min(sms$x),max(sms$x)),ylim=c(min(sms$y),max(sms$y)))
lines(x=sms$x,y=sms$yt,col='red',lwd=2)
abline(m8,col='blue',lwd=3)
legend('topleft',legend=c('Testing sample','True relationship','Linear model'),pch=19,bty='n',col=c('black','red','blue'))

# Now let's estimate a polynomial model via OLS (increasing complexity)
m9 = lm(y~x+I(x^2)+I(x^3)+I(x^4)+I(x^5),data=tst)
m9
# Plot the curve
poly = data.frame(tst$x,fitted(m9)); names(poly) = c("x","ytp")
poly = poly[order(poly$x),]
plot(x=tst$x,y=tst$y,type='p',pch=19,xlab='x',ylab='y',xlim=c(min(sms$x),max(sms$x)),ylim=c(min(sms$y),max(sms$y)))
abline(m8,col='blue',lwd=3)
lines(x=poly$x,y=poly$ytp,col='purple',lwd=2)
legend('topleft',legend=c('Testing sample','Linear model','Polynomial'),pch=19,bty='n',col=c('black','blue','purple'))
# The purple curve fits the data much better... on the testing sample.

# Let's compare the fit using mean squared errors.
# First the line:
MSE_m8=mean((tst$y-fitted(m8))^2)
MSE_m8
# Then the polynomial:
MSE_m9=mean((tst$y-fitted(m9))^2)
MSE_m9
# The improvement?
100*(MSE_m8-MSE_m9)/MSE_m8

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Predictions
# %%%%%%%%%%%%%%%%%%%%%%%%%

# Is the polynomial going to out-perform the simple linear model?
# We now predict 'y' on the validation sample.
# First using the linear model
fm8 = predict(m8,new=val)
# Second using the polynomial model
fm9 = predict(m9,new=val)

# Let's plot the 'new' points and the 'old' curves
plot(x=val$x,y=val$y,pch=19,col='green',
     xlim=c(min(sms$x),max(sms$x)),
     ylim=c(min(c(sms$y,fm8,fm9)),max(c(sms$y,fm8,fm9))),
     xlab='x',ylab='y')
abline(m8,col='blue',lwd=3)
lines(x=poly$x,y=poly$ytp,col='purple',lwd=2)
points(x=val$x,y=val$y,pch=19,col='green')
legend('topleft',legend=c('Testing sample','Linear model','Polynomial','Validation sample'),pch=19,bty='n',col=c('black','blue','purple','green'))

# We now plot the forecasts
plot(x=val$x,y=val$y,pch=19,col='green',
     xlim=c(min(sms$x),max(sms$x)),
     ylim=c(min(c(sms$y,fm8,fm9)),max(c(sms$y,fm8,fm9))),
     xlab='x',ylab='y')
points(x=val$x,y=fm8,col='blue',pch=19)
points(x=val$x,y=fm9,col='purple',pch=19)
legend('topleft',bty='n',pch=19,legend=c("True values","Line","Curve"),col=c('green','blue','purple'))

# Now let's check the MSF(orecast)E
MSFE_m8 = mean((val$y-fm8)^2)
MSFE_m8
MSFE_m9 = mean((val$y-fm9)^2)
MSFE_m9
# The improvement?
100*(MSFE_m8-MSFE_m9)/MSFE_m8
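# A short sketch (using the tst/val objects from above) that traces how the
# out-of-sample MSFE changes with the polynomial degree - the classic
# bias-variance trade-off in one loop:
for (p in 1:8) {
  mp = lm(y~poly(x,p),data=tst)
  cat('degree',p,'MSFE:',round(mean((val$y-predict(mp,new=val))^2),4),'\n')
}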
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
################################################
# Case study 4: Ridge regression
################################################

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Sample splitting
# %%%%%%%%%%%%%%%%%%%%%%%%%

# We split the sample as before.
# Number of observations we use for the out-of-sample study
NF = 100
N = dim(DT)[1]
# Now we create the two samples
tst = DT[1:(N-NF),]       # in-sample (testing sample)
val = DT[((N-NF)+1):N,]   # out-of-sample (validation sample)

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Model estimation
# %%%%%%%%%%%%%%%%%%%%%%%%%

# We again estimate the OLS model
m7 = lm(RR2~new+ver3+ver4+lfi+lee+luk+lrs+lsk+age+undG+
          female+lamt+int+durm+educprim+educbasic+
          educvocat+educsec+msmar+msco+mssi+msdi+nrodep+
          espem+esfue+essem+esent+esret+dures+exper+
          linctot+noliab+lliatot+norli+noplo+lamountplo+
          lamntplr+lamteprl+nopearlyrep,data=tst)

# Now the ridge regression.
# We need to install the library - do it only once:
# install.packages('glmnet')
library(glmnet)

# We first need to prepare the data in a format that the function glmnet()
# understands: a matrix of independent variables
indep = as.matrix(tst[,c("new","ver3","ver4","lfi","lee","luk","lrs","lsk","age",
                         "undG","female","lamt","int","durm","educprim","educbasic",
                         "educvocat","educsec","msmar","msco","mssi","msdi","nrodep",
                         "espem","esfue","essem","esent","esret","dures","exper",
                         "linctot","noliab","lliatot","norli","noplo","lamountplo",
                         "lamntplr","lamteprl","nopearlyrep")])
dep = tst[,"RR2"]
# Select variables for prediction purposes - to be used later
pred = as.matrix(val[,c("new","ver3","ver4","lfi","lee","luk","lrs","lsk","age",
                        "undG","female","lamt","int","durm","educprim","educbasic",
                        "educvocat","educsec","msmar","msco","mssi","msdi","nrodep",
                        "espem","esfue","essem","esent","esret","dures","exper",
                        "linctot","noliab","lliatot","norli","noplo","lamountplo",
                        "lamntplr","lamteprl","nopearlyrep")])

# We set the number of folds for cross-validation purposes to 30
CV = cv.glmnet(x=indep,y=dep,nfolds=30,alpha=0)
# We plot the relationship between lambda and the MSE
plot(CV)
# Which lambda minimizes the MSE?
CV$lambda.min
# Which lambda corresponds to min(MSE) + 1 standard error of the MSE?
CV$lambda.1se

# You can take a look at the coefficients of the ridge regression and compare them with OLS
round(cbind(coefficients(m7),coef(CV,s='lambda.min'),coef(CV,s='lambda.1se')),4)
# This is interesting, as it shows how the coefficients are shrunk. Some are
# shrunk towards zero, while others are allowed to remain further away from 0,
# suggesting that these coefficients play an important role in predictions.

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Predictions
# %%%%%%%%%%%%%%%%%%%%%%%%%

# We perform the prediction using coefficients from 'm7' and new data from the validation sample
yOLS = predict(m7,new=val)
# We save the realized (true) loan returns
ytrue = val$RR2
# We calculate the MSFE
MSEOLS = mean((yOLS-ytrue)^2)

###############
# Now the ridge predictions using the glmnet package
yRIDGEmin=predict(CV,newx=pred,s=CV$lambda.min)
MSER1 = mean((ytrue-yRIDGEmin)^2)
yRIDGE1se=predict(CV,newx=pred,s=CV$lambda.1se)
MSER2 = mean((ytrue-yRIDGE1se)^2)
# Compare errors
cbind(MSEOLS, MSER1, MSER2)
# Both ridge regression predictions have slightly out-performed the OLS!
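# An optional sketch: the full ridge coefficient path as a function of lambda.
# glmnet() (without cv.) fits the entire path; the plot shows all coefficients
# shrinking smoothly towards zero as the penalty grows.
ridge_path = glmnet(x=indep,y=dep,alpha=0)
plot(ridge_path,xvar='lambda',label=TRUE)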
################################################
# Case study 5: LASSO regression
################################################

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Model estimation: LASSO
# %%%%%%%%%%%%%%%%%%%%%%%%%

# We use the objects from the previous case study.
# Perform the CV (note: the only difference is that now alpha=1 instead of 0)
CV = cv.glmnet(x=indep,y=dep,nfolds=30,alpha=1)
# We plot the relationship between lambda and the MSE
plot(CV)
# Which lambda minimizes the MSE?
CV$lambda.min
# Which lambda corresponds to min(MSE) + 1 standard error of the MSE?
CV$lambda.1se

# You can take a look at the coefficients of the LASSO regression and compare them with OLS
round(cbind(coefficients(m7),coef(CV,s='lambda.min'),coef(CV,s='lambda.1se')),4)
# Several coefficients are reduced to 0! These variables are interpreted as
# not being very useful in predicting the dependent variable.

# %%%%%%%%%%%%%%%%%%%%%%%%%
# Predictions
# %%%%%%%%%%%%%%%%%%%%%%%%%
yLASSOmin=predict(CV,newx=pred,s=CV$lambda.min)
MSEL1 = mean((ytrue-yLASSOmin)^2)
yLASSO1se=predict(CV,newx=pred,s=CV$lambda.1se)
MSEL2 = mean((ytrue-yLASSO1se)^2)
# Compare errors
cbind(MSEOLS, MSER1, MSER2, MSEL1, MSEL2)
# The LASSO with lambda.1se out-performed the OLS and the ridge!

################################################
# Case study 6: Elastic net
################################################

# %%%%%%%%%%%%%%%%%%%%%%%%%%
# Predictions: EN alpha=0.25
# %%%%%%%%%%%%%%%%%%%%%%%%%%
CV = cv.glmnet(x=indep,y=dep,nfolds=30,alpha=0.25)
CV$lambda.min
CV$lambda.1se
round(cbind(coefficients(m7),coef(CV,s='lambda.min'),coef(CV,s='lambda.1se')),4)
yNET025min=predict(CV,newx=pred,s=CV$lambda.min)
MSEEN1.1 = mean((ytrue-yNET025min)^2)
yNET0251se=predict(CV,newx=pred,s=CV$lambda.1se)
MSEEN1.2 = mean((ytrue-yNET0251se)^2)

# %%%%%%%%%%%%%%%%%%%%%%%%%%
# Predictions: EN alpha=0.50
# %%%%%%%%%%%%%%%%%%%%%%%%%%
CV = cv.glmnet(x=indep,y=dep,nfolds=30,alpha=0.50)
CV$lambda.min
CV$lambda.1se
round(cbind(coefficients(m7),coef(CV,s='lambda.min'),coef(CV,s='lambda.1se')),4)
yNET050min=predict(CV,newx=pred,s=CV$lambda.min)
MSEEN2.1 = mean((ytrue-yNET050min)^2)
yNET0501se=predict(CV,newx=pred,s=CV$lambda.1se)
MSEEN2.2 = mean((ytrue-yNET0501se)^2)

# %%%%%%%%%%%%%%%%%%%%%%%%%%
# Predictions: EN alpha=0.75
# %%%%%%%%%%%%%%%%%%%%%%%%%%
CV = cv.glmnet(x=indep,y=dep,nfolds=30,alpha=0.75)
CV$lambda.min
CV$lambda.1se
round(cbind(coefficients(m7),coef(CV,s='lambda.min'),coef(CV,s='lambda.1se')),4)
yNET075min=predict(CV,newx=pred,s=CV$lambda.min)
MSEEN3.1 = mean((ytrue-yNET075min)^2)
yNET0751se=predict(CV,newx=pred,s=CV$lambda.1se)
MSEEN3.2 = mean((ytrue-yNET0751se)^2)

# %%%%%%%%%%%%%%%%%%%%%%%%%%
# Compare all results
# %%%%%%%%%%%%%%%%%%%%%%%%%%
MSEs = c(MSEOLS,MSER1,MSER2,MSEL1,MSEL2,MSEEN1.1,MSEEN1.2,MSEEN2.1,MSEEN2.2,MSEEN3.1,MSEEN3.2)
names(MSEs) = c("OLS","Ridge_M","Ridge_1","LASSO_M","LASSO_1",
                "EN25_M","EN25_1","EN50_M","EN50_1","EN75_M","EN75_1")
MSEs = sort(MSEs)
cbind(MSEs)
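# A compact sketch of the same alpha search as a loop (assumes the indep,
# dep, pred and ytrue objects from above). Note that cv.glmnet() assigns
# folds randomly, so results vary between runs unless a seed is set first.
for (a in seq(0,1,by=0.25)) {
  CVa = cv.glmnet(x=indep,y=dep,nfolds=30,alpha=a)
  yh = predict(CVa,newx=pred,s=CVa$lambda.min)
  cat('alpha =',a,' MSFE:',round(mean((ytrue-yh)^2),2),'\n')
}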
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
####################################################
# FIGURES & CODES FOR EXAMPLE ON LOGISTIC REGRESSION
####################################################

# The dependent variable: 1 if the return on investment was less than 0, 0 otherwise
DEF = (DT$RR2<0)*1
# The independent variable is the annualized interest rate.
# First, fit the OLS model (not a good option - the linear probability model
# can predict probabilities outside [0,1])
m10 = lm(DEF~int,data=DT)
summary(m10)
# Visualize the relationship
plot(y=DEF,x=DT$int,pch=19,cex=0.50,xlab='Annualized interest rate [%]',ylab='(Probability of) default')
# Plot the regression line
abline(m10,col='red',lwd=2)

# Now estimate a logit model
m11 = glm(formula = DEF ~ int, family = binomial(link = "logit"),data = DT)
summary(m11)
# Plotting the S-curve
yfit=fitted(m11)
points(y=yfit,x=DT$int,col='blue',pch=19,cex=0.25)

# Visualizing how a unit change in the interest rate influences the probability
interest = seq(from=0,to=250,by=1)
B0=coefficients(m11)[1]
B1=coefficients(m11)[2]
fpi = (exp(B0+B1*interest))/(1+exp(B0+B1*interest))
dat = data.frame(fpi,interest); names(dat) = c("FittedProb","Interest")
dat$ME = c(NA,diff(dat$FittedProb))
plot(x=dat$Interest[-1],y=dat$ME[-1],pch=19,cex=0.25,xlab='Interest',ylab="Marginal effect")

# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
################################################
# Case study 7: Logistic regression
################################################

# %%%%%%%%%%%%%%%%%%%%%%%%%%
# Model estimation: Logit
# %%%%%%%%%%%%%%%%%%%%%%%%%%

# Storing the dependent variable for each of the samples
tst$DEF = (tst$RR2<0)*1
val$DEF = (val$RR2<0)*1

# Estimating the model
m12 = glm(formula = DEF ~ new+ver3+ver4+lfi+lee+luk+lrs+lsk+age+undG+
            female+lamt+int+durm+educprim+educbasic+
            educvocat+educsec+msmar+msco+mssi+msdi+nrodep+
            espem+esfue+essem+esent+esret+dures+exper+
            linctot+noliab+lliatot+norli+noplo+lamountplo+
            lamntplr+lamteprl+nopearlyrep,
          family = binomial(link = "logit"),data = tst)
summary(m12)

# %%%%%%%%%%%%%%%%%%%%%%%%%%
# Prediction of defaults
# %%%%%%%%%%%%%%%%%%%%%%%%%%

# Now we perform the prediction on the validation sample.
# Predicted log odds
ypred=predict(m12,new=val)
# Predicted probabilities
ppred=exp(ypred)/(1+exp(ypred))

# Comparing model predictions with the truth
true = val$DEF
# Let's code the response as follows: if the predicted probability is above 0.5,
# we assume that the loan is heading for a default (1), 0 otherwise.
predicted = as.numeric((ppred>0.5)*1)
# We create a contingency table (predictions in rows, truth in columns)
TBL = table(predicted,true)
TBL

# We can calculate the:
# Accuracy of predictions (share on the diagonal)
sum(diag(TBL))/sum(TBL)
# Sensitivity (true positive rate: correctly predicted defaults)
TBL[2,2]/sum(TBL[,2])
# Specificity (true negative rate: correctly predicted non-defaults)
TBL[1,1]/sum(TBL[,1])
# High accuracy driven by high specificity but low sensitivity: the model
# predicts non-defaults well, but defaults poorly. What to do?

# We could change the threshold (in-sample approach now)
ths = seq(from=0.01,to=0.99,by=0.01)
logit_eval = matrix(NA,nrow=length(ths),ncol=4)
logit_eval[,4] = ths
for (i in 1:length(ths)) {
  predicted = as.numeric((ppred>ths[i])*1)
  # We create a table for that
  TBL = table(predicted,true)
  if (dim(TBL)[1]==1 | dim(TBL)[2]==1) next
  # Accuracy of predictions (on the diagonal), in %
  logit_eval[i,1]=100*sum(diag(TBL))/sum(TBL)
  # Sensitivity, in %
  logit_eval[i,2]=100*TBL[2,2]/sum(TBL[,2])
  # Specificity, in %
  logit_eval[i,3]=100*TBL[1,1]/sum(TBL[,1])
}

# And visualize how accuracy, sensitivity & specificity change with the threshold
plot(x=ths,y=logit_eval[,1],pch=19,cex=0.25,ylim=c(0,100),xlim=c(-0.2,1),xlab='Threshold',ylab='Forecast evaluation')
points(x=ths,y=logit_eval[,2],pch=19,cex=0.25,col='red')
points(x=ths,y=logit_eval[,3],pch=19,cex=0.25,col='blue')
legend('bottomleft',legend=c("Accuracy","Sensitivity","Specificity"),
       col=c('black','red','blue'),pch=19,bty='n')
abline(v=0.5,col='black',lty=2)
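# A closing sketch: the same threshold trade-off summarized by an ROC curve,
# computed with base R only (no extra package assumed). TPR = sensitivity and
# FPR = 1 - specificity, traced over all thresholds in 'ths'; the area under
# this curve (AUC) is a common single-number summary of the classifier.
roc_pts = t(sapply(ths, function(th) {
  pred_th = (ppred > th)*1
  c(FPR = sum(pred_th==1 & true==0)/sum(true==0),
    TPR = sum(pred_th==1 & true==1)/sum(true==1))
}))
plot(roc_pts[,'FPR'],roc_pts[,'TPR'],type='b',pch=19,cex=0.5,
     xlab='False positive rate',ylab='True positive rate',main='ROC curve')
abline(a=0,b=1,lty=2)   # the 45-degree line of a no-skill classifier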