# Function that creates a 'formula' object
gen.fm = function(dep='default',x=features) {
  # Start with the 'dep~1' string - the intercept-only model y = beta_0
  # paste() concatenates strings
  spec = paste(dep,'~1',sep='')
  # If there are no features - the vector 'x' has no elements - return the intercept-only formula
  if (is.null(x)) return(as.formula(spec))
  # Number of features
  NV = length(x)
  # Loop over all features - add one feature at a time
  for (v in 1:NV) spec = paste(spec,'+',x[v],sep='')
  return(as.formula(spec))
}

library(caret)
library(glmnet)
library(tree)
library(rpart)
library(rpart.plot)
library(ranger)
library(factoextra)
library(fastDummies)
library(cluster)
library(pROC)
library(foreach)
library(doParallel)
library(gbm)

#############################
# Predict price of the cars:
#
# BENCHMARKS:
# OLS
# LASSO-MIN, RIDGE-MIN, RF, XGB
# OLS with selected two-way interaction features
# LASSO-MIN, RIDGE-MIN, RF, XGB with all two-way interaction features
#
# COMBINATION FORECASTS:
# Simple average
# Trimmed average
# Median
# Optimal weights
# Performance based (MSE/MAE)
# Rank based (MSE/MAE)
# MCS based average
# Regression based -> unconstrained optimal weights
# Regression based -> conditional upon features
# Principal components regression
#############################

#############################
# First load training and testing data into the workspace
# rm(list=ls())
ocr <- read.csv(file='F:\\Institutions\\MU\\ML Finance\\2023\\Week 10\\ocr.csv')
oct <- read.csv(file='F:\\Institutions\\MU\\ML Finance\\2023\\Week 10\\oct.csv')

#############################
# We need to split the training data further. Why?
# Because the combination forecasts require out-of-sample performance of the
# individual models, which we estimate on a separate calibration sample.
ocr.calib <- ocr[seq(from=1,to=dim(ocr)[1],by=5),]
ocr.train <- ocr[-seq(from=1,to=dim(ocr)[1],by=5),]

# Store results
results = matrix(NA,nrow=18,ncol=5)
colnames(results) = c('mse','mse_mcs','mae','mae_mcs','mpe')
rownames(results) = c('LR','LASSO-MIN','RIDGE-MIN','RF','XGB',
                      'LASSO-MIN-2W','RIDGE-MIN-2W','RF-2W','XGB-2W',
                      'CF-AVE','CF-TAVE','CF-MED',
                      'CF-PERF-MSE','CF-PERF-MAE',
                      'CF-RANK-MSE','CF-RANK-MAE',
                      'CF-MCS','CF-UN-REG')
loss.mse = matrix(NA,nrow=dim(oct)[1],ncol=18)
colnames(loss.mse) = rownames(results)
loss.mae = loss.mse

# For the calibration sample (individual models only)
loss.mse.calib = matrix(NA,nrow=dim(ocr.calib)[1],ncol=9)
colnames(loss.mse.calib) = c('LR','LASSO-MIN','RIDGE-MIN','RF','XGB',
                             'LASSO-MIN-2W','RIDGE-MIN-2W','RF-2W','XGB-2W')
loss.mae.calib = loss.mse.calib

#############################
# Specification
specs <- gen.fm(dep='price',x=names(ocr)[c(3:6,8:21,23:27)])

#############################
# Benchmark models - PART I
#############################

# LR
m0  <- lm(specs,data=ocr.train)
p0  <- predict(m0,new=oct)
pc0 <- predict(m0,new=ocr.calib)

# LASSO-MIN
x     <- as.matrix(ocr.train[,all.vars(specs)[-1]])
newx  <- as.matrix(oct[,all.vars(specs)[-1]])
newcx <- as.matrix(ocr.calib[,all.vars(specs)[-1]])
y     <- as.matrix(ocr.train[,all.vars(specs)[1]])
lasso <- cv.glmnet(x=x,y=y,type.measure='deviance',nfolds=10,family='gaussian',alpha=1)
p1  <- as.numeric(predict(lasso,newx=newx,s='lambda.min'))
pc1 <- as.numeric(predict(lasso,newx=newcx,s='lambda.min'))

# RIDGE-MIN
ridge <- cv.glmnet(x=x,y=y,type.measure='deviance',nfolds=10,family='gaussian',alpha=0)
p2  <- as.numeric(predict(ridge,newx=newx,s='lambda.min'))
pc2 <- as.numeric(predict(ridge,newx=newcx,s='lambda.min'))
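# A quick optional check (a sketch, not required for the rest of the script):
# inspect which features LASSO actually retains at lambda.min and compare them
# with the Ridge coefficients, which are shrunk towards zero but never exactly zero.
lasso.coef <- coef(lasso,s='lambda.min')
rownames(lasso.coef)[which(lasso.coef[,1]!=0)]  # features kept by LASSO (plus the intercept)
ridge.coef <- coef(ridge,s='lambda.min')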
# RF
rf.tree <- ranger(specs,data=ocr.train,num.trees=5000,mtry=6,min.node.size=5,max.depth=0)
p3  <- predict(rf.tree,data=oct)$predictions
pc3 <- predict(rf.tree,data=ocr.calib)$predictions

# XGB
gb.tree <- gbm(specs,data=ocr.train,distribution='gaussian',n.trees=5000,interaction.depth=3,shrinkage=0.01)
p4  <- predict(gb.tree,newdata=oct,n.trees=5000)
pc4 <- predict(gb.tree,newdata=ocr.calib,n.trees=5000)

#############################
# Evaluate benchmark models
#############################
p  <- oct$price
pc <- ocr.calib$price

# LR
loss.mse[,1] <- (p-p0)^2
loss.mae[,1] <- abs(p-p0)
results[1,c(1,3,5)] <- c(mean(loss.mse[,1]),mean(loss.mae[,1]),mean(abs(p-p0)/p))
loss.mse.calib[,1] <- (pc-pc0)^2
loss.mae.calib[,1] <- abs(pc-pc0)

# LASSO-MIN
loss.mse[,2] <- (p-p1)^2
loss.mae[,2] <- abs(p-p1)
results[2,c(1,3,5)] <- c(mean(loss.mse[,2]),mean(loss.mae[,2]),mean(abs(p-p1)/p))
loss.mse.calib[,2] <- (pc-pc1)^2
loss.mae.calib[,2] <- abs(pc-pc1)

# RIDGE-MIN
loss.mse[,3] <- (p-p2)^2
loss.mae[,3] <- abs(p-p2)
results[3,c(1,3,5)] <- c(mean(loss.mse[,3]),mean(loss.mae[,3]),mean(abs(p-p2)/p))
loss.mse.calib[,3] <- (pc-pc2)^2
loss.mae.calib[,3] <- abs(pc-pc2)

# RF
loss.mse[,4] <- (p-p3)^2
loss.mae[,4] <- abs(p-p3)
results[4,c(1,3,5)] <- c(mean(loss.mse[,4]),mean(loss.mae[,4]),mean(abs(p-p3)/p))
loss.mse.calib[,4] <- (pc-pc3)^2
loss.mae.calib[,4] <- abs(pc-pc3)

# XGB
loss.mse[,5] <- (p-p4)^2
loss.mae[,5] <- abs(p-p4)
results[5,c(1,3,5)] <- c(mean(loss.mse[,5]),mean(loss.mae[,5]),mean(abs(p-p4)/p))
loss.mse.calib[,5] <- (pc-pc4)^2
loss.mae.calib[,5] <- abs(pc-pc4)

#############################
# Benchmark models - 2-WAY INTERACTIONS
#############################
features <- all.vars(specs)[-1]
NV <- length(features)

ocr.train.2w <- matrix(NA,nrow=dim(ocr.train)[1],ncol=NV^2)
ocr.calib.2w <- matrix(NA,nrow=dim(ocr.calib)[1],ncol=NV^2)
oct.2w       <- matrix(NA,nrow=dim(oct)[1],ncol=NV^2)
nms <- c()
k <- 1
for (i in 1:NV) {
  for (j in 1:NV) {
    ocr.train.2w[,k] <- ocr.train[,features[i]]*ocr.train[,features[j]]
    ocr.calib.2w[,k] <- ocr.calib[,features[i]]*ocr.calib[,features[j]]
    oct.2w[,k]       <- oct[,features[i]]*oct[,features[j]]
    nms <- c(nms,paste(features[i],'.',features[j],sep=''))
    k <- k+1
  }
}
colnames(ocr.train.2w) <- nms
colnames(ocr.calib.2w) <- nms
colnames(oct.2w)       <- nms
ocr.train.2w <- data.frame(ocr.train,ocr.train.2w)
ocr.calib.2w <- data.frame(ocr.calib,ocr.calib.2w)
oct.2w       <- data.frame(oct,oct.2w)

# Now we can create the new - extended - specification
specs.2w <- gen.fm(dep='price',x=names(ocr.train.2w)[c(3:6,8:21,23:556)])

#############################
# Benchmark models - PART II
#############################

# LASSO-MIN-2W
x     <- as.matrix(ocr.train.2w[,all.vars(specs.2w)[-1]])
newx  <- as.matrix(oct.2w[,all.vars(specs.2w)[-1]])
newcx <- as.matrix(ocr.calib.2w[,all.vars(specs.2w)[-1]])
y     <- as.matrix(ocr.train.2w[,all.vars(specs.2w)[1]])
lasso <- cv.glmnet(x=x,y=y,type.measure='deviance',nfolds=10,family='gaussian',alpha=1)
plot(lasso)
p5  <- as.numeric(predict(lasso,newx=newx,s='lambda.min'))
pc5 <- as.numeric(predict(lasso,newx=newcx,s='lambda.min'))

# RIDGE-MIN-2W
ridge <- cv.glmnet(x=x,y=y,type.measure='deviance',nfolds=10,family='gaussian',alpha=0)
plot(ridge)
p6  <- as.numeric(predict(ridge,newx=newx,s='lambda.min'))
pc6 <- as.numeric(predict(ridge,newx=newcx,s='lambda.min'))

# RF-2W - uses the extended specification with all two-way interactions
rf.tree <- ranger(specs.2w,data=ocr.train.2w,num.trees=5000,mtry=6,min.node.size=5,max.depth=0)
p7  <- predict(rf.tree,data=oct.2w)$predictions
pc7 <- predict(rf.tree,data=ocr.calib.2w)$predictions

# XGB-2W - uses the extended specification with all two-way interactions
gb.tree <- gbm(specs.2w,data=ocr.train.2w,distribution='gaussian',n.trees=5000,interaction.depth=3,shrinkage=0.01)
p8  <- predict(gb.tree,newdata=oct.2w,n.trees=5000)
pc8 <- predict(gb.tree,newdata=ocr.calib.2w,n.trees=5000)
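# The evaluation blocks below repeat the same three computations for every model
# (squared error, absolute error, absolute percentage error). A compact alternative
# is a small helper such as the hypothetical eval_model() sketched here; it is not
# used below, where the explicit per-model blocks are kept.
eval_model <- function(actual,pred) {
  c(mse = mean((actual-pred)^2),
    mae = mean(abs(actual-pred)),
    mpe = mean(abs(actual-pred)/actual))
}
# Example: eval_model(p,p5) reproduces the three numbers stored in results[6,c(1,3,5)].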
#############################
# Evaluate benchmark models - part 2
#############################

# LASSO-MIN-2W
loss.mse[,6] <- (p-p5)^2
loss.mae[,6] <- abs(p-p5)
results[6,c(1,3,5)] <- c(mean(loss.mse[,6]),mean(loss.mae[,6]),mean(abs(p-p5)/p))
loss.mse.calib[,6] <- (pc-pc5)^2
loss.mae.calib[,6] <- abs(pc-pc5)

# RIDGE-MIN-2W
loss.mse[,7] <- (p-p6)^2
loss.mae[,7] <- abs(p-p6)
results[7,c(1,3,5)] <- c(mean(loss.mse[,7]),mean(loss.mae[,7]),mean(abs(p-p6)/p))
loss.mse.calib[,7] <- (pc-pc6)^2
loss.mae.calib[,7] <- abs(pc-pc6)

# RF-2W
loss.mse[,8] <- (p-p7)^2
loss.mae[,8] <- abs(p-p7)
results[8,c(1,3,5)] <- c(mean(loss.mse[,8]),mean(loss.mae[,8]),mean(abs(p-p7)/p))
loss.mse.calib[,8] <- (pc-pc7)^2
loss.mae.calib[,8] <- abs(pc-pc7)

# XGB-2W
loss.mse[,9] <- (p-p8)^2
loss.mae[,9] <- abs(p-p8)
results[9,c(1,3,5)] <- c(mean(loss.mse[,9]),mean(loss.mae[,9]),mean(abs(p-p8)/p))
loss.mse.calib[,9] <- (pc-pc8)^2
loss.mae.calib[,9] <- abs(pc-pc8)

#############################
# Combination forecasts
#############################

# Simple average
p.test <- cbind(p0,p1,p2,p3,p4,p5,p6,p7,p8)
p9 <- apply(p.test,1,mean)
loss.mse[,10] <- (p-p9)^2
loss.mae[,10] <- abs(p-p9)
results[10,c(1,3,5)] <- c(mean(loss.mse[,10]),mean(loss.mae[,10]),mean(abs(p-p9)/p))

# Trimmed average
p10 <- apply(p.test,1,mean,trim=0.3)
loss.mse[,11] <- (p-p10)^2
loss.mae[,11] <- abs(p-p10)
results[11,c(1,3,5)] <- c(mean(loss.mse[,11]),mean(loss.mae[,11]),mean(abs(p-p10)/p))

# Median
p11 <- apply(p.test,1,median)
loss.mse[,12] <- (p-p11)^2
loss.mae[,12] <- abs(p-p11)
results[12,c(1,3,5)] <- c(mean(loss.mse[,12]),mean(loss.mae[,12]),mean(abs(p-p11)/p))

# Performance based (MSE/MAE)
# Average MSE on calibration sample. The lower the better!
mse.calib <- apply(loss.mse.calib,2,mean)
w.mse <- mse.calib^(-1)/sum(mse.calib^(-1))
p12 <- p.test %*% w.mse
loss.mse[,13] <- (p-p12)^2
loss.mae[,13] <- abs(p-p12)
results[13,c(1,3,5)] <- c(mean(loss.mse[,13]),mean(loss.mae[,13]),mean(abs(p-p12)/p))

mae.calib <- apply(loss.mae.calib,2,mean)
w.mae <- mae.calib^(-1)/sum(mae.calib^(-1))
p13 <- p.test %*% w.mae
loss.mse[,14] <- (p-p13)^2
loss.mae[,14] <- abs(p-p13)
results[14,c(1,3,5)] <- c(mean(loss.mse[,14]),mean(loss.mae[,14]),mean(abs(p-p13)/p))

# Rank based (MSE/MAE)
mse.calib <- rank(apply(loss.mse.calib,2,mean))
w.mse <- mse.calib^(-1)/sum(mse.calib^(-1))
p14 <- p.test %*% w.mse
loss.mse[,15] <- (p-p14)^2
loss.mae[,15] <- abs(p-p14)
results[15,c(1,3,5)] <- c(mean(loss.mse[,15]),mean(loss.mae[,15]),mean(abs(p-p14)/p))

mae.calib <- rank(apply(loss.mae.calib,2,mean))
w.mae <- mae.calib^(-1)/sum(mae.calib^(-1))
p15 <- p.test %*% w.mae
loss.mse[,16] <- (p-p15)^2
loss.mae[,16] <- abs(p-p15)
results[16,c(1,3,5)] <- c(mean(loss.mse[,16]),mean(loss.mae[,16]),mean(abs(p-p15)/p))

# MCS model based selection
library(MCS)
MCSprocedure(loss.mse.calib,alpha=0.15,B=2000) # just one model left!
MCSprocedure(loss.mae.calib,alpha=0.15,B=2000) # just one model left again!
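# A possible way to build the MCS-based average programmatically instead of
# hard-coding the surviving models below: store the MCS output, rename the columns
# of p.test to match loss.mse.calib, and average only the models that remain in
# the superior set. This is a sketch - mcs.mse, p.test.named and p16.alt are
# hypothetical names, and the procedure is re-run here only so that its result is kept.
mcs.mse <- MCSprocedure(loss.mse.calib,alpha=0.15,B=2000)
p.test.named <- p.test
colnames(p.test.named) <- colnames(loss.mse.calib)
keep <- colnames(p.test.named) %in% mcs.mse@Info$model.names
p16.alt <- apply(p.test.named[,keep,drop=FALSE],1,mean)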
# MCS based average
p16 <- apply(cbind(p4,p5,p7,p8),1,mean)
loss.mse[,17] <- (p-p16)^2
loss.mae[,17] <- abs(p-p16)
results[17,c(1,3,5)] <- c(mean(loss.mse[,17]),mean(loss.mae[,17]),mean(abs(p-p16)/p))

# Regression based -> unconstrained weights, conditional upon features (km, age, transmission)
unregmod <- lm(pc~pc0+pc1+pc2+pc3+pc4+pc5+pc6+pc7+pc8+ocr.calib.2w$km+ocr.calib.2w$age+ocr.calib.2w$transmission)
summary(unregmod)
wreg <- as.numeric(coefficients(unregmod))
exog <- cbind(1,p.test,oct.2w$km,oct.2w$age,oct.2w$transmission)
p17  <- exog %*% wreg
loss.mse[,18] <- (p-p17)^2
loss.mae[,18] <- abs(p-p17)
results[18,c(1,3,5)] <- c(mean(loss.mse[,18]),mean(loss.mae[,18]),mean(abs(p-p17)/p))

# Statistical evaluation of the results
# Columns 15 and 16 contain identical predictions - one of them has to be removed
# before running the MCS.
tmp = MCSprocedure(loss.mse[,-16],alpha=0.15,B=2000)
results[which(rownames(results) %in% tmp@Info$model.names),2] <- 1
tmp = MCSprocedure(loss.mae[,-16],alpha=0.15,B=2000)
results[which(rownames(results) %in% tmp@Info$model.names),4] <- 1
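# 'Optimal weights' from the overview at the top are not computed above. A minimal
# sketch of one common implementation: weights chosen to minimise the squared error
# of the combined forecast on the calibration sample, constrained to be non-negative
# and to sum to one. This assumes the quadprog package (not loaded above) and uses
# the hypothetical names pc.calib, w.opt and p.opt.
library(quadprog)
pc.calib <- cbind(pc0,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8)    # calibration-sample predictions
D <- crossprod(pc.calib) + diag(1e-8,ncol(pc.calib))      # small ridge for numerical stability
d <- as.numeric(crossprod(pc.calib,pc))
A <- cbind(rep(1,ncol(pc.calib)),diag(ncol(pc.calib)))    # sum-to-one and non-negativity constraints
b <- c(1,rep(0,ncol(pc.calib)))
sol   <- solve.QP(Dmat=D,dvec=d,Amat=A,bvec=b,meq=1)
w.opt <- sol$solution
p.opt <- p.test %*% w.opt                                 # combined forecast on the test sample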