library(caret)
library(glmnet)
library(tree)
library(rpart)
library(rpart.plot)
library(ranger)
library(factoextra)
library(fastDummies)
library(cluster)
library(pROC)
library(foreach)
library(doParallel)
library(gbm)
#-------------------------------------------------------------------------------------------------------------------------------------------
# Predict the price of a used car:
# Benchmark models:
#   LR, LASSO, RIDGE, RF, XGB, CSLR
# Competing models:
#   LR-KM, LASSO-KM, RIDGE-KM, RF-KM, XGB-KM, CSLR-KM
#   LR-KM-PC, LASSO-KM-PC, RIDGE-KM-PC, RF-KM-PC, XGB-KM-PC, CSLR-KM-PC
#   Combination forecasts ....
# Evaluate via MSE, MAE, mean absolute percentage error and the MCS algorithm
#-------------------------------------------------------------------------------------------------------------------------------------------
# First load the training (ocr) and testing (oct) data into the workspace
# rm(list=ls())
oct <- read.csv(file='C://Users//ckt//OneDrive - MUNI//Institutions//MU//ML Finance//2024//oct.csv')
ocr <- read.csv(file='C://Users//ckt//OneDrive - MUNI//Institutions//MU//ML Finance//2024//ocr.csv')

# Polynomial and interaction terms - training sample
ocr$km2     <- ocr$km^2
ocr$km3     <- ocr$km^3
ocr$age2    <- ocr$age^2
ocr$age3    <- ocr$age^3
ocr$kmage   <- ocr$km * ocr$age
ocr$kmkw    <- ocr$km * ocr$kw
ocr$kmkwage <- ocr$km * ocr$kw * ocr$age

# Polynomial and interaction terms - testing sample
oct$km2     <- oct$km^2
oct$km3     <- oct$km^3
oct$age2    <- oct$age^2
oct$age3    <- oct$age^3
oct$kmage   <- oct$km * oct$age
oct$kmkw    <- oct$km * oct$kw
oct$kmkwage <- oct$km * oct$kw * oct$age

# Store results
results = matrix(NA,nrow=28,ncol=5)
colnames(results) = c('mse','mse_mcs','mae','mae_mcs','mape')
rownames(results) = c('LR','LASSO','RIDGE','RF','XGB','CSLR',
                      'LR-KM','LASSO-KM','RIDGE-KM','RF-KM','XGB-KM','CSLR-KM',
                      'LR-KM-PC','LASSO-KM-PC','RIDGE-KM-PC','RF-KM-PC','XGB-KM-PC','CSLR-KM-PC',
                      'CF-AVE','CF-TAVE','CF-MED','CF-MSE-LIN','CF-MSE-TB',
                      'CF-RANK-MSE-LIN','CF-RANK-MSE-TB','CF-MCS','CF-LASSO','CF-RIDGE')

# Number of models
NM = dim(results)[1]

# Store losses -> we need them for the MCS procedure later on
loss.mse = matrix(NA,nrow=dim(oct)[1],ncol=NM)
colnames(loss.mse) = rownames(results)
loss.mae = loss.mse
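#-------------------------------------------------------------------------------------------------------------------------------------------
# 'gen.fm' (used just below), together with 'opt.rf', 'opt.gbm' and 'csr.method',
# is a helper from the course materials and is assumed to be in the workspace.
# A minimal sketch of gen.fm, consistent with how it is called here - it builds
# the formula 'dep ~ x1 + x2 + ...' from a dependent variable and feature names:
if (!exists('gen.fm')) {
  gen.fm <- function(dep,x) as.formula(paste(dep,'~',paste(x,collapse=' + ')))
}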
#-------------------------------------------------------------------------------------------------------------------------------------------
# What are our features?
features <- c('km','age','kw','trans','combi','allw','ambi','styl','rs','dsg','scr','diesel','lpg','hybrid','cng',
              'km2','age2','kmage','kmkw','km3','age3','kmkwage')
# What the specification looks like
specs <- gen.fm(dep='price',x=features)
#-------------------------------------------------------------------------------------------------------------------------------------------
# Standard benchmark models

# LR
m1 <- lm(specs,data=ocr)
p1 <- predict(m1,newdata=oct)

# LASSO & RIDGE
X  <- as.matrix(ocr[,features])
Y  <- as.matrix(ocr[,'price'])
XT <- as.matrix(oct[,features])

# Estimate the LASSO
cvm <- cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=20,alpha=1)
p2  <- predict(cvm,newx=XT,s='lambda.min')

# RIDGE
cvm <- cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=20,alpha=0)
p3  <- predict(cvm,newx=XT,s='lambda.min')

# RF ('opt.rf' is a course helper that cross-validates the tuning parameters)
A <- Sys.time()
# Set 'nc' (number of cores) according to the number of physical cores of your computer;
# 'cv' is the number of cross-validation samples
rf.tree <- opt.rf(spec=specs,DT=ocr,DE=oct,nc=10,cv=10,
                  minsplit=c(10),minbucket=c(5),maxdepth=c(0,7),
                  mtry=c(10,15),ntree=c(2000))
p4 <- rf.tree$predictions
Sys.time()-A # 1.7min

# XGB ('opt.gbm', another course helper)
A <- Sys.time()
gb.tree <- opt.gbm(spec=specs,DT=ocr,DE=oct,nc=10,cv=10,
                   ntree=5000,interaction.depth=1,shrinkage=c(0.05,0.01))
p5 <- gb.tree$predictions
Sys.time()-A # 2.5min

# CSLR ('csr.method', complete subset regressions, course helper)
# Do not set q to > 3!!!
A <- Sys.time()
tmp <- csr.method(spec=specs,train=ocr,test=oct,q=3,cv=10,trim=0.2,nc=14)
p6 <- tmp[,3]
Sys.time()-A

#-------------------------------------------------------------------------------------------------------------------------------------------
# Select continuous features & apply z-score standardization
# Used for both k-means and principal components
#-------------------------------------------------------------------------------------------------------------------------------------------
cont_features <- c('km','age','kw','km2','age2','kmage','kmkw','km3','age3','kmkwage')

# Standardize the data for k-means
# Note: the testing sample is standardized with its own means/sds here; strictly,
# the training-sample means/sds should be re-used to avoid information leakage.
zscore.train <- ocr[,cont_features]
for (i in 1:length(cont_features)) zscore.train[,i] <- scale(ocr[,cont_features[i]])
zscore.test <- oct[,cont_features]
for (i in 1:length(cont_features)) zscore.test[,i] <- scale(oct[,cont_features[i]])

#-------------------------------------------------------------------------------------------------------------------------------------------
# Calculate k-means
# Given the distance matrix - find K clusters
K = 20
res.hw <- kmeans(zscore.train,centers=K,iter.max=100,nstart=2000,algorithm='Hartigan-Wong')
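#----------------------------------------------------------------------------------------------------
# How could K be chosen? (a sketch, not part of the original pipeline)
# With factoextra already loaded, the within-cluster sum of squares 'elbow' or the
# average silhouette width can be inspected over a grid of K before fixing K = 20.
# Both calls are left commented out because they re-run kmeans many times:
# fviz_nbclust(zscore.train, kmeans, method='wss',        k.max=25)
# fviz_nbclust(zscore.train, kmeans, method='silhouette', k.max=25)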
#----------------------------------------------------------------------------------------------------
# Feature engineering - training
# We create 0/1 variables telling us to which cluster a given car belongs.
clu.train = dummy_cols(res.hw$cluster)[,-1] # First column is the original cluster vector
colnames(clu.train) = paste('hw',c(1:K),sep='')
# The 'clu.train' columns are our new 0/1 features that we can use to train a model
train1 = data.frame(ocr,clu.train)

#----------------------------------------------------------------------------------------------------
# Feature engineering - testing
# Center of each cluster
centroid <- res.hw$centers
clu.test = matrix(0,nrow=dim(zscore.test)[1],ncol=K) # Number of clusters K
colnames(clu.test) <- colnames(clu.train)
for (i in 1:nrow(zscore.test)) {
  # We need the distance between the given testing observation and each of the centroids.
  # Stack the centroids (they are just points in the feature space) and the i-th testing observation.
  cld = rbind(centroid,zscore.test[i,cont_features])
  # Now find the distances between all of these points
  tmp = as.matrix(dist(cld,method='euclidean'))
  # 'tmp' is the distance matrix; its LAST row holds the distances from the i-th observation to each centroid
  # tmp[dim(tmp)[1],1:K]
  # Find the centroid the observation is closest to
  opt.clus = which(tmp[dim(tmp)[1],1:K]==min(tmp[dim(tmp)[1],1:K]))
  # Set the corresponding cluster dummy to 1
  clu.test[i,opt.clus] = 1
}
# Testing dataset with the cluster-membership dummies appended
test1 = data.frame(oct,clu.test)
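#----------------------------------------------------------------------------------------------------
# A vectorized equivalent of the assignment loop above (a sketch; 'ZT', 'score' and
# 'nearest' are new illustrative names). It uses the identity
#   argmin_k ||z - c_k||^2 = argmax_k ( z'c_k - ||c_k||^2 / 2 ),
# so a single matrix product replaces one full distance matrix per observation.
ZT <- as.matrix(zscore.test[,cont_features])
score <- ZT %*% t(centroid) - matrix(rowSums(centroid^2)/2,nrow(ZT),K,byrow=TRUE)
nearest <- max.col(score,ties.method='first') # index of the closest centroid per test observation
# Sanity check against the loop-based dummies:
# all(nearest == apply(clu.test,1,which.max))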
#----------------------------------------------------------------------------------------------------
# Models with the new features
specs <- gen.fm(dep='price',x=c(features,names(clu.train)))

# LR
m7 <- lm(specs,data=train1)
p7 <- predict(m7,newdata=test1)

# LASSO & RIDGE
X  <- as.matrix(train1[,all.vars(specs)[-1]])
Y  <- as.matrix(train1[,'price'])
XT <- as.matrix(test1[,all.vars(specs)[-1]])

# Estimate the LASSO
cvm <- cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=20,alpha=1)
p8  <- predict(cvm,newx=XT,s='lambda.1se')

# RIDGE
cvm <- cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=20,alpha=0)
p9  <- predict(cvm,newx=XT,s='lambda.1se')

# RF
A <- Sys.time()
rf.tree <- opt.rf(spec=specs,DT=train1,DE=test1,nc=10,cv=10,
                  minsplit=c(10),minbucket=c(5),maxdepth=c(0,7),
                  mtry=c(10,15),ntree=c(2000))
p10 <- rf.tree$predictions
Sys.time()-A # 1.5min

# XGB
A <- Sys.time()
gb.tree <- opt.gbm(spec=specs,DT=train1,DE=test1,nc=10,cv=10,
                   ntree=5000,interaction.depth=1,shrinkage=c(0.05,0.01))
p11 <- gb.tree$predictions
Sys.time()-A # 1min

# CSLR
# Do not set q to > 3!!!
A <- Sys.time()
tmp <- csr.method(spec=specs,train=train1,test=test1,q=3,cv=10,trim=0.2,nc=14)
p12 <- tmp[,3] # third column, as in the other CSLR calls
Sys.time()-A

#-------------------------------------------------------------------------------------------------------------------------------------------
# Extract principal components => use them in the predictive models
#-------------------------------------------------------------------------------------------------------------------------------------------
# Use the continuous variables, which are already z-score standardized
m = prcomp(x=zscore.train[,cont_features])
str(m)
get_eigenvalue(m)

# Weights (loadings) in the rotation matrix
m$rotation
# Rows are variables, columns are principal components, values are the weights

# Extract the principal components
pcc = as.matrix(zscore.train[,cont_features]) %*% m$rotation
head(pcc)

# Check - uncorrelated?
round(cor(cbind(pcc,train1)),2)
# Indeed! The principal components are mutually orthogonal.

# How to select the number of components?
fviz_eig(m)
# The first 3 seem to be important
# Eigenvalues above 1?
round(get_eig(m),2)

# Add to the training dataset
train1 = data.frame(train1,pcc[,1:3])
# Add to the testing dataset
pcc = as.matrix(zscore.test[,cont_features]) %*% m$rotation
test1 = data.frame(test1,pcc[,1:3])
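#----------------------------------------------------------------------------------------------------
# Note (a sketch): prcomp() centers the data internally, so the idiomatic way to score
# new observations is stats::predict(), which re-applies the training centering before
# the rotation. Because the inputs are already z-scored (column means ~ 0), the manual
# matrix product above is numerically equivalent:
pcc.alt <- predict(m,newdata=zscore.test[,cont_features])
all.equal(unname(pcc.alt),unname(as.matrix(zscore.test[,cont_features]) %*% m$rotation))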
#----------------------------------------------------------------------------------------------------
# Models with the principal-component features
# (keep km, age and the dummies; kw and the polynomial/interaction terms are now proxied by the PCs)
specs <- gen.fm(dep='price',x=c(features[c(1,2,4:15)],'PC1','PC2','PC3',names(clu.train)))

# LR
m13 <- lm(specs,data=train1)
p13 <- predict(m13,newdata=test1)

# LASSO & RIDGE
X  <- as.matrix(train1[,all.vars(specs)[-1]])
Y  <- as.matrix(train1[,'price'])
XT <- as.matrix(test1[,all.vars(specs)[-1]])

# Estimate the LASSO
cvm <- cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=10,alpha=1)
p14 <- predict(cvm,newx=XT,s='lambda.min')

# RIDGE
cvm <- cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=10,alpha=0)
p15 <- predict(cvm,newx=XT,s='lambda.min')

# RF
A <- Sys.time()
rf.tree <- opt.rf(spec=specs,DT=train1,DE=test1,nc=10,cv=10,
                  minsplit=c(10),minbucket=c(5),maxdepth=c(0,7),
                  mtry=c(10,15),ntree=c(2000))
p16 <- rf.tree$predictions
Sys.time()-A # 1.5min

# XGB
A <- Sys.time()
gb.tree <- opt.gbm(spec=specs,DT=train1,DE=test1,nc=10,cv=10,
                   ntree=5000,interaction.depth=1,shrinkage=c(0.05,0.01))
p17 <- gb.tree$predictions
Sys.time()-A # 1.5min

# CSLR
# Do not set q to > 3!!!
A <- Sys.time()
tmp <- csr.method(spec=specs,train=train1,test=test1,q=3,cv=10,trim=0.2,nc=14)
p18 <- tmp[,3]
Sys.time()-A

#----------------------------------------------------------------------------------------------------
# Evaluate the competing models
predictions <- matrix(NA,nrow=dim(oct)[1],ncol=NM+1)
colnames(predictions) <- c('True',rownames(results))
predictions[,1] <- oct$price
predictions[,(1:18)+1] <- cbind(p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15,p16,p17,p18)
for (m in 1:18) loss.mse[,m] <- (predictions[,1]-predictions[,m+1])^2
for (m in 1:18) loss.mae[,m] <- abs(predictions[,1]-predictions[,m+1])
for (m in 1:18) results[m,c(1,3,5)] <- c(mean(loss.mse[,m]),mean(loss.mae[,m]),
                                         mean(abs(predictions[,1]-predictions[,m+1])/predictions[,1]))
round(results,2)

#----------------------------------------------------------------------------------------------------
# Combination forecasts
#----------------------------------------------------------------------------------------------------
# Unconditional forecast combinations
#----------------------------------------------------------------------------------------------------
# Simple average of the 18 individual forecasts
# (the right-hand sides below were missing in the original; an equal-weight mean,
#  a trimmed mean and the median across models are assumed)
p19 <- apply(predictions[,2:19],1,mean)
predictions[,20] <- p19
mean(abs(p19-predictions[,1]))

# Trimmed average (the trim fraction of 0.1 is an assumption)
p20 <- apply(predictions[,2:19],1,mean,trim=0.1)
predictions[,21] <- p20
mean(abs(p20-predictions[,1]))

# Median
p21 <- apply(predictions[,2:19],1,median)
predictions[,22] <- p21
mean(abs(p21-predictions[,1]))

#----------------------------------------------------------------------------------------------------
# Performance-based forecast combinations (MSE/MAE)
#----------------------------------------------------------------------------------------------------
# Average MSE on a calibration sample: the lower the error, the higher the weight
# First define the calibration sample size (taken from the testing sample)
CS <- 156
# Number of all observations in the testing sample
TS <- dim(predictions)[1]

# MSE - select only the linear models (LR & LASSO across the three feature sets)
mse.calib <- apply(loss.mse[1:CS,c(1,2,7:8,13:14)],2,mean)
w.mse <- mse.calib^(-1)/sum(mse.calib^(-1))
w.mse
p22 <- predictions[(CS+1):TS,c(1,2,7:8,13:14)+1] %*% w.mse
predictions[(CS+1):TS,23] <- p22
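#----------------------------------------------------------------------------------------------------
# A quick toy check of the inverse-MSE weighting rule (illustrative numbers only):
# calibration MSEs of 1, 2 and 4 give weights proportional to 1, 1/2 and 1/4,
# so the most accurate model receives the largest weight and the weights sum to one.
round(c(1,1/2,1/4)/sum(c(1,1/2,1/4)),3) # 0.571 0.286 0.143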
# MSE - select the tree-based models (RF & XGB)
# (the right-hand sides were missing in the original; the tree-based columns
#  c(4:5,10:11,16:17) are assumed, mirroring the rank-based block below)
mse.calib <- apply(loss.mse[1:CS,c(4:5,10:11,16:17)],2,mean)
w.mse <- mse.calib^(-1)/sum(mse.calib^(-1))
p23 <- predictions[(CS+1):TS,c(4:5,10:11,16:17)+1] %*% w.mse
predictions[(CS+1):TS,24] <- p23

#----------------------------------------------------------------------------------------------------
# Rank-based (MSE/MAE)

# MSE ranks - linear models (LR & LASSO)
# (ranks are computed on the calibration part 1:CS; the original used (CS+1):TS,
#  which would leak information from the evaluation window)
mse.calib <- rank(apply(loss.mse[1:CS,c(1,2,7:8,13:14)],2,mean))
w.mse <- mse.calib^(-1)/sum(mse.calib^(-1))
p24 <- predictions[(CS+1):TS,c(1,2,7:8,13:14)+1] %*% w.mse
predictions[(CS+1):TS,25] <- p24

# MAE ranks - tree-based models (RF & XGB)
mae.calib <- rank(apply(loss.mae[1:CS,c(4:5,10:11,16:17)],2,mean))
w.mae <- mae.calib^(-1)/sum(mae.calib^(-1))
p25 <- predictions[(CS+1):TS,c(4:5,10:11,16:17)+1] %*% w.mae
predictions[(CS+1):TS,26] <- p25

#----------------------------------------------------------------------------------------------------
# MCS-based model selection - use the average of the models selected as superior
library(MCS)
mcs <- MCSprocedure(loss.mse[1:CS,1:18],alpha=0.15,B=2000)
# (reconstructed: the forecasts of the surviving model set are averaged;
#  here just one model was left, so the average reduces to its forecast)
keep <- which(rownames(results)[1:18] %in% mcs@Info$model.names)
p26 <- apply(as.matrix(predictions[(CS+1):TS,keep+1]),1,mean)
predictions[(CS+1):TS,27] <- p26 # just one model left!

#----------------------------------------------------------------------------------------------------
# Regression-based -> conditional upon features

# LASSO & RIDGE
YT <- as.matrix(test1[1:CS,'price'])
XT <- as.matrix(predictions[1:CS,2:19])

# Estimate the LASSO
cvm <- cv.glmnet(x=XT,y=YT,type.measure='mse',nfolds=10,alpha=1)
round(coefficients(cvm,s='lambda.min'),3)
p27 <- as.matrix(cbind(1,predictions[(CS+1):TS,2:19])) %*% as.matrix(coefficients(cvm,s='lambda.min'))
#XT <- as.matrix(cbind(test1[1:CS,features],predictions[1:CS,2:19]))
#p27 <- as.matrix(cbind(1,test1[(CS+1):TS,features],predictions[(CS+1):TS,2:19])) %*% as.matrix(coefficients(cvm,s='lambda.min'))
predictions[(CS+1):TS,28] <- p27

# RIDGE - the individual forecasts and the features enter jointly
XT <- as.matrix(cbind(predictions[1:CS,2:19],test1[1:CS,features]))
cvm <- cv.glmnet(x=XT,y=YT,type.measure='mse',nfolds=10,alpha=0)
round(coefficients(cvm,s='lambda.min'),3)
p28 <- as.matrix(cbind(1,predictions[(CS+1):TS,2:19],test1[(CS+1):TS,features])) %*% as.matrix(coefficients(cvm,s='lambda.min'))
#p28 <- as.matrix(cbind(1,test1[(CS+1):TS,features],predictions[(CS+1):TS,2:19])) %*% as.matrix(coefficients(cvm,s='lambda.min'))
predictions[(CS+1):TS,29] <- p28

#----------------------------------------------------------------------------------------------------
# Statistical evaluation (on the evaluation part of the testing sample only)
for (m in 1:28) loss.mse[,m] <- (predictions[,1]-predictions[,m+1])^2
for (m in 1:28) loss.mae[,m] <- abs(predictions[,1]-predictions[,m+1])
for (m in 1:28) results[m,c(1,3,5)] <- c(mean(loss.mse[(CS+1):TS,m]),
                                         mean(loss.mae[(CS+1):TS,m]),
                                         mean(abs(predictions[(CS+1):TS,1]-predictions[(CS+1):TS,m+1])/predictions[(CS+1):TS,1]))
round(results,4)

#----------------------------------------------------------------------------------------------------
# We can study the MCS
tmp = MCSprocedure(loss.mse[(CS+1):TS,],alpha=0.15,B=2000)
results[which(rownames(results) %in% tmp@Info$model.names),2] <- 1
tmp = MCSprocedure(loss.mae[(CS+1):TS,],alpha=0.15,B=2000)
results[which(rownames(results) %in% tmp@Info$model.names),4] <- 1
#----------------------------------------------------------------------------------------------------
round(results,4)