library(caret)
library(glmnet)
library(tree)
library(rpart)
library(rpart.plot)
library(ranger)
library(factoextra)
library(fastDummies)
library(cluster)
library(pROC)
library(foreach)
library(doParallel)
library(gbm)
#-------------------------------------------------------------------------------------------------------------------------------------------
# Predict the price of a used car:
# Benchmark models:
#   LR, LASSO, RIDGE, RF, XGB, CSLR
# Competing models:
#   LR-KM, LASSO-KM, RIDGE-KM, RF-KM, XGB-KM, CSLR-KM
#   LR-KM-PC, LASSO-KM-PC, RIDGE-KM-PC, RF-KM-PC, XGB-KM-PC, CSLR-KM-PC
#   Combination forecasts ....
# Evaluate via MSE, MAE, mean absolute percentage error and the MCS algorithm
#-------------------------------------------------------------------------------------------------------------------------------------------
# First load the training (ocr) and testing (oct) data into the workspace
# rm(list=ls())
oct <- read.csv(file='C://Users//ckt//OneDrive - MUNI//Institutions//MU//ML Finance//2024//oct.csv')
ocr <- read.csv(file='C://Users//ckt//OneDrive - MUNI//Institutions//MU//ML Finance//2024//ocr.csv')

# Polynomial and interaction terms - training sample
ocr$km2     <- ocr$km^2
ocr$km3     <- ocr$km^3
ocr$age2    <- ocr$age^2
ocr$age3    <- ocr$age^3
ocr$kmage   <- ocr$km * ocr$age
ocr$kmkw    <- ocr$km * ocr$kw
ocr$kmkwage <- ocr$km * ocr$kw * ocr$age

# Polynomial and interaction terms - testing sample
oct$km2     <- oct$km^2
oct$km3     <- oct$km^3
oct$age2    <- oct$age^2
oct$age3    <- oct$age^3
oct$kmage   <- oct$km * oct$age
oct$kmkw    <- oct$km * oct$kw
oct$kmkwage <- oct$km * oct$kw * oct$age

# Store results
results = matrix(NA,nrow=28,ncol=5)
colnames(results) = c('mse','mse_mcs','mae','mae_mcs','mape')
rownames(results) = c('LR','LASSO','RIDGE','RF','XGB','CSLR',
                      'LR-KM','LASSO-KM','RIDGE-KM','RF-KM','XGB-KM','CSLR-KM',
                      'LR-KM-PC','LASSO-KM-PC','RIDGE-KM-PC','RF-KM-PC','XGB-KM-PC','CSLR-KM-PC',
                      'CF-AVE','CF-TAVE','CF-MED','CF-MSE-LIN','CF-MSE-TB',
                      'CF-RANK-MSE-LIN','CF-RANK-MSE-TB','CF-MCS','CF-LASSO','CF-RIDGE')

# Number of models
NM = dim(results)[1]

# Store losses -> we need them for the MCS procedure later on
loss.mse = matrix(NA,nrow=dim(oct)[1],ncol=NM)
colnames(loss.mse) = rownames(results)
loss.mae = loss.mse
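#-------------------------------------------------------------------------------------------------------------------------------------------
# 'gen.fm' (used just below), together with 'opt.rf', 'opt.gbm' and 'csr.method',
# is a helper from the course materials and is assumed to be in the workspace.
# A minimal sketch of gen.fm, consistent with how it is called here - it builds
# the formula 'dep ~ x1 + x2 + ...' from a dependent variable and feature names:
if (!exists('gen.fm')) {
  gen.fm <- function(dep,x) as.formula(paste(dep,'~',paste(x,collapse=' + ')))
}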
#-------------------------------------------------------------------------------------------------------------------------------------------
# What are our features?
features <- c('km','age','kw','trans','combi','allw','ambi','styl','rs','dsg','scr','diesel','lpg','hybrid','cng',
              'km2','age2','kmage','kmkw','km3','age3','kmkwage')
# What the specification looks like
specs <- gen.fm(dep='price',x=features)
#-------------------------------------------------------------------------------------------------------------------------------------------
# Standard benchmark models

# LR
m1 <- lm(specs,data=ocr)
p1 <- predict(m1,newdata=oct)

# LASSO & RIDGE
X  <- as.matrix(ocr[,features])
Y  <- as.matrix(ocr[,'price'])
XT <- as.matrix(oct[,features])

# Estimate the LASSO
cvm <- cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=20,alpha=1)
p2  <- predict(cvm,newx=XT,s='lambda.min')

# RIDGE
cvm <- cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=20,alpha=0)
p3  <- predict(cvm,newx=XT,s='lambda.min')

# RF ('opt.rf' is a course helper that cross-validates the tuning parameters)
A <- Sys.time()
# Set 'nc' (number of cores) according to the number of physical cores of your computer;
# 'cv' is the number of cross-validation samples
rf.tree <- opt.rf(spec=specs,DT=ocr,DE=oct,nc=10,cv=10,
                  minsplit=c(10),minbucket=c(5),maxdepth=c(0,7),
                  mtry=c(10,15),ntree=c(2000))
p4 <- rf.tree$predictions
Sys.time()-A # 1.7min

# XGB ('opt.gbm', another course helper)
A <- Sys.time()
gb.tree <- opt.gbm(spec=specs,DT=ocr,DE=oct,nc=10,cv=10,
                   ntree=5000,interaction.depth=1,shrinkage=c(0.05,0.01))
p5 <- gb.tree$predictions
Sys.time()-A # 2.5min

# CSLR ('csr.method', complete subset regressions, course helper)
# Do not set q to > 3!!!
A <- Sys.time()
tmp <- csr.method(spec=specs,train=ocr,test=oct,q=3,cv=10,trim=0.2,nc=14)
p6 <- tmp[,3]
Sys.time()-A

#-------------------------------------------------------------------------------------------------------------------------------------------
# Select continuous features & apply z-score standardization
# Used for both k-means and principal components
#-------------------------------------------------------------------------------------------------------------------------------------------
cont_features <- c('km','age','kw','km2','age2','kmage','kmkw','km3','age3','kmkwage')

# Standardize the data for k-means
# Note: the testing sample is standardized with its own means/sds here; strictly,
# the training-sample means/sds should be re-used to avoid information leakage.
zscore.train <- ocr[,cont_features]
for (i in 1:length(cont_features)) zscore.train[,i] <- scale(ocr[,cont_features[i]])
zscore.test <- oct[,cont_features]
for (i in 1:length(cont_features)) zscore.test[,i] <- scale(oct[,cont_features[i]])

#-------------------------------------------------------------------------------------------------------------------------------------------
# Calculate k-means
# Given the distance matrix - find K clusters
K = 20
res.hw <- kmeans(zscore.train,centers=K,iter.max=100,nstart=2000,algorithm='Hartigan-Wong')
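#----------------------------------------------------------------------------------------------------
# How could K be chosen? (a sketch, not part of the original pipeline)
# With factoextra already loaded, the within-cluster sum of squares 'elbow' or the
# average silhouette width can be inspected over a grid of K before fixing K = 20.
# Both calls are left commented out because they re-run kmeans many times:
# fviz_nbclust(zscore.train, kmeans, method='wss',        k.max=25)
# fviz_nbclust(zscore.train, kmeans, method='silhouette', k.max=25)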
#----------------------------------------------------------------------------------------------------
# Feature engineering - training
# We create 0/1 variables telling us to which cluster a given car belongs.
clu.train = dummy_cols(res.hw$cluster)[,-1] # First column is the original cluster vector
colnames(clu.train) = paste('hw',c(1:K),sep='')
# The 'clu.train' columns are our new 0/1 features that we can use to train a model
train1 = data.frame(ocr,clu.train)

#----------------------------------------------------------------------------------------------------
# Feature engineering - testing
# Center of each cluster
centroid <- res.hw$centers
clu.test = matrix(0,nrow=dim(zscore.test)[1],ncol=K) # Number of clusters K
colnames(clu.test) <- colnames(clu.train)
for (i in 1:nrow(zscore.test)) {
  # We need the distance between the given testing observation and each of the centroids.
  # Stack the centroids (they are just points in the feature space) and the i-th testing observation.
  cld = rbind(centroid,zscore.test[i,cont_features])
  # Now find the distances between all of these points
  tmp = as.matrix(dist(cld,method='euclidean'))
  # 'tmp' is the distance matrix; its LAST row holds the distances from the i-th observation to each centroid
  # tmp[dim(tmp)[1],1:K]
  # Find the centroid the observation is closest to
  opt.clus = which(tmp[dim(tmp)[1],1:K]==min(tmp[dim(tmp)[1],1:K]))
  # Set the corresponding cluster dummy to 1
  clu.test[i,opt.clus] = 1
}
# Testing dataset with the cluster-membership dummies appended
test1 = data.frame(oct,clu.test)
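#----------------------------------------------------------------------------------------------------
# A vectorized equivalent of the assignment loop above (a sketch; 'ZT', 'score' and
# 'nearest' are new illustrative names). It uses the identity
#   argmin_k ||z - c_k||^2 = argmax_k ( z'c_k - ||c_k||^2 / 2 ),
# so a single matrix product replaces one full distance matrix per observation.
ZT <- as.matrix(zscore.test[,cont_features])
score <- ZT %*% t(centroid) - matrix(rowSums(centroid^2)/2,nrow(ZT),K,byrow=TRUE)
nearest <- max.col(score,ties.method='first') # index of the closest centroid per test observation
# Sanity check against the loop-based dummies:
# all(nearest == apply(clu.test,1,which.max))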
#----------------------------------------------------------------------------------------------------
# Models with the new features
specs <- gen.fm(dep='price',x=c(features,names(clu.train)))

# LR
m7 <- lm(specs,data=train1)
p7 <- predict(m7,newdata=test1)

# LASSO & RIDGE
X  <- as.matrix(train1[,all.vars(specs)[-1]])
Y  <- as.matrix(train1[,'price'])
XT <- as.matrix(test1[,all.vars(specs)[-1]])

# Estimate the LASSO
cvm <- cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=20,alpha=1)
p8  <- predict(cvm,newx=XT,s='lambda.1se')

# RIDGE
cvm <- cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=20,alpha=0)
p9  <- predict(cvm,newx=XT,s='lambda.1se')

# RF
A <- Sys.time()
rf.tree <- opt.rf(spec=specs,DT=train1,DE=test1,nc=10,cv=10,
                  minsplit=c(10),minbucket=c(5),maxdepth=c(0,7),
                  mtry=c(10,15),ntree=c(2000))
p10 <- rf.tree$predictions
Sys.time()-A # 1.5min

# XGB
A <- Sys.time()
gb.tree <- opt.gbm(spec=specs,DT=train1,DE=test1,nc=10,cv=10,
                   ntree=5000,interaction.depth=1,shrinkage=c(0.05,0.01))
p11 <- gb.tree$predictions
Sys.time()-A # 1min

# CSLR
# Do not set q to > 3!!!
A <- Sys.time()
tmp <- csr.method(spec=specs,train=train1,test=test1,q=3,cv=10,trim=0.2,nc=14)
p12 <- tmp[,3] # third column, as in the other CSLR calls
Sys.time()-A

#-------------------------------------------------------------------------------------------------------------------------------------------
# Extract principal components => use them in the predictive models
#-------------------------------------------------------------------------------------------------------------------------------------------
# Use the continuous variables, which are already z-score standardized
m = prcomp(x=zscore.train[,cont_features])
str(m)
get_eigenvalue(m)

# Weights (loadings) in the rotation matrix
m$rotation
# Rows are variables, columns are principal components, values are the weights

# Extract the principal components
pcc = as.matrix(zscore.train[,cont_features]) %*% m$rotation
head(pcc)

# Check - uncorrelated?
round(cor(cbind(pcc,train1)),2)
# Indeed! The principal components are mutually orthogonal.

# How to select the number of components?
fviz_eig(m)
# The first 3 seem to be important
# Eigenvalues above 1?
round(get_eig(m),2)

# Add to the training dataset
train1 = data.frame(train1,pcc[,1:3])
# Add to the testing dataset
pcc = as.matrix(zscore.test[,cont_features]) %*% m$rotation
test1 = data.frame(test1,pcc[,1:3])
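#----------------------------------------------------------------------------------------------------
# Note (a sketch): prcomp() centers the data internally, so the idiomatic way to score
# new observations is stats::predict(), which re-applies the training centering before
# the rotation. Because the inputs are already z-scored (column means ~ 0), the manual
# matrix product above is numerically equivalent:
pcc.alt <- predict(m,newdata=zscore.test[,cont_features])
all.equal(unname(pcc.alt),unname(as.matrix(zscore.test[,cont_features]) %*% m$rotation))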
#----------------------------------------------------------------------------------------------------
# Models with the principal-component features
# (keep km, age and the dummies; kw and the polynomial/interaction terms are now proxied by the PCs)
specs <- gen.fm(dep='price',x=c(features[c(1,2,4:15)],'PC1','PC2','PC3',names(clu.train)))

# LR
m13 <- lm(specs,data=train1)
p13 <- predict(m13,newdata=test1)

# LASSO & RIDGE
X  <- as.matrix(train1[,all.vars(specs)[-1]])
Y  <- as.matrix(train1[,'price'])
XT <- as.matrix(test1[,all.vars(specs)[-1]])

# Estimate the LASSO
cvm <- cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=10,alpha=1)
p14 <- predict(cvm,newx=XT,s='lambda.min')

# RIDGE
cvm <- cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=10,alpha=0)
p15 <- predict(cvm,newx=XT,s='lambda.min')

# RF
A <- Sys.time()
rf.tree <- opt.rf(spec=specs,DT=train1,DE=test1,nc=10,cv=10,
                  minsplit=c(10),minbucket=c(5),maxdepth=c(0,7),
                  mtry=c(10,15),ntree=c(2000))
p16 <- rf.tree$predictions
Sys.time()-A # 1.5min

# XGB
A <- Sys.time()
gb.tree <- opt.gbm(spec=specs,DT=train1,DE=test1,nc=10,cv=10,
                   ntree=5000,interaction.depth=1,shrinkage=c(0.05,0.01))
p17 <- gb.tree$predictions
Sys.time()-A # 1.5min

# CSLR
# Do not set q to > 3!!!
A <- Sys.time()
tmp <- csr.method(spec=specs,train=train1,test=test1,q=3,cv=10,trim=0.2,nc=14)
p18 <- tmp[,3]
Sys.time()-A

#----------------------------------------------------------------------------------------------------
# Evaluate the competing models
predictions <- matrix(NA,nrow=dim(oct)[1],ncol=NM+1)
colnames(predictions) <- c('True',rownames(results))
predictions[,1] <- oct$price
predictions[,(1:18)+1] <- cbind(p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,p11,p12,p13,p14,p15,p16,p17,p18)
for (m in 1:18) loss.mse[,m] <- (predictions[,1]-predictions[,m+1])^2
for (m in 1:18) loss.mae[,m] <- abs(predictions[,1]-predictions[,m+1])
for (m in 1:18) results[m,c(1,3,5)] <- c(mean(loss.mse[,m]),mean(loss.mae[,m]),
                                         mean(abs(predictions[,1]-predictions[,m+1])/predictions[,1]))
round(results,2)

#----------------------------------------------------------------------------------------------------
# Combination forecasts
#----------------------------------------------------------------------------------------------------
# Unconditional forecast combinations
#----------------------------------------------------------------------------------------------------
# Simple average of the 18 individual forecasts
# (the right-hand sides below were missing in the original; an equal-weight mean,
#  a trimmed mean and the median across models are assumed)
p19 <- apply(predictions[,2:19],1,mean)
predictions[,20] <- p19
mean(abs(p19-predictions[,1]))

# Trimmed average (the trim fraction of 0.1 is an assumption)
p20 <- apply(predictions[,2:19],1,mean,trim=0.1)
predictions[,21] <- p20
mean(abs(p20-predictions[,1]))

# Median
p21 <- apply(predictions[,2:19],1,median)
predictions[,22] <- p21
mean(abs(p21-predictions[,1]))

#----------------------------------------------------------------------------------------------------
# Performance-based forecast combinations (MSE/MAE)
#----------------------------------------------------------------------------------------------------
# Average MSE on a calibration sample: the lower the error, the higher the weight
# First define the calibration sample size (taken from the testing sample)
CS <- 156
# Number of all observations in the testing sample
TS <- dim(predictions)[1]

# MSE - select only the linear models (LR & LASSO across the three feature sets)
mse.calib <- apply(loss.mse[1:CS,c(1,2,7:8,13:14)],2,mean)
w.mse <- mse.calib^(-1)/sum(mse.calib^(-1))
w.mse
p22 <- predictions[(CS+1):TS,c(1,2,7:8,13:14)+1] %*% w.mse
predictions[(CS+1):TS,23] <- p22
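#----------------------------------------------------------------------------------------------------
# A quick toy check of the inverse-MSE weighting rule (illustrative numbers only):
# calibration MSEs of 1, 2 and 4 give weights proportional to 1, 1/2 and 1/4,
# so the most accurate model receives the largest weight and the weights sum to one.
round(c(1,1/2,1/4)/sum(c(1,1/2,1/4)),3) # 0.571 0.286 0.143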
# MSE - select the tree-based models (RF & XGB)
# (the right-hand sides were missing in the original; the tree-based columns
#  c(4:5,10:11,16:17) are assumed, mirroring the rank-based block below)
mse.calib <- apply(loss.mse[1:CS,c(4:5,10:11,16:17)],2,mean)
w.mse <- mse.calib^(-1)/sum(mse.calib^(-1))
p23 <- predictions[(CS+1):TS,c(4:5,10:11,16:17)+1] %*% w.mse
predictions[(CS+1):TS,24] <- p23

#----------------------------------------------------------------------------------------------------
# Rank-based (MSE/MAE)

# MSE ranks - linear models (LR & LASSO)
# (ranks are computed on the calibration part 1:CS; the original used (CS+1):TS,
#  which would leak information from the evaluation window)
mse.calib <- rank(apply(loss.mse[1:CS,c(1,2,7:8,13:14)],2,mean))
w.mse <- mse.calib^(-1)/sum(mse.calib^(-1))
p24 <- predictions[(CS+1):TS,c(1,2,7:8,13:14)+1] %*% w.mse
predictions[(CS+1):TS,25] <- p24

# MAE ranks - tree-based models (RF & XGB)
mae.calib <- rank(apply(loss.mae[1:CS,c(4:5,10:11,16:17)],2,mean))
w.mae <- mae.calib^(-1)/sum(mae.calib^(-1))
p25 <- predictions[(CS+1):TS,c(4:5,10:11,16:17)+1] %*% w.mae
predictions[(CS+1):TS,26] <- p25

#----------------------------------------------------------------------------------------------------
# MCS-based model selection - use the average of the models selected as superior
library(MCS)
mcs <- MCSprocedure(loss.mse[1:CS,1:18],alpha=0.15,B=2000)
# (reconstructed: the forecasts of the surviving model set are averaged;
#  here just one model was left, so the average reduces to its forecast)
keep <- which(rownames(results)[1:18] %in% mcs@Info$model.names)
p26 <- apply(as.matrix(predictions[(CS+1):TS,keep+1]),1,mean)
predictions[(CS+1):TS,27] <- p26 # just one model left!

#----------------------------------------------------------------------------------------------------
# Regression-based -> conditional upon features

# LASSO & RIDGE
YT <- as.matrix(test1[1:CS,'price'])
XT <- as.matrix(predictions[1:CS,2:19])

# Estimate the LASSO
cvm <- cv.glmnet(x=XT,y=YT,type.measure='mse',nfolds=10,alpha=1)
round(coefficients(cvm,s='lambda.min'),3)
p27 <- as.matrix(cbind(1,predictions[(CS+1):TS,2:19])) %*% as.matrix(coefficients(cvm,s='lambda.min'))
#XT <- as.matrix(cbind(test1[1:CS,features],predictions[1:CS,2:19]))
#p27 <- as.matrix(cbind(1,test1[(CS+1):TS,features],predictions[(CS+1):TS,2:19])) %*% as.matrix(coefficients(cvm,s='lambda.min'))
predictions[(CS+1):TS,28] <- p27

# RIDGE - the individual forecasts and the features enter jointly
XT <- as.matrix(cbind(predictions[1:CS,2:19],test1[1:CS,features]))
cvm <- cv.glmnet(x=XT,y=YT,type.measure='mse',nfolds=10,alpha=0)
round(coefficients(cvm,s='lambda.min'),3)
p28 <- as.matrix(cbind(1,predictions[(CS+1):TS,2:19],test1[(CS+1):TS,features])) %*% as.matrix(coefficients(cvm,s='lambda.min'))
#p28 <- as.matrix(cbind(1,test1[(CS+1):TS,features],predictions[(CS+1):TS,2:19])) %*% as.matrix(coefficients(cvm,s='lambda.min'))
predictions[(CS+1):TS,29] <- p28

#----------------------------------------------------------------------------------------------------
# Statistical evaluation (on the evaluation part of the testing sample only)
for (m in 1:28) loss.mse[,m] <- (predictions[,1]-predictions[,m+1])^2
for (m in 1:28) loss.mae[,m] <- abs(predictions[,1]-predictions[,m+1])
for (m in 1:28) results[m,c(1,3,5)] <- c(mean(loss.mse[(CS+1):TS,m]),
                                         mean(loss.mae[(CS+1):TS,m]),
                                         mean(abs(predictions[(CS+1):TS,1]-predictions[(CS+1):TS,m+1])/predictions[(CS+1):TS,1]))
round(results,4)

#----------------------------------------------------------------------------------------------------
# We can study the MCS
tmp = MCSprocedure(loss.mse[(CS+1):TS,],alpha=0.15,B=2000)
results[which(rownames(results) %in% tmp@Info$model.names),2] <- 1
tmp = MCSprocedure(loss.mae[(CS+1):TS,],alpha=0.15,B=2000)
results[which(rownames(results) %in% tmp@Info$model.names),4] <- 1
#----------------------------------------------------------------------------------------------------
round(results,4)