#---
#title: "Seminar5RM"
#author: "Stefan Lyocsa"
#date: "2023-23-10"
#output: html_document
#---
  
# Content
# * Import data, load packages and estimate previous models: LM1, LM2, LM3, BE, LASSO, RIDGE, EN
# * Complete Subset Linear Regression
# * Simple decision tree
#   + Estimate
#   + Visualize
#   + Predict
#   + Evaluate evaluation
# * More variables - shallow tree
# * More variables - deep tree
# * More variables - deep tree -> pre-pruning
# * More variables - deep tree -> penalization
# * Bagging
# * Random forest
# * Boosted tree
# * Task

# Import data, load packages and estimate models from previous session
# First load training and testing datasets into the workspace
oct <- read.csv(file='C://Users//ckt//OneDrive - MUNI//Institutions//MU//ML Finance//2024//oct.csv')
ocr <- read.csv(file='C://Users//ckt//OneDrive - MUNI//Institutions//MU//ML Finance//2024//ocr.csv')
# Add some features that we engineered -> for training sample
ocr$km2   <- ocr$km^2
ocr$km3   <- ocr$km^3
ocr$age2  <- ocr$age^2
ocr$age3  <- ocr$age^3
ocr$kmage <- ocr$km * ocr$age
ocr$kmkw  <- ocr$km * ocr$kw
ocr$kmkwage <- ocr$km * ocr$kw * ocr$age
# For testing sample
oct$km2   <- oct$km^2         
oct$km3   <- oct$km^3
oct$age2  <- oct$age^2
oct$age3  <- oct$age^3
oct$kmage <- oct$km * oct$age
oct$kmkw  <- oct$km * oct$kw
oct$kmkwage <- oct$km * oct$kw * oct$age

library(glmnet)  # LASSO, RIDGE, EN
library(rpart)   # Decision trees
library(tree)    # Creating trees
library(rpart.plot) # Having some plots of trees
library(ranger)  # Random forest
library(gbm) # Boosted regression tree
library(foreach) # Parallel look
library(doParallel) # Parallel computing
library(MCS)     # Model confidence set

# Before working with backward, forward and bi-directional elimination we estimate some baseline models that we want to out-perform.
m1 <- lm(price ~ km      , data=ocr)
m2 <- lm(price ~ age     , data=ocr)
m3 <- lm(price ~ km + age, data=ocr)
p1 <- predict(m1,new=oct)
p2 <- predict(m2,new=oct)
p3 <- predict(m3,new=oct)
# And we can create model 6 and corresponding predictions
features <- c('km','age','kw','trans','combi','allw','ambi','styl','rs','dsg','scr','diesel','lpg','hybrid','cng','km2','age2','kmage','kmkw','km3','age3','kmkwage')
m6 <- bwelim(dt=ocr,pt=0.05,dep='price',features=features)
p6 <- predict(m6,new=oct)
# We will work with octavia dataset
features <- c('km','age','kw','trans','combi','allw','ambi','styl','rs','dsg','scr','diesel','lpg','hybrid','cng','km2','age2','kmage','kmkw','km3','age3','kmkwage')
X = as.matrix(ocr[,features])
Y = as.matrix(ocr[,'price'])
XT = as.matrix(oct[,features])
# Estimate the LASSO
cvm = cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=10,alpha=1)
p9 = predict(cvm,newx = XT,s='lambda.min')
# Estimate the Ridge
cvm = cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=10,alpha=0)
p11 = predict(cvm,newx = XT,s='lambda.min')
# Estimate Elastic Net
cvm = cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=10,alpha=0.5)
p13 = predict(cvm,newx = XT,s='lambda.min')
# No predictions and make sure that negative prices are winsorized

# Complete Subset Linear Regression
# csr.method()
# q - is the complexity parameter, cv - is the size of the cross-validation samples, trim - trimmed average, nc - number of cores to use
# Let's try it out. However, if you type 'wrong' number into 'q' the number of models you have to estimates will be too large and we do not have time for this.
# Let's try q=3 (if you have 4 cores) and q=4 if you have 4+ physical cores.
spec <- gen.fm(dep='price',x=features)
A = Sys.time()
p = csr.method(spec,train=ocr,test=oct,q=5,cv=10,trim=0.2,nc=8)
Sys.time()-A
# Highly correlated predictions
cor(p)
p14 = p[,1]
p15 = p[,2]
p16 = p[,3]

# We combine forecasts & apply the sanity filter
pred <- cbind(true=oct$price,p1,p2,p3,p6,p9,p11,p13,p14,p15,p16)
for (i in 2:ncol(pred)) pred[pred[,i]<0,i] = min(ocr$price)

mse = pred[,-1]
mae = mse
for (i in 2:ncol(pred)) {
  mse[,i-1] =    (pred[,1] - pred[,i])^2
  mae[,i-1] = abs(pred[,1] - pred[,i])
}
apply(mse,2,mean)
apply(mae,2,mean)
# Does not seem to work very well here.

# Decision trees
# Can we do any better with a decision tree? Let's see
# Estimate
t1 = rpart(price~km+trans,data=ocr,method='anova',model=TRUE,
           control=rpart.control(cp=0,maxdepth=3))
# Visualize
rpart.plot(t1,type=0)
rpart.plot(t1,type=1)
# Predict
p17  = predict(t1,new=oct)
pred = cbind(pred, p17)
# Evaluate (using slightly different way as before - but still mse or mae)
apply(   (pred[,-1] - pred[,1])^2,2,mean)
apply(abs(pred[,-1] - pred[,1])  ,2,mean)
# Not very accurate...

# Advanced Task
# - test your skils (use ChatGPT if you dare)
# Create a figure - similar to that on slide 9 of the Lecture 5 (Segmentation of the feature space)

# Let's try a tree with more variables - make it a shallow tree
# We will not use interactions
features <- c('km','age','kw','trans','combi','allw','ambi','styl','rs','dsg','scr','diesel','lpg','hybrid','cng')
spec <- gen.fm(dep='price',x=features)
t2 = rpart(spec, 
           data=ocr,method='anova',model=TRUE,
           control=rpart.control(cp=0.0001,maxdepth=3))
rpart.plot(t2,type=0)
p18 = predict(t2,new=oct)
pred = cbind(pred, p18)
apply(pred[,-1],2,function(x,y) mean((x-y)^2), y=pred[,1])
apply(pred[,-1],2,function(x,y) mean(abs(x-y)), y=pred[,1])
# Better but still not great

# Let's try a deeper tree
t3 = rpart(spec, 
           data=ocr,method='anova',model=TRUE,
           control=rpart.control(cp=0.0001,maxdepth=9))
# Visualization get's a little bit tricky - it's not necessary
rpart.plot(t3,type=0)
p19  = predict(t3,new=oct)
pred = cbind(pred, p19)
apply(pred[,-1],2,function(x,y) mean((x-y)^2), y=pred[,1])
apply(pred[,-1],2,function(x,y) mean(abs(x-y)), y=pred[,1])
# This is so much better -> closing on the best

# Can we improve via pre-pruning?
?rpart.control
t4 = rpart(spec, 
           data=ocr,method='anova',model=TRUE,
           control=rpart.control(cp=0.0001,minsplit=25,minbucket=12,maxdepth=9))
p20  = predict(t4,new=oct)
pred = cbind(pred, p20)
apply(pred[,-1],2,function(x,y) mean((x-y)^2), y=pred[,1])
apply(pred[,-1],2,function(x,y) mean(abs(x-y)), y=pred[,1])

# Can we improve via penalization?
t5 = rpart(spec, 
           data=ocr,method='anova',model=TRUE,
           control=rpart.control(cp=0,xval=10,minsplit=c(25),minbucket=12,maxdepth=9))
plotcp(t5)
# Which is the optimal shrinkage parameter
cv.tbl = printcp(t5)
head(cv.tbl)
opt.cp = cv.tbl[which(cv.tbl[,4] == min(cv.tbl[,4])),1]
# You can force him to estimate the model with optimal shrinkage
t5 = rpart(spec, 
           data=ocr,method='anova',model=TRUE,
           control=rpart.control(cp=opt.cp,minsplit=c(25),minbucket=12,maxdepth=9))
p21  = predict(t5,new=oct)
pred = cbind(pred, p21)
apply(pred[,-1],2,function(x,y) mean((x-y)^2), y=pred[,1])
apply(pred[,-1],2,function(x,y) mean(abs(x-y)), y=pred[,1])

# 7) Bagging
# Recall - it involves bootstrapping!
# Number of bootstrap samples
B = 1000
# Number of observations
NT = dim(ocr)[1]
# I need a place (matrix/dataset) to store prediction
bag.fore = matrix(NA,nrow=dim(oct)[1],ncol=B)
rownames(bag.fore) = rownames(oct)
for (b in 1:B) {
  if (b %in% seq(0,B,100)) print(b)
  # Randomly select (with replacement) some row numbers
  idx = sample(1:NT,NT,replace=T)
  # Create the bootstrap sample
  auta.train.b = ocr[idx ,]
  # Estimate the model
  bt = rpart(spec, 
             data=auta.train.b,method='anova',model=TRUE,
             control=rpart.control(cp=0,xval=10,minsplit=c(25),minbucket=12,maxdepth=9))
  # Prediction on a testing sample
  bag.fore[,b] = predict(bt,new=oct)
}
# For every single observations in the testing dataset you have B predictions
hist(bag.fore[1,],breaks=50) # nice
# This way you could estimate 'confidence in your prediction'
# For example, for the first car, the 95% confidence would be
quantile(bag.fore[1,],p=c(0.025,0.975))
# Now evaluate
p22  = apply(bag.fore,1,mean,na.rm=T)
pred = cbind(pred, p22)
apply(pred[,-1],2,function(x,y) mean((x-y)^2), y=pred[,1])
apply(pred[,-1],2,function(x,y) mean(abs(x-y)), y=pred[,1])
# https://tenor.com/bnHxa.gif

# Random forest - can it get any better?
# Let's try to decorrelate trees.
# Number of trees
B = 5000
# Depth of the trees
depth = c(3,6,9,12)
# Number of depth parameters
ND = length(depth)
# Number of random 'picks' of features to consider in each split
mtry = c(3,6,9)
# Number of mtry parameters
NR = length(mtry)
# Number of cross-validations
cv = 10
# For each cross-validation sample I need to store predictions
# MSE for CV
rf.cv = array(NA,dim=c(NR,ND,cv))
dimnames(rf.cv)[[1]] = paste('Try',mtry,'features')
dimnames(rf.cv)[[2]] = paste('Depth',depth)
dimnames(rf.cv)[[3]] = paste('CV sample',1:cv)
# I need to find the average values for each cross-validation
rf.cv.ave = array(NA,dim=c(NR,ND))
dimnames(rf.cv.ave)[[1]] = paste('Try',mtry,'features')
dimnames(rf.cv.ave)[[2]] = paste('Depth',depth)
# Now we loop over different parameters & cross-validation samples
# Number of mtry
for (m in 1:NR) {
  num.try = mtry[m]
  # Depth
  for (d in 1:ND) {
    num.depth = depth[d]
    # Now cross-validation      
    for (r in 1:cv) {
      # Select data
      idx = c(((r-1)*(NT/10)+1):(r*(NT/10)))
      auta.train.cvin = ocr[-idx,]
      auta.train.cout = ocr[+idx,]
      
      # Estimate the model
      rf.tree = ranger(spec,
                       data=auta.train.cvin,
                       num.trees=B,mtry=num.try,min.node.size=5,max.depth=num.depth)  
      pred.rf.cv = predict(rf.tree,data=auta.train.cout)$predictions
      rf.cv[m,d,r] = mean((pred.rf.cv - auta.train.cout$price)^2)
    }
    # Average
    rf.cv.ave[m,d] = mean(rf.cv[m,d,])
  }
}
# We estimated random forest(S) under different hyper-parameters
# Now which rando forest has the lowest forecast error via cross-validation?
which(rf.cv.ave == min(rf.cv.ave), arr.ind=TRUE)
mtry.opt  = mtry[3]
depth.opt = depth[4]
rf = ranger(spec,data=ocr,num.trees=B,mtry=mtry.opt,min.node.size=5,max.depth=depth.opt)
# Now evaluate
p23  = predict(rf,data=oct)$predictions
pred = cbind(pred, p23)
apply(pred[,-1],2,function(x,y) mean((x-y)^2), y=pred[,1])
apply(pred[,-1],2,function(x,y) mean(abs(x-y)), y=pred[,1])
# https://tenor.com/bSFw3.gif 

# What about boosting?
# We will use a package gbm()
mod.gbm = gbm(spec,data=ocr,distribution='gaussian',n.trees=10000,
              interaction.depth=depth.opt,shrinkage=0.001,bag.fraction=1)
# Now evaluate
p24  = predict(mod.gbm,new=oct)
pred = cbind(pred,p24)
apply(pred[,-1],2,function(x,y) mean((x-y)^2), y=pred[,1])
apply(pred[,-1],2,function(x,y) mean(abs(x-y)), y=pred[,1])

# Final evaluation
# Now we find the Mean Square Error and Mean Absolute Error:
mse = pred[,-1]
mae = mse
for (i in 2:ncol(pred)) {
  mse[,i-1] =    (pred[,1] - pred[,i])^2
  mae[,i-1] = abs(pred[,1] - pred[,i])
}
apply(mse,2,mean)
apply(mae,2,mean)

# Which model is the best one?
# For MCS we need only models that have not the same predictions (correlation 1.0)
MCSprocedure(mse)
MCSprocedure(mae)

# Competition  
# Task
# Can you improve upon the apartments dataset's predictions?
apartments <- read.csv(file='C:\\Users\\ckt\\OneDrive - MUNI\\Institutions\\MU\\ML Finance\\2024\\datasets\\aprt1.csv')
set.seed(99)
N     <- dim(apartments)[1]
idx   <- sample(1:N,size=floor(0.7*N),replace=FALSE)
train <- apartments[idx ,]
test  <- apartments[-idx,]
head(train)