#--- #title: "Seminar5RM" #author: "Stefan Lyocsa" #date: "2023-23-10" #output: html_document #--- # Content # * Import data, load packages and estimate previous models: LM1, LM2, LM3, BE, LASSO, RIDGE, EN # * Complete Subset Linear Regression # * Simple decision tree # + Estimate # + Visualize # + Predict # + Evaluate evaluation # * More variables - shallow tree # * More variables - deep tree # * More variables - deep tree -> pre-pruning # * More variables - deep tree -> penalization # * Bagging # * Random forest # * Boosted tree # * Task # Import data, load packages and estimate models from previous session # First load training and testing datasets into the workspace oct <- read.csv(file='C://Users//ckt//OneDrive - MUNI//Institutions//MU//ML Finance//2024//oct.csv') ocr <- read.csv(file='C://Users//ckt//OneDrive - MUNI//Institutions//MU//ML Finance//2024//ocr.csv') # Add some features that we engineered -> for training sample ocr$km2 <- ocr$km^2 ocr$km3 <- ocr$km^3 ocr$age2 <- ocr$age^2 ocr$age3 <- ocr$age^3 ocr$kmage <- ocr$km * ocr$age ocr$kmkw <- ocr$km * ocr$kw ocr$kmkwage <- ocr$km * ocr$kw * ocr$age # For testing sample oct$km2 <- oct$km^2 oct$km3 <- oct$km^3 oct$age2 <- oct$age^2 oct$age3 <- oct$age^3 oct$kmage <- oct$km * oct$age oct$kmkw <- oct$km * oct$kw oct$kmkwage <- oct$km * oct$kw * oct$age library(glmnet) # LASSO, RIDGE, EN library(rpart) # Decision trees library(tree) # Creating trees library(rpart.plot) # Having some plots of trees library(ranger) # Random forest library(gbm) # Boosted regression tree library(foreach) # Parallel look library(doParallel) # Parallel computing library(MCS) # Model confidence set # Before working with backward, forward and bi-directional elimination we estimate some baseline models that we want to out-perform. m1 <- lm(price ~ km , data=ocr) m2 <- lm(price ~ age , data=ocr) m3 <- lm(price ~ km + age, data=ocr) p1 <- predict(m1,new=oct) p2 <- predict(m2,new=oct) p3 <- predict(m3,new=oct) # And we can create model 6 and corresponding predictions features <- c('km','age','kw','trans','combi','allw','ambi','styl','rs','dsg','scr','diesel','lpg','hybrid','cng','km2','age2','kmage','kmkw','km3','age3','kmkwage') m6 <- bwelim(dt=ocr,pt=0.05,dep='price',features=features) p6 <- predict(m6,new=oct) # We will work with octavia dataset features <- c('km','age','kw','trans','combi','allw','ambi','styl','rs','dsg','scr','diesel','lpg','hybrid','cng','km2','age2','kmage','kmkw','km3','age3','kmkwage') X = as.matrix(ocr[,features]) Y = as.matrix(ocr[,'price']) XT = as.matrix(oct[,features]) # Estimate the LASSO cvm = cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=10,alpha=1) p9 = predict(cvm,newx = XT,s='lambda.min') # Estimate the Ridge cvm = cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=10,alpha=0) p11 = predict(cvm,newx = XT,s='lambda.min') # Estimate Elastic Net cvm = cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=10,alpha=0.5) p13 = predict(cvm,newx = XT,s='lambda.min') # No predictions and make sure that negative prices are winsorized # Complete Subset Linear Regression # csr.method() # q - is the complexity parameter, cv - is the size of the cross-validation samples, trim - trimmed average, nc - number of cores to use # Let's try it out. However, if you type 'wrong' number into 'q' the number of models you have to estimates will be too large and we do not have time for this. # Let's try q=3 (if you have 4 cores) and q=4 if you have 4+ physical cores. spec <- gen.fm(dep='price',x=features) A = Sys.time() p = csr.method(spec,train=ocr,test=oct,q=5,cv=10,trim=0.2,nc=8) Sys.time()-A # Highly correlated predictions cor(p) p14 = p[,1] p15 = p[,2] p16 = p[,3] # We combine forecasts & apply the sanity filter pred <- cbind(true=oct$price,p1,p2,p3,p6,p9,p11,p13,p14,p15,p16) for (i in 2:ncol(pred)) pred[pred[,i]<0,i] = min(ocr$price) mse = pred[,-1] mae = mse for (i in 2:ncol(pred)) { mse[,i-1] = (pred[,1] - pred[,i])^2 mae[,i-1] = abs(pred[,1] - pred[,i]) } apply(mse,2,mean) apply(mae,2,mean) # Does not seem to work very well here. # Decision trees # Can we do any better with a decision tree? Let's see # Estimate t1 = rpart(price~km+trans,data=ocr,method='anova',model=TRUE, control=rpart.control(cp=0,maxdepth=3)) # Visualize rpart.plot(t1,type=0) rpart.plot(t1,type=1) # Predict p17 = predict(t1,new=oct) pred = cbind(pred, p17) # Evaluate (using slightly different way as before - but still mse or mae) apply( (pred[,-1] - pred[,1])^2,2,mean) apply(abs(pred[,-1] - pred[,1]) ,2,mean) # Not very accurate... # Advanced Task # - test your skils (use ChatGPT if you dare) # Create a figure - similar to that on slide 9 of the Lecture 5 (Segmentation of the feature space) # Let's try a tree with more variables - make it a shallow tree # We will not use interactions features <- c('km','age','kw','trans','combi','allw','ambi','styl','rs','dsg','scr','diesel','lpg','hybrid','cng') spec <- gen.fm(dep='price',x=features) t2 = rpart(spec, data=ocr,method='anova',model=TRUE, control=rpart.control(cp=0.0001,maxdepth=3)) rpart.plot(t2,type=0) p18 = predict(t2,new=oct) pred = cbind(pred, p18) apply(pred[,-1],2,function(x,y) mean((x-y)^2), y=pred[,1]) apply(pred[,-1],2,function(x,y) mean(abs(x-y)), y=pred[,1]) # Better but still not great # Let's try a deeper tree t3 = rpart(spec, data=ocr,method='anova',model=TRUE, control=rpart.control(cp=0.0001,maxdepth=9)) # Visualization get's a little bit tricky - it's not necessary rpart.plot(t3,type=0) p19 = predict(t3,new=oct) pred = cbind(pred, p19) apply(pred[,-1],2,function(x,y) mean((x-y)^2), y=pred[,1]) apply(pred[,-1],2,function(x,y) mean(abs(x-y)), y=pred[,1]) # This is so much better -> closing on the best # Can we improve via pre-pruning? ?rpart.control t4 = rpart(spec, data=ocr,method='anova',model=TRUE, control=rpart.control(cp=0.0001,minsplit=25,minbucket=12,maxdepth=9)) p20 = predict(t4,new=oct) pred = cbind(pred, p20) apply(pred[,-1],2,function(x,y) mean((x-y)^2), y=pred[,1]) apply(pred[,-1],2,function(x,y) mean(abs(x-y)), y=pred[,1]) # Can we improve via penalization? t5 = rpart(spec, data=ocr,method='anova',model=TRUE, control=rpart.control(cp=0,xval=10,minsplit=c(25),minbucket=12,maxdepth=9)) plotcp(t5) # Which is the optimal shrinkage parameter cv.tbl = printcp(t5) head(cv.tbl) opt.cp = cv.tbl[which(cv.tbl[,4] == min(cv.tbl[,4])),1] # You can force him to estimate the model with optimal shrinkage t5 = rpart(spec, data=ocr,method='anova',model=TRUE, control=rpart.control(cp=opt.cp,minsplit=c(25),minbucket=12,maxdepth=9)) p21 = predict(t5,new=oct) pred = cbind(pred, p21) apply(pred[,-1],2,function(x,y) mean((x-y)^2), y=pred[,1]) apply(pred[,-1],2,function(x,y) mean(abs(x-y)), y=pred[,1]) # 7) Bagging # Recall - it involves bootstrapping! # Number of bootstrap samples B = 1000 # Number of observations NT = dim(ocr)[1] # I need a place (matrix/dataset) to store prediction bag.fore = matrix(NA,nrow=dim(oct)[1],ncol=B) rownames(bag.fore) = rownames(oct) for (b in 1:B) { if (b %in% seq(0,B,100)) print(b) # Randomly select (with replacement) some row numbers idx = sample(1:NT,NT,replace=T) # Create the bootstrap sample auta.train.b = ocr[idx ,] # Estimate the model bt = rpart(spec, data=auta.train.b,method='anova',model=TRUE, control=rpart.control(cp=0,xval=10,minsplit=c(25),minbucket=12,maxdepth=9)) # Prediction on a testing sample bag.fore[,b] = predict(bt,new=oct) } # For every single observations in the testing dataset you have B predictions hist(bag.fore[1,],breaks=50) # nice # This way you could estimate 'confidence in your prediction' # For example, for the first car, the 95% confidence would be quantile(bag.fore[1,],p=c(0.025,0.975)) # Now evaluate p22 = apply(bag.fore,1,mean,na.rm=T) pred = cbind(pred, p22) apply(pred[,-1],2,function(x,y) mean((x-y)^2), y=pred[,1]) apply(pred[,-1],2,function(x,y) mean(abs(x-y)), y=pred[,1]) # https://tenor.com/bnHxa.gif # Random forest - can it get any better? # Let's try to decorrelate trees. # Number of trees B = 5000 # Depth of the trees depth = c(3,6,9,12) # Number of depth parameters ND = length(depth) # Number of random 'picks' of features to consider in each split mtry = c(3,6,9) # Number of mtry parameters NR = length(mtry) # Number of cross-validations cv = 10 # For each cross-validation sample I need to store predictions # MSE for CV rf.cv = array(NA,dim=c(NR,ND,cv)) dimnames(rf.cv)[[1]] = paste('Try',mtry,'features') dimnames(rf.cv)[[2]] = paste('Depth',depth) dimnames(rf.cv)[[3]] = paste('CV sample',1:cv) # I need to find the average values for each cross-validation rf.cv.ave = array(NA,dim=c(NR,ND)) dimnames(rf.cv.ave)[[1]] = paste('Try',mtry,'features') dimnames(rf.cv.ave)[[2]] = paste('Depth',depth) # Now we loop over different parameters & cross-validation samples # Number of mtry for (m in 1:NR) { num.try = mtry[m] # Depth for (d in 1:ND) { num.depth = depth[d] # Now cross-validation for (r in 1:cv) { # Select data idx = c(((r-1)*(NT/10)+1):(r*(NT/10))) auta.train.cvin = ocr[-idx,] auta.train.cout = ocr[+idx,] # Estimate the model rf.tree = ranger(spec, data=auta.train.cvin, num.trees=B,mtry=num.try,min.node.size=5,max.depth=num.depth) pred.rf.cv = predict(rf.tree,data=auta.train.cout)$predictions rf.cv[m,d,r] = mean((pred.rf.cv - auta.train.cout$price)^2) } # Average rf.cv.ave[m,d] = mean(rf.cv[m,d,]) } } # We estimated random forest(S) under different hyper-parameters # Now which rando forest has the lowest forecast error via cross-validation? which(rf.cv.ave == min(rf.cv.ave), arr.ind=TRUE) mtry.opt = mtry[3] depth.opt = depth[4] rf = ranger(spec,data=ocr,num.trees=B,mtry=mtry.opt,min.node.size=5,max.depth=depth.opt) # Now evaluate p23 = predict(rf,data=oct)$predictions pred = cbind(pred, p23) apply(pred[,-1],2,function(x,y) mean((x-y)^2), y=pred[,1]) apply(pred[,-1],2,function(x,y) mean(abs(x-y)), y=pred[,1]) # https://tenor.com/bSFw3.gif # What about boosting? # We will use a package gbm() mod.gbm = gbm(spec,data=ocr,distribution='gaussian',n.trees=10000, interaction.depth=depth.opt,shrinkage=0.001,bag.fraction=1) # Now evaluate p24 = predict(mod.gbm,new=oct) pred = cbind(pred,p24) apply(pred[,-1],2,function(x,y) mean((x-y)^2), y=pred[,1]) apply(pred[,-1],2,function(x,y) mean(abs(x-y)), y=pred[,1]) # Final evaluation # Now we find the Mean Square Error and Mean Absolute Error: mse = pred[,-1] mae = mse for (i in 2:ncol(pred)) { mse[,i-1] = (pred[,1] - pred[,i])^2 mae[,i-1] = abs(pred[,1] - pred[,i]) } apply(mse,2,mean) apply(mae,2,mean) # Which model is the best one? # For MCS we need only models that have not the same predictions (correlation 1.0) MCSprocedure(mse) MCSprocedure(mae) # Competition # Task # Can you improve upon the apartments dataset's predictions? apartments <- read.csv(file='C:\\Users\\ckt\\OneDrive - MUNI\\Institutions\\MU\\ML Finance\\2024\\datasets\\aprt1.csv') set.seed(99) N <- dim(apartments)[1] idx <- sample(1:N,size=floor(0.7*N),replace=FALSE) train <- apartments[idx ,] test <- apartments[-idx,] head(train)