# We will create two functions. The first takes the name of the response (target)
# variable and the names of the features and returns an as.formula() object.
# The second runs the selection algorithm itself.

gen.fm = function(dep='price', x=features) {
  # Create the 'dep~1' string - the y = beta_0 (intercept-only) model.
  # paste() is a useful function: it concatenates strings.
  spec = paste(dep, '~1', sep='')
  # If there are no features - the vector 'x' has no elements - return the intercept-only formula
  if (is.null(x)) return(as.formula(spec))
  # Number of features
  NV = length(x)
  # Loop over all features - we add one feature at a time
  for (v in 1:NV) spec = paste(spec, '+', x[v], sep='')
  return(as.formula(spec))
}

# Backward elimination: start from the full model and repeatedly drop the least
# significant feature (heteroskedasticity-robust p-value above the threshold 'pt').
bwelim <- function(dt=ocr, pt=0.1, dep='price', features=features) {
  library(sandwich)
  library(lmtest)
  pval <- 1
  # We start from the initial full model
  spec  <- gen.fm(dep=dep, x=features)
  model <- lm(spec, data=dt)
  while (pval > pt) {
    # Coefficient tests with HC1 robust standard errors
    estim <- coeftest(model, vcov=vcovHC(model, type='HC1'))
    # Which features (intercept excluded) have a p-value above the threshold?
    idx <- which(estim[-1,4] > pt)
    if (length(idx) > 0) {
      # Remove the feature with the highest p-value and re-estimate
      to.remove <- rownames(estim)[which(estim[-1,4] == max(estim[-1,4]))[1] + 1]
      features  <- features[-which(features == to.remove)]
      new_spec  <- gen.fm(dep=dep, x=features)
      model     <- lm(new_spec, data=dt)
    } else {
      # All remaining features are significant - force the loop to stop
      pval <- pt - 0.01
    }
    # print(features)
  }
  return(model)
}

# Forward selection: start from the intercept-only model and add the 'best'
# remaining feature one at a time.
fwelim <- function(dt=ocr, pt=0.1, dep='price', features=features) {
  library(sandwich)
  library(lmtest)
  # Number of features
  NV <- length(features)
  # We start from the baseline model with only a constant
  spec <- gen.fm(dep=dep, x=NULL)
  # Estimate the baseline model
  m <- lm(spec, data=dt)
  # Here we will store variables that 'pass' the test
  good.features <- c()
  # We loop over all variables - initially (v=1) we have NV candidates to consider.
  for (v in 1:NV) {
    # Remaining number of features
    NF <- length(features)
    # Store p-values
    mat.pvals <- matrix(NA, nrow=NF, ncol=1)
    # Try to add the remaining features one at a time
    for (f in 1:NF) {
      # Generate specification
      spec <- gen.fm(dep=dep, x=c(good.features, features[f]))
      # Estimate model
      model <- lm(spec, data=dt)
      # Robust coefficient tests
      estim <- coeftest(model, vcov=vcovHC(model, type='HC1'))
      # Store the p-value of the newly added feature (last row)
      mat.pvals[f,1] <- estim[dim(estim)[1],4]
    }
    # If no candidate has a p-value below the threshold, the algorithm stops
    if (length(which(mat.pvals < pt)) == 0) break
    # Which variable has the lowest p-value?
    add.var <- features[which(mat.pvals == min(mat.pvals))][1]
    # Add this variable to the set of good features
    good.features <- c(good.features, add.var)
    # Remove this variable from the set of remaining features
    features <- features[-which(features == add.var)]
    # Repeat with the remaining features until either all candidates have a
    # p-value above the threshold or no candidates are left.
  }
  spec <- gen.fm(dep=dep, x=good.features)
  return(lm(spec, data=dt))
}
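# --- Illustrative usage (a minimal sketch, not part of the original workflow) ---
# The defaults above refer to objects defined elsewhere in the script: 'ocr'
# (training data), 'oct' (test data) and 'features' (a character vector of
# predictor names). The simulated objects below ('sim', 'feats', N) are
# hypothetical stand-ins used only to show how the calls look.
# set.seed(99)
# N   <- 200
# sim <- as.data.frame(matrix(rnorm(N*10), ncol=10, dimnames=list(NULL, paste0('x', 1:10))))
# sim$price <- 1 + 0.5*sim$x1 - 0.8*sim$x2 + 0.3*sim$x3 + rnorm(N)
# feats <- paste0('x', 1:10)
# gen.fm(dep='price', x=feats)                          # price ~ 1 + x1 + ... + x10
# summary(bwelim(dt=sim, pt=0.1, dep='price', features=feats))
# summary(fwelim(dt=sim, pt=0.1, dep='price', features=feats))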
# Bidirectional (stepwise) selection: a forward step that adds the best
# candidate, followed by a backward elimination of the selected set.
bielim <- function(dt=ocr, pt=0.1, dep='price', features=features) {
  library(sandwich)
  library(lmtest)
  # Number of features
  NV <- length(features)
  # We start from the baseline model with only a constant
  spec <- gen.fm(dep=dep, x=NULL)
  # Estimate the baseline model
  m <- lm(spec, data=dt)
  # Here we will store variables that 'pass' the test
  good.features <- c()
  # Here we will store variables that still need to be tested
  test.features <- features
  # Add the 'best' remaining variable one at a time; the loop ends when no
  # remaining candidate is significant at the threshold 'pt'.
  pvalue <- 1
  while (pvalue > pt) {
    # Remaining number of candidate features
    NF <- length(test.features)
    if (NF == 0) break
    # Forward step first
    # Store p-values
    mat.pvals <- matrix(NA, nrow=NF, ncol=1)
    rownames(mat.pvals) <- test.features
    # Try to add the remaining features one at a time
    for (f in 1:NF) {
      # Generate specification
      spec <- gen.fm(dep=dep, x=c(good.features, test.features[f]))
      # Estimate model
      model <- lm(spec, data=dt)
      # Robust coefficient tests
      estim <- coeftest(model, vcov=vcovHC(model, type='HC1'))
      # Store the p-value of the newly added feature (last row)
      mat.pvals[f,1] <- estim[dim(estim)[1],4]
    }
    # If no candidate has a p-value below the threshold, the algorithm stops
    if (length(which(mat.pvals < pt)) == 0) break
    # Which variable has the lowest p-value?
    add.var <- rownames(mat.pvals)[which(mat.pvals == min(mat.pvals))][1]
    # Add this variable to the set of good features
    good.features <- c(good.features, add.var)
    # Remove it from the set of candidates
    test.features <- setdiff(features, good.features)
    # Backward step second: re-check the selected set and drop anything that
    # became insignificant
    tmp <- bwelim(dt=dt, pt=pt, dep=dep, features=good.features)
    good.features <- names(tmp$coefficients[-1])
    test.features <- setdiff(features, good.features)
  }
  spec <- gen.fm(dep=dep, x=good.features)
  return(lm(spec, data=dt))
}

# Complete-subset linear regression: estimate every q-variable model, rank them
# by cross-validated MSE and combine the forecasts of the best 1 / 10 / 100 models.
csr.method <- function(spec, train=ocr, test=oct, q=3, cv=10, trim=0.2, nc=14) {
  library(foreach)
  library(doParallel)
  # Overall number of observations
  TT = dim(train)[1]
  # Number of variables (response excluded)
  NV = length(all.vars(spec)) - 1
  # Check the complexity parameter
  if (q < 1 | q >= TT) return('Complexity parameter needs to be greater than 0 and less than the number of observations')
  # All q-variable subsets - each column holds the variable indices of one model
  models = combn(NV, q)
  # Number of models
  NM = dim(models)[2]
  # Number of training observations
  NT = dim(train)[1]
  # We need:
  # 1) a place to store cross-validated forecast errors - used to select the best models
  # 2) a parallel backend to estimate the (possibly many) models
  # -> 1) Store MSE
  grid.mse = array(NA, dim=c(NM, cv))
  dimnames(grid.mse)[[1]] = paste('Model_', 1:NM, sep='')
  dimnames(grid.mse)[[2]] = paste('cv_', 1:cv, sep='')
  # -> 2) Initialize the multi-core backend
  cl <- makeCluster(nc)
  registerDoParallel(cl)
  # We loop over the different models
  TMP = foreach (m = 1:NM) %dopar% {
    mse.cv = c()
    for (v in 1:cv) {
      # Select validation sample
      valid.idx = seq(from=v, to=NT, by=cv)
      valid     = train[valid.idx,]
      # Select sub-training dataset
      train_cv  = train[-valid.idx,]
      # Estimate & predict the model: lm() on a data frame regresses the first
      # column on the remaining ones
      mDT = train_cv[, c(all.vars(spec)[1], all.vars(spec)[models[,m]+1])]
      # Mean squared error on the validation fold
      mse.cv[v] <- mean((predict(lm(mDT), newdata=valid) - valid[, all.vars(spec)[1]])^2)
    }
    return(mse.cv)
  }
  stopCluster(cl)
  for (m in 1:NM) grid.mse[m,] = TMP[[m]]
  # Average forecast error across cross-validation samples
  mean.mse = apply(grid.mse, 1, mean)
  # Which ONE model led to the best forecast?
  top.model = as.numeric(which(mean.mse == min(mean.mse)))[1]
  mDT = train[, c(all.vars(spec)[1], all.vars(spec)[models[,top.model]+1])]
  pred.top.1 <- predict(lm(mDT), newdata=test)
  # Which 10 models led to the best forecasts?
  top.10 = as.numeric(which(rank(mean.mse, ties.method='first') <= 10))
  pred = c()
  for (m in 1:10) {
    mDT = train[, c(all.vars(spec)[1], all.vars(spec)[models[,top.10[m]]+1])]
    pred <- cbind(pred, predict(lm(mDT), newdata=test))
  }
  pred.top.10 <- apply(pred, 1, mean)
  # Which 100 models led to the best forecasts?
  top.100 = as.numeric(which(rank(mean.mse, ties.method='first') <= 100))
  pred = c()
  for (m in 1:100) {
    mDT = train[, c(all.vars(spec)[1], all.vars(spec)[models[,top.100[m]]+1])]
    pred <- cbind(pred, predict(lm(mDT), newdata=test))
  }
  # Trimmed mean of the 100 best models' forecasts
  pred.top.100 <- apply(pred, 1, mean, trim=trim)
  # Combine into one output
  results = cbind(pred.top.1, pred.top.10, pred.top.100)
  return(results)
}
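# --- Illustrative usage (a minimal sketch; 'sim' and 'feats' are the hypothetical
# --- simulated objects from the example above) ---
# Stepwise (forward + backward) selection:
# summary(bielim(dt=sim, pt=0.1, dep='price', features=feats))
# Complete-subset regression needs a formula, a test set and a cluster size that
# does not exceed the available cores; with 10 predictors and q=3 there are
# choose(10,3) = 120 candidate models, so the top-10 / top-100 combinations exist.
# sim.test <- as.data.frame(matrix(rnorm(50*10), ncol=10, dimnames=list(NULL, paste0('x', 1:10))))
# sim.test$price <- 1 + 0.5*sim.test$x1 - 0.8*sim.test$x2 + 0.3*sim.test$x3 + rnorm(50)
# spec   <- gen.fm(dep='price', x=feats)
# csr.fc <- csr.method(spec, train=sim, test=sim.test, q=3, cv=5, trim=0.2, nc=2)
# head(csr.fc)   # columns: best single model, mean of top 10, trimmed mean of top 100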
# Tree-based (random forest) forecast - hyper-parameters tuned by cross-validation
opt.rf <- function(spec=specs, DT=ocr, DE=oct, nc=20, cv=20,
                   minsplit=c(10,20), minbucket=c(5,10,15), maxdepth=c(0,3,5,8),
                   mtry=c(2,4,6,8), ntree=c(100,250,500,1000)) {
  library(foreach)
  library(doParallel)
  library(ranger)
  # Overall number of observations
  TT = dim(DT)[1]
  # No. of minimum-split values to consider
  NMS = length(minsplit)
  # No. of minimum bucket sizes
  NBS = length(minbucket)
  # No. of maximum depths
  NDH = length(maxdepth)
  # No. of tree counts
  NTE = length(ntree)
  # No. of variables tried at each split
  NTR = length(mtry)
  # Number of hyper-parameter combinations
  NM = NMS*NBS*NDH*NTE*NTR
  # Initialize the multi-core backend - no more workers than cv folds are needed
  cl <- makeCluster(min(nc, cv))
  registerDoParallel(cl)
  # RANDOM FOREST - loop over cross-validation folds
  TMP = foreach(r = 1:cv, .packages=c('ranger')) %dopar% {
    idx = sample(seq(from=r, to=TT, by=cv))
    test.cv  = DT[idx,]
    train.cv = DT[-idx,]
    # Store the validation loss for every hyper-parameter combination in this fold
    tmp.rfr = array(NA, dim=c(NMS,NBS,NDH,NTE,NTR))
    dimnames(tmp.rfr)[[1]] = paste('split_', minsplit, sep='')
    dimnames(tmp.rfr)[[2]] = paste('bucket_', minbucket, sep='')
    dimnames(tmp.rfr)[[3]] = paste('depth_', maxdepth, sep='')
    dimnames(tmp.rfr)[[4]] = paste('trees_', ntree, sep='')
    dimnames(tmp.rfr)[[5]] = paste('mtry_', mtry, sep='')
    for (d in 1:NMS) {
      for (q in 1:NBS) {
        for (w in 1:NDH) {
          for (s in 1:NTE) {
            for (b in 1:NTR) {
              # Random forest with the given hyper-parameters
              mDTB = ranger(formula=spec, data=train.cv, num.threads=1,
                            num.trees=ntree[s],
                            mtry=mtry[b],
                            min.node.size=minsplit[d],
                            max.depth=maxdepth[w],
                            min.bucket=minbucket[q])
              # Validation MSE
              tmp.rfr[d,q,w,s,b] = mean(c(predict(mDTB, data=test.cv)$predictions - test.cv[, all.vars(spec)[1]])^2)
            }
          }
        }
      }
    }
    return(tmp.rfr)
  }
  stopCluster(cl)
  # Collect the per-fold losses
  tmp.rfr = array(NA, dim=c(NMS,NBS,NDH,NTE,NTR,cv))
  dimnames(tmp.rfr)[[1]] = paste('split_', minsplit, sep='')
  dimnames(tmp.rfr)[[2]] = paste('bucket_', minbucket, sep='')
  dimnames(tmp.rfr)[[3]] = paste('depth_', maxdepth, sep='')
  dimnames(tmp.rfr)[[4]] = paste('trees_', ntree, sep='')
  dimnames(tmp.rfr)[[5]] = paste('mtry_', mtry, sep='')
  dimnames(tmp.rfr)[[6]] = 1:cv
  for (r in 1:cv) tmp.rfr[,,,,,r] <- TMP[[r]]
  # Average across cv folds & find the optimum parameters (first one in case of ties)
  ave.rfr = apply(tmp.rfr, 1:5, mean)
  opt.idx = which(ave.rfr == min(ave.rfr), arr.ind=TRUE)[1,]
  opt.split  = minsplit[opt.idx[1]]
  opt.bucket = minbucket[opt.idx[2]]
  opt.depth  = maxdepth[opt.idx[3]]
  opt.trees  = ntree[opt.idx[4]]
  opt.mtry   = mtry[opt.idx[5]]
  # Estimate the RF on the full training sample with the optimum parameters
  opt.rf = ranger(formula=spec, data=DT,
                  num.trees=opt.trees,
                  mtry=opt.mtry,
                  min.node.size=opt.split,
                  max.depth=opt.depth,
                  min.bucket=opt.bucket)
  results <- list()
  results[['predictions']] = predict(opt.rf, data=DE)$predictions
  results[['opt.par']] = list(opt.trees=opt.trees, opt.mtry=opt.mtry, opt.split=opt.split, opt.depth=opt.depth, opt.bucket=opt.bucket)
  return(results)
}
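# --- Illustrative usage (a minimal sketch; 'sim', 'sim.test' and 'spec' are the
# --- hypothetical simulated objects from the earlier examples) ---
# Keep the tuning grids small so the cross-validation finishes quickly.
# rf.out <- opt.rf(spec=spec, DT=sim, DE=sim.test, nc=2, cv=5,
#                  minsplit=c(10,20), minbucket=c(5,10), maxdepth=c(0,5),
#                  mtry=c(2,4), ntree=c(100,250))
# rf.out$opt.par            # tuning values chosen by cross-validation
# head(rf.out$predictions)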
# Boosted (gbm) forecast - hyper-parameters tuned by cross-validation
opt.gbm <- function(spec=specs, DT=ocr, DE=oct, nc=20, cv=20,
                    ntree=c(100,250,500),
                    interaction.depth=c(1,3,5,8,12),
                    shrinkage=c(1,0.75,0.5,0.25,0.10,0.05,0.01,0.001,0.0001)) {
  # Number of trees
  NT = length(ntree)
  # Number of interaction depths
  ND = length(interaction.depth)
  # Number of shrinkage (learning rate) values
  NS = length(shrinkage)
  # Number of observations
  TT = dim(DT)[1]
  # Packages
  library(gbm)
  library(foreach)
  library(doParallel)
  # Initialize the multi-core backend
  cl <- makeCluster(nc)
  registerDoParallel(cl)
  # Loop over cross-validation folds
  TMP = foreach (r = 1:cv, .packages=c('gbm')) %dopar% {
    idx = sample(seq(from=r, to=TT, by=cv))
    test.cv  = DT[idx,]
    train.cv = DT[-idx,]
    # Store the validation loss for every hyper-parameter combination in this fold
    tmp.gbm = array(NA, dim=c(NT,ND,NS))
    dimnames(tmp.gbm)[[1]] = paste('trees_', ntree, sep='')
    dimnames(tmp.gbm)[[2]] = paste('depth_', interaction.depth, sep='')
    dimnames(tmp.gbm)[[3]] = paste('learning_', shrinkage, sep='')
    for (s in 1:NT) {
      for (q in 1:ND) {
        for (w in 1:NS) {
          mod.gbm = gbm(spec, data=train.cv, distribution='gaussian',
                        n.trees=ntree[s],
                        interaction.depth=interaction.depth[q],
                        shrinkage=shrinkage[w],
                        bag.fraction=1, n.cores=1)
          # Validation MSE
          tmp.gbm[s,q,w] = mean(c(predict(mod.gbm, newdata=test.cv, n.trees=ntree[s]) - test.cv[, all.vars(spec)[1]])^2)
        }
      }
    }
    return(tmp.gbm)
  }
  stopCluster(cl)
  # Collect the per-fold losses
  tmp.gbm = array(NA, dim=c(NT,ND,NS,cv))
  dimnames(tmp.gbm)[[1]] = paste('trees_', ntree, sep='')
  dimnames(tmp.gbm)[[2]] = paste('depth_', interaction.depth, sep='')
  dimnames(tmp.gbm)[[3]] = paste('learning_', shrinkage, sep='')
  dimnames(tmp.gbm)[[4]] = paste('cv', 1:cv, sep='')
  for (r in 1:cv) tmp.gbm[,,,r] <- TMP[[r]]
  # Average loss across cv folds & find the optimum parameters (first one in case of ties)
  ave.gbm = apply(tmp.gbm, 1:3, mean)
  opt.idx = which(ave.gbm == min(ave.gbm, na.rm=TRUE), arr.ind=TRUE)[1,]
  opt.trees = ntree[opt.idx[1]]
  opt.depth = interaction.depth[opt.idx[2]]
  opt.learn = shrinkage[opt.idx[3]]
  # Estimate the gbm on the full training sample with the optimum parameters
  opt.gbm = gbm(spec, data=DT, distribution='gaussian',
                n.trees=opt.trees,
                interaction.depth=opt.depth,
                shrinkage=opt.learn,
                bag.fraction=1)
  results <- list()
  results[['predictions']] = predict(opt.gbm, newdata=DE, n.trees=opt.trees)
  results[['opt.par']] = list(opt.depth=opt.depth, opt.trees=opt.trees, opt.learn=opt.learn)
  return(results)
}
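# --- Illustrative usage (a minimal sketch; 'sim', 'sim.test' and 'spec' are the
# --- hypothetical simulated objects from the earlier examples) ---
# gbm.out <- opt.gbm(spec=spec, DT=sim, DE=sim.test, nc=2, cv=5,
#                    ntree=c(100,250), interaction.depth=c(1,3), shrinkage=c(0.1,0.01))
# gbm.out$opt.par            # tuning values chosen by cross-validation
# head(gbm.out$predictions)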