# We will create two functions. The first takes the name of the response (target)
# variable and the names of the features and returns an as.formula() object.
# The second runs the selection algorithm itself.

gen.fm = function(dep='price', x=features) {
  # Create the 'dep~1' string - the y = beta_0 (intercept-only) model.
  # paste() is a useful function: it concatenates strings.
  spec = paste(dep, '~1', sep='')
  # If there are no features - the vector 'x' has no elements - return the intercept-only formula
  if (is.null(x)) return(as.formula(spec))
  # Number of features
  NV = length(x)
  # Loop over all features - we add one feature at a time
  for (v in 1:NV) spec = paste(spec, '+', x[v], sep='')
  return(as.formula(spec))
}

# Backward elimination: start from the full model and repeatedly drop the least
# significant feature (heteroskedasticity-robust p-value above the threshold 'pt').
bwelim <- function(dt=ocr, pt=0.1, dep='price', features=features) {
  library(sandwich)
  library(lmtest)
  pval <- 1
  # We start from the initial full model
  spec  <- gen.fm(dep=dep, x=features)
  model <- lm(spec, data=dt)
  while (pval > pt) {
    # Coefficient tests with HC1 robust standard errors
    estim <- coeftest(model, vcov=vcovHC(model, type='HC1'))
    # Which features (intercept excluded) have a p-value above the threshold?
    idx <- which(estim[-1,4] > pt)
    if (length(idx) > 0) {
      # Remove the feature with the highest p-value and re-estimate
      to.remove <- rownames(estim)[which(estim[-1,4] == max(estim[-1,4]))[1] + 1]
      features  <- features[-which(features == to.remove)]
      new_spec  <- gen.fm(dep=dep, x=features)
      model     <- lm(new_spec, data=dt)
    } else {
      # All remaining features are significant - force the loop to stop
      pval <- pt - 0.01
    }
    # print(features)
  }
  return(model)
}

# Forward selection: start from the intercept-only model and add the 'best'
# remaining feature one at a time.
fwelim <- function(dt=ocr, pt=0.1, dep='price', features=features) {
  library(sandwich)
  library(lmtest)
  # Number of features
  NV <- length(features)
  # We start from the baseline model with only a constant
  spec <- gen.fm(dep=dep, x=NULL)
  # Estimate the baseline model
  m <- lm(spec, data=dt)
  # Here we will store variables that 'pass' the test
  good.features <- c()
  # We loop over all variables - initially (v=1) we have NV candidates to consider.
  for (v in 1:NV) {
    # Remaining number of features
    NF <- length(features)
    # Store p-values
    mat.pvals <- matrix(NA, nrow=NF, ncol=1)
    # Try to add the remaining features one at a time
    for (f in 1:NF) {
      # Generate specification
      spec <- gen.fm(dep=dep, x=c(good.features, features[f]))
      # Estimate model
      model <- lm(spec, data=dt)
      # Robust coefficient tests
      estim <- coeftest(model, vcov=vcovHC(model, type='HC1'))
      # Store the p-value of the newly added feature (last row)
      mat.pvals[f,1] <- estim[dim(estim)[1],4]
    }
    # If no candidate has a p-value below the threshold, the algorithm stops
    if (length(which(mat.pvals < pt)) == 0) break
    # Which variable has the lowest p-value?
    add.var <- features[which(mat.pvals == min(mat.pvals))][1]
    # Add this variable to the set of good features
    good.features <- c(good.features, add.var)
    # Remove this variable from the set of remaining features
    features <- features[-which(features == add.var)]
    # Repeat with the remaining features until either all candidates have a
    # p-value above the threshold or no candidates are left.
  }
  spec <- gen.fm(dep=dep, x=good.features)
  return(lm(spec, data=dt))
}
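# --- Illustrative usage (a minimal sketch, not part of the original workflow) ---
# The defaults above refer to objects defined elsewhere in the script: 'ocr'
# (training data), 'oct' (test data) and 'features' (a character vector of
# predictor names). The simulated objects below ('sim', 'feats', N) are
# hypothetical stand-ins used only to show how the calls look.
# set.seed(99)
# N   <- 200
# sim <- as.data.frame(matrix(rnorm(N*10), ncol=10, dimnames=list(NULL, paste0('x', 1:10))))
# sim$price <- 1 + 0.5*sim$x1 - 0.8*sim$x2 + 0.3*sim$x3 + rnorm(N)
# feats <- paste0('x', 1:10)
# gen.fm(dep='price', x=feats)                          # price ~ 1 + x1 + ... + x10
# summary(bwelim(dt=sim, pt=0.1, dep='price', features=feats))
# summary(fwelim(dt=sim, pt=0.1, dep='price', features=feats))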
# Bidirectional (stepwise) selection: a forward step that adds the best
# candidate, followed by a backward elimination of the selected set.
bielim <- function(dt=ocr, pt=0.1, dep='price', features=features) {
  library(sandwich)
  library(lmtest)
  # Number of features
  NV <- length(features)
  # We start from the baseline model with only a constant
  spec <- gen.fm(dep=dep, x=NULL)
  # Estimate the baseline model
  m <- lm(spec, data=dt)
  # Here we will store variables that 'pass' the test
  good.features <- c()
  # Here we will store variables that still need to be tested
  test.features <- features
  # Add the 'best' remaining variable one at a time; the loop ends when no
  # remaining candidate is significant at the threshold 'pt'.
  pvalue <- 1
  while (pvalue > pt) {
    # Remaining number of candidate features
    NF <- length(test.features)
    if (NF == 0) break
    # Forward step first
    # Store p-values
    mat.pvals <- matrix(NA, nrow=NF, ncol=1)
    rownames(mat.pvals) <- test.features
    # Try to add the remaining features one at a time
    for (f in 1:NF) {
      # Generate specification
      spec <- gen.fm(dep=dep, x=c(good.features, test.features[f]))
      # Estimate model
      model <- lm(spec, data=dt)
      # Robust coefficient tests
      estim <- coeftest(model, vcov=vcovHC(model, type='HC1'))
      # Store the p-value of the newly added feature (last row)
      mat.pvals[f,1] <- estim[dim(estim)[1],4]
    }
    # If no candidate has a p-value below the threshold, the algorithm stops
    if (length(which(mat.pvals < pt)) == 0) break
    # Which variable has the lowest p-value?
    add.var <- rownames(mat.pvals)[which(mat.pvals == min(mat.pvals))][1]
    # Add this variable to the set of good features
    good.features <- c(good.features, add.var)
    # Remove it from the set of candidates
    test.features <- setdiff(features, good.features)
    # Backward step second: re-check the selected set and drop anything that
    # became insignificant
    tmp <- bwelim(dt=dt, pt=pt, dep=dep, features=good.features)
    good.features <- names(tmp$coefficients[-1])
    test.features <- setdiff(features, good.features)
  }
  spec <- gen.fm(dep=dep, x=good.features)
  return(lm(spec, data=dt))
}

# Complete-subset linear regression: estimate every q-variable model, rank them
# by cross-validated MSE and combine the forecasts of the best 1 / 10 / 100 models.
csr.method <- function(spec, train=ocr, test=oct, q=3, cv=10, trim=0.2, nc=14) {
  library(foreach)
  library(doParallel)
  # Overall number of observations
  TT = dim(train)[1]
  # Number of variables (response excluded)
  NV = length(all.vars(spec)) - 1
  # Check the complexity parameter
  if (q < 1 | q >= TT) return('Complexity parameter needs to be greater than 0 and less than the number of observations')
  # All q-variable subsets - each column holds the variable indices of one model
  models = combn(NV, q)
  # Number of models
  NM = dim(models)[2]
  # Number of training observations
  NT = dim(train)[1]
  # We need:
  # 1) a place to store cross-validated forecast errors - used to select the best models
  # 2) a parallel backend to estimate the (possibly many) models
  # -> 1) Store MSE
  grid.mse = array(NA, dim=c(NM, cv))
  dimnames(grid.mse)[[1]] = paste('Model_', 1:NM, sep='')
  dimnames(grid.mse)[[2]] = paste('cv_', 1:cv, sep='')
  # -> 2) Initialize the multi-core backend
  cl <- makeCluster(nc)
  registerDoParallel(cl)
  # We loop over the different models
  TMP = foreach (m = 1:NM) %dopar% {
    mse.cv = c()
    for (v in 1:cv) {
      # Select validation sample
      valid.idx = seq(from=v, to=NT, by=cv)
      valid     = train[valid.idx,]
      # Select sub-training dataset
      train_cv  = train[-valid.idx,]
      # Estimate & predict the model: lm() on a data frame regresses the first
      # column on the remaining ones
      mDT = train_cv[, c(all.vars(spec)[1], all.vars(spec)[models[,m]+1])]
      # Mean squared error on the validation fold
      mse.cv[v] <- mean((predict(lm(mDT), newdata=valid) - valid[, all.vars(spec)[1]])^2)
    }
    return(mse.cv)
  }
  stopCluster(cl)
  for (m in 1:NM) grid.mse[m,] = TMP[[m]]
  # Average forecast error across cross-validation samples
  mean.mse = apply(grid.mse, 1, mean)
  # Which ONE model led to the best forecast?
  top.model = as.numeric(which(mean.mse == min(mean.mse)))[1]
  mDT = train[, c(all.vars(spec)[1], all.vars(spec)[models[,top.model]+1])]
  pred.top.1 <- predict(lm(mDT), newdata=test)
  # Which 10 models led to the best forecasts?
  top.10 = as.numeric(which(rank(mean.mse, ties.method='first') <= 10))
  pred = c()
  for (m in 1:10) {
    mDT = train[, c(all.vars(spec)[1], all.vars(spec)[models[,top.10[m]]+1])]
    pred <- cbind(pred, predict(lm(mDT), newdata=test))
  }
  pred.top.10 <- apply(pred, 1, mean)
  # Which 100 models led to the best forecasts?
  top.100 = as.numeric(which(rank(mean.mse, ties.method='first') <= 100))
  pred = c()
  for (m in 1:100) {
    mDT = train[, c(all.vars(spec)[1], all.vars(spec)[models[,top.100[m]]+1])]
    pred <- cbind(pred, predict(lm(mDT), newdata=test))
  }
  # Trimmed mean of the 100 best models' forecasts
  pred.top.100 <- apply(pred, 1, mean, trim=trim)
  # Combine into one output
  results = cbind(pred.top.1, pred.top.10, pred.top.100)
  return(results)
}
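# --- Illustrative usage (a minimal sketch; 'sim' and 'feats' are the hypothetical
# --- simulated objects from the example above) ---
# Stepwise (forward + backward) selection:
# summary(bielim(dt=sim, pt=0.1, dep='price', features=feats))
# Complete-subset regression needs a formula, a test set and a cluster size that
# does not exceed the available cores; with 10 predictors and q=3 there are
# choose(10,3) = 120 candidate models, so the top-10 / top-100 combinations exist.
# sim.test <- as.data.frame(matrix(rnorm(50*10), ncol=10, dimnames=list(NULL, paste0('x', 1:10))))
# sim.test$price <- 1 + 0.5*sim.test$x1 - 0.8*sim.test$x2 + 0.3*sim.test$x3 + rnorm(50)
# spec   <- gen.fm(dep='price', x=feats)
# csr.fc <- csr.method(spec, train=sim, test=sim.test, q=3, cv=5, trim=0.2, nc=2)
# head(csr.fc)   # columns: best single model, mean of top 10, trimmed mean of top 100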
# Tree-based (random forest) forecast - hyper-parameters tuned by cross-validation
opt.rf <- function(spec=specs, DT=ocr, DE=oct, nc=20, cv=20,
                   minsplit=c(10,20), minbucket=c(5,10,15), maxdepth=c(0,3,5,8),
                   mtry=c(2,4,6,8), ntree=c(100,250,500,1000)) {
  library(foreach)
  library(doParallel)
  library(ranger)
  # Overall number of observations
  TT = dim(DT)[1]
  # No. of minimum-split values to consider
  NMS = length(minsplit)
  # No. of minimum bucket sizes
  NBS = length(minbucket)
  # No. of maximum depths
  NDH = length(maxdepth)
  # No. of tree counts
  NTE = length(ntree)
  # No. of variables tried at each split
  NTR = length(mtry)
  # Number of hyper-parameter combinations
  NM = NMS*NBS*NDH*NTE*NTR
  # Initialize the multi-core backend - no more workers than cv folds are needed
  cl <- makeCluster(min(nc, cv))
  registerDoParallel(cl)
  # RANDOM FOREST - loop over cross-validation folds
  TMP = foreach(r = 1:cv, .packages=c('ranger')) %dopar% {
    idx = sample(seq(from=r, to=TT, by=cv))
    test.cv  = DT[idx,]
    train.cv = DT[-idx,]
    # Store the validation loss for every hyper-parameter combination in this fold
    tmp.rfr = array(NA, dim=c(NMS,NBS,NDH,NTE,NTR))
    dimnames(tmp.rfr)[[1]] = paste('split_', minsplit, sep='')
    dimnames(tmp.rfr)[[2]] = paste('bucket_', minbucket, sep='')
    dimnames(tmp.rfr)[[3]] = paste('depth_', maxdepth, sep='')
    dimnames(tmp.rfr)[[4]] = paste('trees_', ntree, sep='')
    dimnames(tmp.rfr)[[5]] = paste('mtry_', mtry, sep='')
    for (d in 1:NMS) {
      for (q in 1:NBS) {
        for (w in 1:NDH) {
          for (s in 1:NTE) {
            for (b in 1:NTR) {
              # Random forest with the given hyper-parameters
              mDTB = ranger(formula=spec, data=train.cv, num.threads=1,
                            num.trees=ntree[s],
                            mtry=mtry[b],
                            min.node.size=minsplit[d],
                            max.depth=maxdepth[w],
                            min.bucket=minbucket[q])
              # Validation MSE
              tmp.rfr[d,q,w,s,b] = mean(c(predict(mDTB, data=test.cv)$predictions - test.cv[, all.vars(spec)[1]])^2)
            }
          }
        }
      }
    }
    return(tmp.rfr)
  }
  stopCluster(cl)
  # Collect the per-fold losses
  tmp.rfr = array(NA, dim=c(NMS,NBS,NDH,NTE,NTR,cv))
  dimnames(tmp.rfr)[[1]] = paste('split_', minsplit, sep='')
  dimnames(tmp.rfr)[[2]] = paste('bucket_', minbucket, sep='')
  dimnames(tmp.rfr)[[3]] = paste('depth_', maxdepth, sep='')
  dimnames(tmp.rfr)[[4]] = paste('trees_', ntree, sep='')
  dimnames(tmp.rfr)[[5]] = paste('mtry_', mtry, sep='')
  dimnames(tmp.rfr)[[6]] = 1:cv
  for (r in 1:cv) tmp.rfr[,,,,,r] <- TMP[[r]]
  # Average across cv folds & find the optimum parameters (first one in case of ties)
  ave.rfr = apply(tmp.rfr, 1:5, mean)
  opt.idx = which(ave.rfr == min(ave.rfr), arr.ind=TRUE)[1,]
  opt.split  = minsplit[opt.idx[1]]
  opt.bucket = minbucket[opt.idx[2]]
  opt.depth  = maxdepth[opt.idx[3]]
  opt.trees  = ntree[opt.idx[4]]
  opt.mtry   = mtry[opt.idx[5]]
  # Estimate the RF on the full training sample with the optimum parameters
  opt.rf = ranger(formula=spec, data=DT,
                  num.trees=opt.trees,
                  mtry=opt.mtry,
                  min.node.size=opt.split,
                  max.depth=opt.depth,
                  min.bucket=opt.bucket)
  results <- list()
  results[['predictions']] = predict(opt.rf, data=DE)$predictions
  results[['opt.par']] = list(opt.trees=opt.trees, opt.mtry=opt.mtry, opt.split=opt.split, opt.depth=opt.depth, opt.bucket=opt.bucket)
  return(results)
}
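# --- Illustrative usage (a minimal sketch; 'sim', 'sim.test' and 'spec' are the
# --- hypothetical simulated objects from the earlier examples) ---
# Keep the tuning grids small so the cross-validation finishes quickly.
# rf.out <- opt.rf(spec=spec, DT=sim, DE=sim.test, nc=2, cv=5,
#                  minsplit=c(10,20), minbucket=c(5,10), maxdepth=c(0,5),
#                  mtry=c(2,4), ntree=c(100,250))
# rf.out$opt.par            # tuning values chosen by cross-validation
# head(rf.out$predictions)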
# Boosted (gbm) forecast - hyper-parameters tuned by cross-validation
opt.gbm <- function(spec=specs, DT=ocr, DE=oct, nc=20, cv=20,
                    ntree=c(100,250,500),
                    interaction.depth=c(1,3,5,8,12),
                    shrinkage=c(1,0.75,0.5,0.25,0.10,0.05,0.01,0.001,0.0001)) {
  # Number of trees
  NT = length(ntree)
  # Number of interaction depths
  ND = length(interaction.depth)
  # Number of shrinkage (learning rate) values
  NS = length(shrinkage)
  # Number of observations
  TT = dim(DT)[1]
  # Packages
  library(gbm)
  library(foreach)
  library(doParallel)
  # Initialize the multi-core backend
  cl <- makeCluster(nc)
  registerDoParallel(cl)
  # Loop over cross-validation folds
  TMP = foreach (r = 1:cv, .packages=c('gbm')) %dopar% {
    idx = sample(seq(from=r, to=TT, by=cv))
    test.cv  = DT[idx,]
    train.cv = DT[-idx,]
    # Store the validation loss for every hyper-parameter combination in this fold
    tmp.gbm = array(NA, dim=c(NT,ND,NS))
    dimnames(tmp.gbm)[[1]] = paste('trees_', ntree, sep='')
    dimnames(tmp.gbm)[[2]] = paste('depth_', interaction.depth, sep='')
    dimnames(tmp.gbm)[[3]] = paste('learning_', shrinkage, sep='')
    for (s in 1:NT) {
      for (q in 1:ND) {
        for (w in 1:NS) {
          mod.gbm = gbm(spec, data=train.cv, distribution='gaussian',
                        n.trees=ntree[s],
                        interaction.depth=interaction.depth[q],
                        shrinkage=shrinkage[w],
                        bag.fraction=1, n.cores=1)
          # Validation MSE
          tmp.gbm[s,q,w] = mean(c(predict(mod.gbm, newdata=test.cv, n.trees=ntree[s]) - test.cv[, all.vars(spec)[1]])^2)
        }
      }
    }
    return(tmp.gbm)
  }
  stopCluster(cl)
  # Collect the per-fold losses
  tmp.gbm = array(NA, dim=c(NT,ND,NS,cv))
  dimnames(tmp.gbm)[[1]] = paste('trees_', ntree, sep='')
  dimnames(tmp.gbm)[[2]] = paste('depth_', interaction.depth, sep='')
  dimnames(tmp.gbm)[[3]] = paste('learning_', shrinkage, sep='')
  dimnames(tmp.gbm)[[4]] = paste('cv', 1:cv, sep='')
  for (r in 1:cv) tmp.gbm[,,,r] <- TMP[[r]]
  # Average loss across cv folds & find the optimum parameters (first one in case of ties)
  ave.gbm = apply(tmp.gbm, 1:3, mean)
  opt.idx = which(ave.gbm == min(ave.gbm, na.rm=TRUE), arr.ind=TRUE)[1,]
  opt.trees = ntree[opt.idx[1]]
  opt.depth = interaction.depth[opt.idx[2]]
  opt.learn = shrinkage[opt.idx[3]]
  # Estimate the gbm on the full training sample with the optimum parameters
  opt.gbm = gbm(spec, data=DT, distribution='gaussian',
                n.trees=opt.trees,
                interaction.depth=opt.depth,
                shrinkage=opt.learn,
                bag.fraction=1)
  results <- list()
  results[['predictions']] = predict(opt.gbm, newdata=DE, n.trees=opt.trees)
  results[['opt.par']] = list(opt.depth=opt.depth, opt.trees=opt.trees, opt.learn=opt.learn)
  return(results)
}
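# --- Illustrative usage (a minimal sketch; 'sim', 'sim.test' and 'spec' are the
# --- hypothetical simulated objects from the earlier examples) ---
# gbm.out <- opt.gbm(spec=spec, DT=sim, DE=sim.test, nc=2, cv=5,
#                    ntree=c(100,250), interaction.depth=c(1,3), shrinkage=c(0.1,0.01))
# gbm.out$opt.par            # tuning values chosen by cross-validation
# head(gbm.out$predictions)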