# We will create two functions(). One will use names of response (target) variable and names of features and returns an as.formula() object. Second will run the algorithm. gen.fm = function(dep='price',x=features) { # This will create the 'dep~1' string - which is y = beta_0 model # paste() is a useful function... it links strings spec = paste(dep,'~1',sep='') # If there are no features - the vector 'x' has no elements - return if (is.null(x)) return(as.formula(spec)) # Number of features NV = length(x) # Loop over all features - we add one feature at a time for (v in 1:NV) spec = paste(spec,'+',x[v],sep='') return(as.formula(spec)) } # Now the algorithm with backward elimination: bwelim <- function(dt=ocr,pt=0.1,dep='price',features=features) { library(sandwich) library(lmtest) pval <- 1 # We start by the initial full model spec <- gen.fm(dep=dep,x=features) model <- lm(spec,data=dt) while (pval > pt) { estim <- coeftest(model,vcov=vcovHC(model,type='HC1')) idx <- which(estim[,4] > pt) if (length(idx)>0) { to.remove <- rownames(estim)[which(estim[,4] == max(estim[,4]))] features <- features[-which(features == to.remove)] new_spec <- gen.fm(dep=dep,x=features) model <- lm(new_spec,data=dt) } else { pval <- pt - 0.01 } #print(features) } return(model) } # We will create a function() that will run the algorithm. fwelim <- function(dt=ocr,pt=0.1,dep='price',features=features) { # Number of features NV <- length(features) # We start by the baseline model with only a constant spec <- gen.fm(dep=dep,x=NULL) # Estimate model m <- lm(spec,data=dt) # Here we will store variables that 'pass' the test good.features <- c() # In forward elimination we need to start to add the 'best' variable at a time. # We loop over all variables - initially (v=1) we have NV variables to consider. Finally, only one is left. for (v in 1:NV) { # Remaining number of features NF <- length(features) # Store pvalues mat.pvals <- matrix(NA,nrow=NF,ncol=1) # We need to try to add one-at-a-time all remaining features for (f in 1:NF) { # Generate specification spec <- gen.fm(dep=dep,x=c(good.features,features[f])) # Estimate model model <- lm(spec,data=dt) # Find p-values estim <- coeftest(model,vcov=vcovHC(model,type='HC1')) # Store p-values mat.pvals[f,1] <- estim[dim(estim)[1],4] } # Check if there is at least one pvalue lower as threshold - if not - we are done - Algo stops if (length(which(mat.pvals < pt))==0) break # Which variable has lowest p-value? add.var <- features[(which(mat.pvals==min(mat.pvals)))][1] # We will add this variable to the set of good features good.features <- c(good.features,add.var) # We will remove this variable from the set of remaining features features <- features[-which(features==add.var)] # Now we repeat the procedure with remaining features until we: # - either have only features that have a p-value above threshold # - or we are with one left } spec <- gen.fm(dep=dep,x=good.features) return(lm(spec,data=dt)) } # We will create a function() that will run the algorithm. bielim <- function(dt=ocr,pt=0.1,dep='price',features=features) { # Number of features NV <- length(features) # We start by the baseline model with only a constant spec <- gen.fm(dep=dep,x=NULL) # Estimate model m <- lm(spec,data=dt) # Here we will store variables that 'pass' the test good.features <- c() # Here we will store variable that should be tested test.features <- features # In forward elimination we need to start to add the 'best' variable at a time. # We loop over all variables - initially (v=1) we have NV variables to consider. Finally, only one is left. pvalue <- 1 while (pvalue > pt) { # Remaining number of features NF <- length(test.features) # Forward elimination first # Store pvalues mat.pvals <- matrix(NA,nrow=NF,ncol=1) rownames(mat.pvals) <- test.features # We need to try to add one-at-a-time all remaining features for (f in 1:NF) { # Generate specification spec <- gen.fm(dep=dep,x=c(good.features,test.features[f])) # Estimate model model <- lm(spec,data=dt) # Find p-values estim <- coeftest(model,vcov=vcovHC(model,type='HC1')) # Store p-values mat.pvals[f,1] <- estim[dim(estim)[1],4] } # Check if there is at least one pvalue lower as threshold - if not - we are done - Algo stops if (length(which(mat.pvals < pt))==0) break # Which variable has lowest p-value? add.var <- rownames(mat.pvals)[which(mat.pvals==min(mat.pvals))] # We will add this variable to the set of good features good.features <- c(good.features,add.var) # We will remove this variable from the set of remaining features test.features <- features[-which(features %in% good.features)] # Backward elimination second tmp <- bwelim(dt=ocr,pt=pt,dep=dep,features=good.features) good.features <- names(tmp$coefficients[-1]) test.features <- features[-which(features%in%good.features)] } spec <- gen.fm(dep=dep,x=good.features) return(lm(spec,data=dt)) } csr.method <- function(spec,train=ocr,test=oct,q=3,cv=10,trim=0.2,nc=14) { library(foreach) library(doParallel) # Overall number of observations TT = dim(train)[1] # Number of variables NV = length(all.vars(spec)) - 1 # Check the following condition if (q < 1 | q >= TT) return('Complexity parameter need to be higher as 0 and less as number of observations') # Models models = combn(NV,q) # Number of models NM = dim(models)[2] # Number of training observations NT = dim(train)[1] # 1) We need a place to store predictions for various forecasts # 2) We need a place to store forecast evaluations - used to select optimum models # 3) We need a place to store actual forecasts of models # -> 1) Store MSE grid.mse = array(NA,dim=c(NM,cv)) dimnames(grid.mse)[[1]] = paste('Model_',1:NM,sep='') dimnames(grid.mse)[[2]] = paste('cv_',1:cv,sep='') # -> 2) Initialize multi core cl <- makeCluster(nc) registerDoParallel(cl) # We loop over different models TMP = foreach (m = 1:NM,.packages=c('glmnet')) %dopar% { mse.cv = c() for (v in 1:cv) { # Select validation sample valid.idx = seq(from=v,to=NT,by=cv) valid = train[valid.idx,] # Select sub-training dataset train_cv = train[-valid.idx,] # Estimate & predict model mDT = train_cv[,c(all.vars(spec)[1],all.vars(spec)[models[,m]+1])] # Mean square error mse.cv[v] <- mean((predict(lm(mDT),newdata=valid) - valid[,all.vars(spec)[1]])^2) } return(mse.cv) } stopCluster(cl) for (m in 1:NM) grid.mse[m,] = TMP[[m]] # Average forecast error across cross-validation samples mean.mse = apply(grid.mse,1,mean) # Which ONE model led to best forecast? top.model = as.numeric(which(mean.mse == min(mean.mse))) mDT = train[,c(all.vars(spec)[1],all.vars(spec)[models[,top.model]+1])] pred.top.1 <- predict(lm(mDT),newdata=test) # Which 10 models led to best forecats? top.10 = as.numeric(which(rank(mean.mse)<=10)) pred = c() for (m in 1:10) { mDT = train[,c(all.vars(spec)[1],all.vars(spec)[models[,top.10[m]]+1])] pred <- cbind(pred,predict(lm(mDT),newdata=test)) } pred.top.10 <- apply(pred,1,mean) # Which 100 models led to best forecats? top.100 = as.numeric(which(rank(mean.mse)<=100)) pred = c() for (m in 1:100) { mDT = train[,c(all.vars(spec)[1],all.vars(spec)[models[,top.100[m]]+1])] pred <- cbind(pred,predict(lm(mDT),newdata=test)) } pred.top.100 <- apply(pred,1,mean,trim=0.2) # Combine into one output results = cbind(pred.top.1,pred.top.10,pred.top.100) return(results) }