#---
title: "Seminar4RM"
author: "Stefan Lyocsa"
date: "2024-10-17"
output: html_document
#---
  
# Content
# * Estimate OLS models:
#   + Given model specifications.
#   + Your own model specifications.
#   + Backward elimination.
#   + Forward elimination.
#   + Bi-directional elimination.
# * Model prediction
# * Forecast evaluation:
#   + Statistical loss function.
#   + Model confidence set
# * Task

# First load training and testing datasets into the workspace
oct <- read.csv(file='C://Users//ckt//OneDrive - MUNI//Institutions//MU//ML Finance//2024//oct.csv')
ocr <- read.csv(file='C://Users//ckt//OneDrive - MUNI//Institutions//MU//ML Finance//2024//ocr.csv')

# Before working with backward, forward and bi-directional elimination we estimate some baseline models that we want to out-perform.
m1 <- lm(price ~ km      , data=ocr)
m2 <- lm(price ~ age     , data=ocr)
m3 <- lm(price ~ km + age, data=ocr)

# Add a model with all variable that will likely overfit the data
m4 <- lm(price ~ km + age + kw + trans + combi + allw + ambi + styl + rs + dsg + scr + 
           diesel + lpg + hybrid + cng + age, data=ocr)

# Add some features that we engineered -> for training sample
ocr$km2   <- ocr$km^2
ocr$km3   <- ocr$km^3
ocr$age2  <- ocr$age^2
ocr$age3  <- ocr$age^3
ocr$kmage <- ocr$km * ocr$age
ocr$kmkw  <- ocr$km * ocr$kw
ocr$kmkwage <- ocr$km * ocr$kw * ocr$age

# For testing sample
oct$km2   <- oct$km^2         
oct$km3   <- oct$km^3
oct$age2  <- oct$age^2
oct$age3  <- oct$age^3
oct$kmage <- oct$km * oct$age
oct$kmkw  <- oct$km * oct$kw
oct$kmkwage <- oct$km * oct$kw * oct$age

# Add one 'extra' model, which is my 'top' specification
m5 <- lm(price ~ km + age + kw + trans + combi + allw + ambi + styl + 
           rs + dsg + scr + diesel + lpg + hybrid + cng + km2 + age2 + kmage, data = ocr)

# No predictions and make sure that negative prices are winsorized
p1 <- predict(m1,new=oct)
p1[p1<0] <- min(ocr$price)
p2 = predict(m2,new=oct)
p2[p2<0] <- min(ocr$price)
p3 = predict(m3,new=oct)
p3[p3<0] <- min(ocr$price)
p4 = predict(m4,new=oct)
p4[p4<0] <- min(ocr$price)
p5 = predict(m5,new=oct)
p5[p5<0] <- min(ocr$price)

# Backward elimination
# The goal is to create a function that will perform the backward elimination. Recall the algorithm:
  
# * Select a threshold p-value.
# * Estimate model with all features.
# * Estimate p-values of regression coefficients.
# * Eliminate feature with highest p-value that is above the threshold.
# * Estimate model without the eliminated feature.
# * Repeat until no feature can be removed.

# We will create two functions(). One will use names of response (target) variable and names of features and returns an as.formula() object. Second will run the algorithm.
gen.fm = function(dep='price',x=features) {
  # This will create the 'dep~1' string - which is y = beta_0 model
  # paste() is a useful function... it links strings
  spec = paste(dep,'~1',sep='')
  # If there are no features - the vector 'x' has no elements - return
  if (is.null(x)) return(as.formula(spec))
  # Number of features
  NV = length(x)
  # Loop over all features - we add one feature at a time
  for (v in 1:NV) spec = paste(spec,'+',x[v],sep='')
  return(as.formula(spec))
}

# Let's try it out:
gen.fm(dep='price',x=c('km','age'))

# Now the algorithm with backward elimination:
bwelim <- function(dt=ocr,pt=0.1,dep='price',features=features) {
  library(sandwich)
  library(lmtest)
  pval  <- 1
  # We start by the initial full model
  spec  <- gen.fm(dep=dep,x=features)
  model <- lm(spec,data=dt)
  while (pval > pt) {
    estim <- coeftest(model,vcov=vcovHC(model,type='HC1'))
    idx   <- which(estim[,4] > pt)
    if (length(idx)>0) {
      to.remove <- rownames(estim)[which(estim[,4] == max(estim[,4]))]
      features  <- features[-which(features == to.remove)]
      new_spec  <- gen.fm(dep=dep,x=features)
      model     <- lm(new_spec,data=dt)
    } else {
      pval  <- pt - 0.01
    }
    #print(features)
  }
  return(model)
}

# And we can create model 6 and corresponding predictions
features <- c('km','age','kw','trans','combi','allw','ambi','styl','rs',
              'dsg','scr','diesel','lpg','hybrid','cng','km2','age2','kmage','kmkw','km3','age3','kmkwage')
m6 <- bwelim(dt=ocr,pt=0.1,dep='price',features=features)
p6 <- predict(m6,new=oct)
p6[p6<0] <- min(ocr$price)

# Forward elimination
# The goal is to create a function that will perform the forward elimination. Recall the algorithm:

# * Select a threshold p-value.
# * Estimate model with an intercept only.
# * Add one feature to the specification that has the lowest p-value.
# * Repeat until you are unable to find a feature with p-value below the threshold.

# We will create a function() that will run the algorithm.
fwelim <- function(dt=ocr,pt=0.1,dep='price',features=features) {
  # Number of features
  NV <- length(features)
  # We start by the baseline model with only a constant
  spec <- gen.fm(dep=dep,x=NULL)
  # Estimate model
  m <- lm(spec,data=dt)
  # Here we will store variables that 'pass' the test
  good.features <- c()
  # In forward elimination we need to start to add the 'best' variable at a time.
  # We loop over all variables - initially (v=1) we have NV variables to consider. Finally, only one is left.
  for (v in 1:NV) {
    # Remaining number of features
    NF <- length(features)
    # Store pvalues
    mat.pvals <- matrix(NA,nrow=NF,ncol=1)
    # We need to try to add one-at-a-time all remaining features
    for (f in 1:NF) {
      # Generate specification
      spec  <- gen.fm(dep=dep,x=c(good.features,features[f]))
      # Estimate model
      model <- lm(spec,data=dt)
      # Find p-values
      estim <- coeftest(model,vcov=vcovHC(model,type='HC1'))
      # Store p-values
      mat.pvals[f,1] <- estim[dim(estim)[1],4]
    }
    # Check if there is at least one pvalue lower as threshold - if not - we are done - Algo stops
    if (length(which(mat.pvals < pt))==0) break
    # Which variable has lowest p-value?
    add.var       <- features[(which(mat.pvals==min(mat.pvals)))][1]
    # We will add this variable to the set of good features
    good.features <- c(good.features,add.var)
    # We will remove this variable from the set of remaining features
    features      <- features[-which(features==add.var)]
    # Now we repeat the procedure with remaining features until we:
    # - either have only features that have a p-value above threshold
    # - or we are with one left
  }
  spec <- gen.fm(dep=dep,x=good.features)
  return(lm(spec,data=dt))
}

# And we can create model 7 and corresponding predictions
m7 <- fwelim(dt=ocr,pt=0.1,dep='price',features=features)
p7 <- predict(m7,new=oct)
p7[p7<0] <- min(ocr$price)

# Bi-directional elimination
# The goal is to create a function that will perform the bi-directional elimination. Recall the algorithm:

# * Select a threshold p-value.
# * Estimate model with an intercept only.
# * Add one feature to the specification that has the lowest p-value.
# * Check if any other variables has now p-value below the threshold and if yes, remove.
# * Repeat until you are unable to find a feature with p-value below the threshold.

# We will create a function() that will run the algorithm.
bielim <- function(dt=ocr,pt=0.1,dep='price',features=features) {
  # Number of features
  NV <- length(features)
  # We start by the baseline model with only a constant
  spec <- gen.fm(dep=dep,x=NULL)
  # Estimate model
  m <- lm(spec,data=dt)
  # Here we will store variables that 'pass' the test
  good.features <- c()
  # Here we will store variable that should be tested
  test.features <- features
  # In forward elimination we need to start to add the 'best' variable at a time.
  # We loop over all variables - initially (v=1) we have NV variables to consider. Finally, only one is left.
  pvalue <- 1
  while (pvalue > pt) {
    # Remaining number of features
    NF <- length(test.features)
    
    # Forward elimination first
    # Store pvalues
    mat.pvals <- matrix(NA,nrow=NF,ncol=1)
    rownames(mat.pvals) <- test.features
    # We need to try to add one-at-a-time all remaining features
    for (f in 1:NF) {
      # Generate specification
      spec  <- gen.fm(dep=dep,x=c(good.features,test.features[f]))
      # Estimate model
      model <- lm(spec,data=dt)
      # Find p-values
      estim <- coeftest(model,vcov=vcovHC(model,type='HC1'))
      # Store p-values
      mat.pvals[f,1] <- estim[dim(estim)[1],4]
    }
    # Check if there is at least one pvalue lower as threshold - if not - we are done - Algo stops
    if (length(which(mat.pvals < pt))==0) break
    # Which variable has lowest p-value?
    add.var       <- rownames(mat.pvals)[which(mat.pvals==min(mat.pvals))]
    # We will add this variable to the set of good features
    good.features <- c(good.features,add.var)
    # We will remove this variable from the set of remaining features
    test.features <- features[-which(features %in% good.features)]
    
    # Backward elimination second
    tmp <- bwelim(dt=ocr,pt=pt,dep=dep,features=good.features)
    good.features <- names(tmp$coefficients[-1])
    test.features <- features[-which(features%in%good.features)]
    
  }
  spec <- gen.fm(dep=dep,x=good.features)
  return(lm(spec,data=dt))
}

# And we can create model 8 and corresponding predictions
m8 <- bielim(dt=ocr,pt=0.1,dep='price',features=features)
p8 <- predict(m8,new=oct)
p8[p8<0] <- min(ocr$price)

# Forecast evaluation
# Let's store the predictions from eight models into a matrix:
predictions = cbind(oct$price,p1,p2,p3,p4,p5,p6,p7,p8)
head(predictions)
colnames(predictions) = c('price',paste('p',1:8,sep=''))

# Now we find the Mean Square Error and Mean Absolute Error:
mse = predictions[,-1]
mae = mse
for (i in 1:8) {
  mse[,i] =    (predictions[,1] - predictions[,i+1])^2
  mae[,i] = abs(predictions[,1] - predictions[,i+1])
}

# What are the average loss values?
apply(mse,2,mean)
apply(mae,2,mean)

# p6 and p8 lead to same model and even p7 is almost the same
cor(predictions)

# Which model is the best one?
# For MCS we need only models that have not the same predictions (correlation 1.0)
head(mse)
mse <- mse[,-c(6,8)]
mae <- mae[,-c(6,8)]
library(MCS)
MCSprocedure(mse)
MCSprocedure(mae)

# Under MSE and MAE we have one model left -> p7. One model rulezz them all!

################################################################################
# Let's continue with penalized regressions
# LASSO
# Load package and write down the features we would like to include (everything)
library(glmnet)
# We will work with octavia dataset
features <- c('km','age','kw','trans','combi','allw','ambi','styl','rs',
              'dsg','scr','diesel','lpg','hybrid','cng','km2','age2','kmage','kmkw','km3','age3','kmkwage')
# The function we are going to use requires features to be in a matrix object
X = as.matrix(ocr[,features])
# Outcome variable to be a matrix object as well
Y = as.matrix(ocr[,'price'])
# We do the same for the testing dataset
XT = as.matrix(oct[,features])

# Now we need to estimate \lambda using cross-validation. The function that will do that is cv.glmnet().
# Here 'nfolds' is the number of cross-validation samples (slicing of the training dataset)
# 'type.measure' is the loss function that will be used to decide what lambda is the best,
# 'alpha' [0,1] parameter is a weight between LASSO = 0 to RIDGE = 1
# Estimate the LASSO
cvm = cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=10,alpha=1)
# Visualize the MSE as a function of log(lambda)
plot(cvm)

# You can check out coefficients for model with lambda-min
coefficients(cvm,s='lambda.min')
# or if lambda is 1se away from lambda-min
coefficients(cvm,s='lambda.1se')

# Now let's predict under lambda.min & lamba.1se
p9 = predict(cvm,newx = XT,s='lambda.min')
p9[p9<0] = min(ocr$price,na.rm=T)
p10 = predict(cvm,newx = XT,s='lambda.1se')
p10[p10<0] = min(ocr$price,na.rm=T)

## Ridge
# The procedure is very similar just change the alpha = 0
# Estimate the Ridge
cvm = cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=10,alpha=0)
# Visualize the MSE as a function of log(lambda)
plot(cvm)
# You can check out coefficients for model with lambda-min
coefficients(cvm,s='lambda.min')
# or if lambda is 1se away from lambda-min
coefficients(cvm,s='lambda.1se')
# Now let's predict under lambda.min & lamba.1se
p11 = predict(cvm,newx = XT,s='lambda.min')
p11[p11<0] = min(ocr$price,na.rm=T)
p12 = predict(cvm,newx = XT,s='lambda.1se')
p12[p12<0] = min(ocr$price,na.rm=T)

## Elastic Net
# It is similar with Elastic net. Here you can use any alpha [0,1].
# But what value to use? We use 0.5 for now.
# Estimate the Elastic Net
cvm = cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=10,alpha=0.5)
# Visualize the MSE as a function of log(lambda)
plot(cvm)
# You can check out coefficients for model with lambda-min
coefficients(cvm,s='lambda.min')
# or if lambda is 1se away from lambda-min
coefficients(cvm,s='lambda.1se')
# Now let's predict under lambda.min & lamba.1se
p13 = predict(cvm,newx = XT,s='lambda.min')
p13[p13<0] = min(ocr$price,na.rm=T)
p14 = predict(cvm,newx = XT,s='lambda.1se')
p14[p14<0] = min(ocr$price,na.rm=T)

################################################################################
# Advanced
# Task - write a code that will cross-validate alpha parameter, e.g. which from seq(0.1,0.9,by=0.1) alpha to use?
################################################################################

## Adaptive LASSO
# Also a combination of LASSO and Ridge
# Step 1) Estimate Ridge model (it could be an OLS as well)
cvm = cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=10,alpha=0)
coefficients(cvm,s='lambda.min')
# Step 2) The larger the coefficient the lower the penalization in the next step
wgt = 1/abs(coefficients(cvm,s='lambda.min'))
# Now the weights are normalized so that they sum to 1
wgt = wgt/sum(wgt)
sum(wgt)
# Step 3) Estimate LASSO where penalization is not the same for every coefficient
cvm = cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=10,alpha=1,penalty.factor=wgt[-1])
coefficients(cvm,s='lambda.min')
coefficients(cvm,s='lambda.1se')
# Predictions
p15 = predict(cvm,newx = XT,s='lambda.min')
p15[p15<0] = min(ocr$price,na.rm=T)
p16 = predict(cvm,newx = XT,s='lambda.1se')
p16[p16<0] = min(ocr$price,na.rm=T)

# Forecast evaluation
# How are we doing?
predictions = cbind(oct$price,p1,p2,p3,p4,p5,p6,p9,p10,p11,p12,p13,p14,p15,p16)
head(predictions)
colnames(predictions) = c('price',paste('p',1:6,sep=''),paste('p',9:16,sep=''))
# How many models do we have?
M <- ncol(predictions)-1
# Now we find the Mean Square Error and Mean Absolute Error:
mse = predictions[,-1]
mae = mse
for (i in 1:M) {
  mse[,i] =    (predictions[,1] - predictions[,i+1])^2
  mae[,i] = abs(predictions[,1] - predictions[,i+1])
}

# What are the average loss values?
apply(mse,2,mean)
apply(mae,2,mean)
# Which model is the best one?
library(MCS)
MCSprocedure(mse)
# There is a group of similarly performing models now
MCSprocedure(mae)
# There is just model preferred model

################################################################################
# Competition  
# Task
# Load a (small) dataset with rental prices
apartments <- read.csv(file='C:\\Users\\ckt\\OneDrive - MUNI\\Institutions\\MU\\ML Finance\\2024\\datasets\\aprt1.csv')

# Separate into training and testing
set.seed(99)
N     <- dim(apartments)[1]
idx   <- sample(1:N,size=floor(0.7*N),replace=FALSE)
train <- apartments[idx ,]
test  <- apartments[-idx,]
head(train)

# The goal is to predict the rent. Run a competition with multiple models. Use only OLS type of models, but you are encouraged to create your own features (use interactions, dummy variables, etc.). Try to beat others!