#--- title: "Seminar4RM" author: "Stefan Lyocsa" date: "2024-10-17" output: html_document #--- # Content # * Estimate OLS models: # + Given model specifications. # + Your own model specifications. # + Backward elimination. # + Forward elimination. # + Bi-directional elimination. # * Model prediction # * Forecast evaluation: # + Statistical loss function. # + Model confidence set # * Task # First load training and testing datasets into the workspace oct <- read.csv(file='C://Users//ckt//OneDrive - MUNI//Institutions//MU//ML Finance//2024//oct.csv') ocr <- read.csv(file='C://Users//ckt//OneDrive - MUNI//Institutions//MU//ML Finance//2024//ocr.csv') # Before working with backward, forward and bi-directional elimination we estimate some baseline models that we want to out-perform. m1 <- lm(price ~ km , data=ocr) m2 <- lm(price ~ age , data=ocr) m3 <- lm(price ~ km + age, data=ocr) # Add a model with all variable that will likely overfit the data m4 <- lm(price ~ km + age + kw + trans + combi + allw + ambi + styl + rs + dsg + scr + diesel + lpg + hybrid + cng + age, data=ocr) # Add some features that we engineered -> for training sample ocr$km2 <- ocr$km^2 ocr$km3 <- ocr$km^3 ocr$age2 <- ocr$age^2 ocr$age3 <- ocr$age^3 ocr$kmage <- ocr$km * ocr$age ocr$kmkw <- ocr$km * ocr$kw ocr$kmkwage <- ocr$km * ocr$kw * ocr$age # For testing sample oct$km2 <- oct$km^2 oct$km3 <- oct$km^3 oct$age2 <- oct$age^2 oct$age3 <- oct$age^3 oct$kmage <- oct$km * oct$age oct$kmkw <- oct$km * oct$kw oct$kmkwage <- oct$km * oct$kw * oct$age # Add one 'extra' model, which is my 'top' specification m5 <- lm(price ~ km + age + kw + trans + combi + allw + ambi + styl + rs + dsg + scr + diesel + lpg + hybrid + cng + km2 + age2 + kmage, data = ocr) # No predictions and make sure that negative prices are winsorized p1 <- predict(m1,new=oct) p1[p1<0] <- min(ocr$price) p2 = predict(m2,new=oct) p2[p2<0] <- min(ocr$price) p3 = predict(m3,new=oct) p3[p3<0] <- min(ocr$price) p4 = predict(m4,new=oct) p4[p4<0] <- min(ocr$price) p5 = predict(m5,new=oct) p5[p5<0] <- min(ocr$price) # Backward elimination # The goal is to create a function that will perform the backward elimination. Recall the algorithm: # * Select a threshold p-value. # * Estimate model with all features. # * Estimate p-values of regression coefficients. # * Eliminate feature with highest p-value that is above the threshold. # * Estimate model without the eliminated feature. # * Repeat until no feature can be removed. # We will create two functions(). One will use names of response (target) variable and names of features and returns an as.formula() object. Second will run the algorithm. gen.fm = function(dep='price',x=features) { # This will create the 'dep~1' string - which is y = beta_0 model # paste() is a useful function... it links strings spec = paste(dep,'~1',sep='') # If there are no features - the vector 'x' has no elements - return if (is.null(x)) return(as.formula(spec)) # Number of features NV = length(x) # Loop over all features - we add one feature at a time for (v in 1:NV) spec = paste(spec,'+',x[v],sep='') return(as.formula(spec)) } # Let's try it out: gen.fm(dep='price',x=c('km','age')) # Now the algorithm with backward elimination: bwelim <- function(dt=ocr,pt=0.1,dep='price',features=features) { library(sandwich) library(lmtest) pval <- 1 # We start by the initial full model spec <- gen.fm(dep=dep,x=features) model <- lm(spec,data=dt) while (pval > pt) { estim <- coeftest(model,vcov=vcovHC(model,type='HC1')) idx <- which(estim[,4] > pt) if (length(idx)>0) { to.remove <- rownames(estim)[which(estim[,4] == max(estim[,4]))] features <- features[-which(features == to.remove)] new_spec <- gen.fm(dep=dep,x=features) model <- lm(new_spec,data=dt) } else { pval <- pt - 0.01 } #print(features) } return(model) } # And we can create model 6 and corresponding predictions features <- c('km','age','kw','trans','combi','allw','ambi','styl','rs', 'dsg','scr','diesel','lpg','hybrid','cng','km2','age2','kmage','kmkw','km3','age3','kmkwage') m6 <- bwelim(dt=ocr,pt=0.1,dep='price',features=features) p6 <- predict(m6,new=oct) p6[p6<0] <- min(ocr$price) # Forward elimination # The goal is to create a function that will perform the forward elimination. Recall the algorithm: # * Select a threshold p-value. # * Estimate model with an intercept only. # * Add one feature to the specification that has the lowest p-value. # * Repeat until you are unable to find a feature with p-value below the threshold. # We will create a function() that will run the algorithm. fwelim <- function(dt=ocr,pt=0.1,dep='price',features=features) { # Number of features NV <- length(features) # We start by the baseline model with only a constant spec <- gen.fm(dep=dep,x=NULL) # Estimate model m <- lm(spec,data=dt) # Here we will store variables that 'pass' the test good.features <- c() # In forward elimination we need to start to add the 'best' variable at a time. # We loop over all variables - initially (v=1) we have NV variables to consider. Finally, only one is left. for (v in 1:NV) { # Remaining number of features NF <- length(features) # Store pvalues mat.pvals <- matrix(NA,nrow=NF,ncol=1) # We need to try to add one-at-a-time all remaining features for (f in 1:NF) { # Generate specification spec <- gen.fm(dep=dep,x=c(good.features,features[f])) # Estimate model model <- lm(spec,data=dt) # Find p-values estim <- coeftest(model,vcov=vcovHC(model,type='HC1')) # Store p-values mat.pvals[f,1] <- estim[dim(estim)[1],4] } # Check if there is at least one pvalue lower as threshold - if not - we are done - Algo stops if (length(which(mat.pvals < pt))==0) break # Which variable has lowest p-value? add.var <- features[(which(mat.pvals==min(mat.pvals)))][1] # We will add this variable to the set of good features good.features <- c(good.features,add.var) # We will remove this variable from the set of remaining features features <- features[-which(features==add.var)] # Now we repeat the procedure with remaining features until we: # - either have only features that have a p-value above threshold # - or we are with one left } spec <- gen.fm(dep=dep,x=good.features) return(lm(spec,data=dt)) } # And we can create model 7 and corresponding predictions m7 <- fwelim(dt=ocr,pt=0.1,dep='price',features=features) p7 <- predict(m7,new=oct) p7[p7<0] <- min(ocr$price) # Bi-directional elimination # The goal is to create a function that will perform the bi-directional elimination. Recall the algorithm: # * Select a threshold p-value. # * Estimate model with an intercept only. # * Add one feature to the specification that has the lowest p-value. # * Check if any other variables has now p-value below the threshold and if yes, remove. # * Repeat until you are unable to find a feature with p-value below the threshold. # We will create a function() that will run the algorithm. bielim <- function(dt=ocr,pt=0.1,dep='price',features=features) { # Number of features NV <- length(features) # We start by the baseline model with only a constant spec <- gen.fm(dep=dep,x=NULL) # Estimate model m <- lm(spec,data=dt) # Here we will store variables that 'pass' the test good.features <- c() # Here we will store variable that should be tested test.features <- features # In forward elimination we need to start to add the 'best' variable at a time. # We loop over all variables - initially (v=1) we have NV variables to consider. Finally, only one is left. pvalue <- 1 while (pvalue > pt) { # Remaining number of features NF <- length(test.features) # Forward elimination first # Store pvalues mat.pvals <- matrix(NA,nrow=NF,ncol=1) rownames(mat.pvals) <- test.features # We need to try to add one-at-a-time all remaining features for (f in 1:NF) { # Generate specification spec <- gen.fm(dep=dep,x=c(good.features,test.features[f])) # Estimate model model <- lm(spec,data=dt) # Find p-values estim <- coeftest(model,vcov=vcovHC(model,type='HC1')) # Store p-values mat.pvals[f,1] <- estim[dim(estim)[1],4] } # Check if there is at least one pvalue lower as threshold - if not - we are done - Algo stops if (length(which(mat.pvals < pt))==0) break # Which variable has lowest p-value? add.var <- rownames(mat.pvals)[which(mat.pvals==min(mat.pvals))] # We will add this variable to the set of good features good.features <- c(good.features,add.var) # We will remove this variable from the set of remaining features test.features <- features[-which(features %in% good.features)] # Backward elimination second tmp <- bwelim(dt=ocr,pt=pt,dep=dep,features=good.features) good.features <- names(tmp$coefficients[-1]) test.features <- features[-which(features%in%good.features)] } spec <- gen.fm(dep=dep,x=good.features) return(lm(spec,data=dt)) } # And we can create model 8 and corresponding predictions m8 <- bielim(dt=ocr,pt=0.1,dep='price',features=features) p8 <- predict(m8,new=oct) p8[p8<0] <- min(ocr$price) # Forecast evaluation # Let's store the predictions from eight models into a matrix: predictions = cbind(oct$price,p1,p2,p3,p4,p5,p6,p7,p8) head(predictions) colnames(predictions) = c('price',paste('p',1:8,sep='')) # Now we find the Mean Square Error and Mean Absolute Error: mse = predictions[,-1] mae = mse for (i in 1:8) { mse[,i] = (predictions[,1] - predictions[,i+1])^2 mae[,i] = abs(predictions[,1] - predictions[,i+1]) } # What are the average loss values? apply(mse,2,mean) apply(mae,2,mean) # p6 and p8 lead to same model and even p7 is almost the same cor(predictions) # Which model is the best one? # For MCS we need only models that have not the same predictions (correlation 1.0) head(mse) mse <- mse[,-c(6,8)] mae <- mae[,-c(6,8)] library(MCS) MCSprocedure(mse) MCSprocedure(mae) # Under MSE and MAE we have one model left -> p7. One model rulezz them all! ################################################################################ # Let's continue with penalized regressions # LASSO # Load package and write down the features we would like to include (everything) library(glmnet) # We will work with octavia dataset features <- c('km','age','kw','trans','combi','allw','ambi','styl','rs', 'dsg','scr','diesel','lpg','hybrid','cng','km2','age2','kmage','kmkw','km3','age3','kmkwage') # The function we are going to use requires features to be in a matrix object X = as.matrix(ocr[,features]) # Outcome variable to be a matrix object as well Y = as.matrix(ocr[,'price']) # We do the same for the testing dataset XT = as.matrix(oct[,features]) # Now we need to estimate \lambda using cross-validation. The function that will do that is cv.glmnet(). # Here 'nfolds' is the number of cross-validation samples (slicing of the training dataset) # 'type.measure' is the loss function that will be used to decide what lambda is the best, # 'alpha' [0,1] parameter is a weight between LASSO = 0 to RIDGE = 1 # Estimate the LASSO cvm = cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=10,alpha=1) # Visualize the MSE as a function of log(lambda) plot(cvm) # You can check out coefficients for model with lambda-min coefficients(cvm,s='lambda.min') # or if lambda is 1se away from lambda-min coefficients(cvm,s='lambda.1se') # Now let's predict under lambda.min & lamba.1se p9 = predict(cvm,newx = XT,s='lambda.min') p9[p9<0] = min(ocr$price,na.rm=T) p10 = predict(cvm,newx = XT,s='lambda.1se') p10[p10<0] = min(ocr$price,na.rm=T) ## Ridge # The procedure is very similar just change the alpha = 0 # Estimate the Ridge cvm = cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=10,alpha=0) # Visualize the MSE as a function of log(lambda) plot(cvm) # You can check out coefficients for model with lambda-min coefficients(cvm,s='lambda.min') # or if lambda is 1se away from lambda-min coefficients(cvm,s='lambda.1se') # Now let's predict under lambda.min & lamba.1se p11 = predict(cvm,newx = XT,s='lambda.min') p11[p11<0] = min(ocr$price,na.rm=T) p12 = predict(cvm,newx = XT,s='lambda.1se') p12[p12<0] = min(ocr$price,na.rm=T) ## Elastic Net # It is similar with Elastic net. Here you can use any alpha [0,1]. # But what value to use? We use 0.5 for now. # Estimate the Elastic Net cvm = cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=10,alpha=0.5) # Visualize the MSE as a function of log(lambda) plot(cvm) # You can check out coefficients for model with lambda-min coefficients(cvm,s='lambda.min') # or if lambda is 1se away from lambda-min coefficients(cvm,s='lambda.1se') # Now let's predict under lambda.min & lamba.1se p13 = predict(cvm,newx = XT,s='lambda.min') p13[p13<0] = min(ocr$price,na.rm=T) p14 = predict(cvm,newx = XT,s='lambda.1se') p14[p14<0] = min(ocr$price,na.rm=T) ################################################################################ # Advanced # Task - write a code that will cross-validate alpha parameter, e.g. which from seq(0.1,0.9,by=0.1) alpha to use? ################################################################################ ## Adaptive LASSO # Also a combination of LASSO and Ridge # Step 1) Estimate Ridge model (it could be an OLS as well) cvm = cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=10,alpha=0) coefficients(cvm,s='lambda.min') # Step 2) The larger the coefficient the lower the penalization in the next step wgt = 1/abs(coefficients(cvm,s='lambda.min')) # Now the weights are normalized so that they sum to 1 wgt = wgt/sum(wgt) sum(wgt) # Step 3) Estimate LASSO where penalization is not the same for every coefficient cvm = cv.glmnet(x=X,y=Y,type.measure='mse',nfolds=10,alpha=1,penalty.factor=wgt[-1]) coefficients(cvm,s='lambda.min') coefficients(cvm,s='lambda.1se') # Predictions p15 = predict(cvm,newx = XT,s='lambda.min') p15[p15<0] = min(ocr$price,na.rm=T) p16 = predict(cvm,newx = XT,s='lambda.1se') p16[p16<0] = min(ocr$price,na.rm=T) # Forecast evaluation # How are we doing? predictions = cbind(oct$price,p1,p2,p3,p4,p5,p6,p9,p10,p11,p12,p13,p14,p15,p16) head(predictions) colnames(predictions) = c('price',paste('p',1:6,sep=''),paste('p',9:16,sep='')) # How many models do we have? M <- ncol(predictions)-1 # Now we find the Mean Square Error and Mean Absolute Error: mse = predictions[,-1] mae = mse for (i in 1:M) { mse[,i] = (predictions[,1] - predictions[,i+1])^2 mae[,i] = abs(predictions[,1] - predictions[,i+1]) } # What are the average loss values? apply(mse,2,mean) apply(mae,2,mean) # Which model is the best one? library(MCS) MCSprocedure(mse) # There is a group of similarly performing models now MCSprocedure(mae) # There is just model preferred model ################################################################################ # Competition # Task # Load a (small) dataset with rental prices apartments <- read.csv(file='C:\\Users\\ckt\\OneDrive - MUNI\\Institutions\\MU\\ML Finance\\2024\\datasets\\aprt1.csv') # Separate into training and testing set.seed(99) N <- dim(apartments)[1] idx <- sample(1:N,size=floor(0.7*N),replace=FALSE) train <- apartments[idx ,] test <- apartments[-idx,] head(train) # The goal is to predict the rent. Run a competition with multiple models. Use only OLS type of models, but you are encouraged to create your own features (use interactions, dummy variables, etc.). Try to beat others!