# Before we continue - run the codes from last seminar + your homework

#----------------------------------------------------------------------------------------------------
# PAM - Partition around medoids
#----------------------------------------------------------------------------------------------------
# Use PAM to make predictions directly
K  = 30
A <- Sys.time()
pm = pam(zscore.train, k=K, metric = "euclidean", stand = TRUE, nstart=2)
Sys.time()-A # 2min
names(pm)
dim(pm$medoids)
pm$clustering
clus_char <- table(pm$clustering,train.zopa$default)
clus_char
clus_char <- cbind(clus_char,clus_char[,2]/apply(clus_char,1,sum))
colnames(clus_char) <- c('NumberGood','NumberBad','ProbD')
clus_char
# Promising - correlation on the training data seems ok
cor(clus_char[pm$clustering,3],train.zopa$default,method='kendall')


# However, we need to classify each observation from training and testing datasets
# to one of the clusters.
# Same procedure as with K-means (we will use the second method)
#----------------------------------------------------------------------------------------------------
# Feature engineering - again
#----------------------------------------------------------------------------------------------------
# Extract centroid
pam.centroid <- pm$medoids
# Now we need to assign each observation from training data to one of the clusters. How do I know to which cluster the given observation belongs?
# Where is the center of each cluster? In pam.centroid
# Here we store information about to which cluster the given observation belongs
clu.train = matrix(0,nrow=dim(zscore.train)[1],ncol=K) # Number of clusters K
colnames(clu.train) <- paste('hw',c(1:K),sep='')
for (i in 1:nrow(zscore.train)) {
  # We need to find the distance between given observation from testing data to each of the centroid
  # Combine centroids (they are just point in the feature-space with the given obseration from the training dataset)
  cld = rbind(pam.centroid,zscore.train[i,])
  # Now find the distances between all those observations
  tmp = as.matrix(dist(cld,method='euclidean'))
  # 'tmp' is the distance matrix and LAST row are distances from each centroid to the i^{th} observation
  # Find to what degree/magnitude/percentage the given observation belongs to each cluster
  clu.train[i,] = (1/tmp[nrow(tmp),1:K])/sum((1/tmp[nrow(tmp),1:K]))
}
# We could use the following to train the model or to make predictions.
# We will only use it to make predictions, just like in KM1 and/or KM2
train2 = data.frame(train.zopa,clu.train)
#----------------------------------------------------------------------------------------------------
# We also need to create variables for the testing dataset
#----------------------------------------------------------------------------------------------------
clu.test = matrix(0,nrow=dim(zscore.test)[1],ncol=K) # Number of clusters K
colnames(clu.test) <- colnames(clu.train)
for (i in 1:nrow(zscore.test)) {
  # We need to find the distance between given observation from testing data to each of the centroid
  # Combine centroids (they are just point in the feature-space with the given obseration from the training dataset)
  cld = rbind(centroid,zscore.test[i,])
  # Now find the distances between all those observations
  tmp = as.matrix(dist(cld,method='euclidean'))
  # 'tmp' is the distance matrix and LAST row are distances from each centroid to the i^{th} observation
  # Just add 1 there
  clu.test[i,] = (1/tmp[nrow(tmp),1:K])/sum(1/tmp[nrow(tmp),1:K])
}
# Testind dataset
test2 = data.frame(test.zopa,clu.test)

##################################
# Could we use PAM with Manhattan distances directly for prediction purposes?
# Indeed
##################################
# Indeed - recall
head(clus_char)
# From test1 we know which observation belongs to which cluster.
pPAM <- c()
for (i in 1:dim(test2)[1]) pPAM[i] = weighted.mean(clus_char[,3],w=clu.test[i,])
tmp <- confusionMatrix(as.factor((pPAM>0.5)*1),reference=true,positive='1')
results[18,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
rocPAM <- roc(response=true,predictor=pPAM)
results[18,5]   <- as.numeric(auc(rocPAM))
results
# Terrible
##################################


# PCA
# zopa
# titanic
# octavia

# Load data & predictions from Seminar 10 First
library(factoextra)

#############################
# Extract principal components => use in predictive models
#############################
# Z-score scaling
names(train.zopa)
# Let's select macro-economic environment at the time when the loan is being issues
zscore.train <- scale(train.zopa[,169:196])
# We do the same with training dataset!
zscore.test  <- scale(test.zopa[,169:196])
# Number of variables in sprof
m = prcomp(x=zscore.train)
str(m)
get_eigenvalue(m)
# Weights in the matrix
m$rotation
# Rows are variables and Columns are principal components
# Values are weights
# Extract scores
pcc = zscore.train %*% m$rotation
head(pcc)
# Check - uncorrelated?
round(cor(cbind(pcc,zscore.train)),2)
# Indeed! principal components are orthogonal.
# How to select number of components
fviz_eig(m)
# Values above 1?
round(get_eig(m),2)
# Add to training dataset
train3 = data.frame(train.zopa,pcc[,1:8])
# Add to testing dataset
pcc = as.matrix(zscore.test) %*% m$rotation
test3 = data.frame(test.zopa,pcc)

#############################
# Estimate models - 
#############################
# PLM-PC
# Specification with extracted principal components 
specs = gen.fm(dep='default',x=names(train3)[c(7,11:167,201,207:215)])
# Now spec without profitability measures but instead using 2 PCs

# Probability linear model
m0 = lm(specs, data=train3)
p0 = predict(m0,new=test3)
p0[p0<0] = min(p0[p0>0])
p0[p0>1] = max(p0[p0<1])
tmp <- confusionMatrix(as.factor((p0>0.5)*1),reference=true,positive='1')
results[19,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc0 <- roc(response=true,predictor=p0)
results[19,5]   <- as.numeric(auc(roc0))

# Logistic regression
m1 = glm(specs, data=train3, family='binomial')
p1 = (1/(1+exp(-predict(m1, new=test3))))
tmp <- confusionMatrix(as.factor((p1>0.5)*1),reference=true,positive='1')
results[20,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc1 <- roc(response=true,predictor=p1)
results[20,5]   <- as.numeric(auc(roc1))

# The LASSO
x = as.matrix(train3[,all.vars(specs)[-1]])
newx = as.matrix(test3[,all.vars(specs)[-1]])
y = as.matrix(train3[,all.vars(specs)[1]])
lasso.lr = cv.glmnet(x=x,y=y,type.measure='deviance',nfolds=10,family='binomial',alpha=1)
p21se = as.numeric((1/(1+exp(-predict(lasso.lr,newx=newx,s='lambda.1se')))))
tmp <- confusionMatrix(as.factor((p21se>0.5)*1),reference=true,positive='1')
results[21,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc21se <- roc(response=true,predictor=p21se)
results[21,5]   <- as.numeric(auc(roc21se))

# Random forest
rf.tree = ranger(specs,data=train3,
                 num.trees=1000,mtry=9,min.node.size=5,max.depth=0)  
p3 = predict(rf.tree,data=test3)$predictions
tmp <- confusionMatrix(as.factor((p3>0.5)*1),reference=true,positive='1')
results[22,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)])
roc3 <- roc(response=true,predictor=p3)
results[22,5]   <- as.numeric(auc(roc3))
round(results,4)

#############################
# Predict the price of a used car: 
# Benchmark:
# LR, RF, XGB
# Competing:
# KM directly
# LR-KM, RF-KM, XGB-KM
# LR-PC, RF-PC, XGB-PC
# Evaluate via MSE, MAE and MCS algorithm + mean absolute percentage error
#############################
# First load training and testing data into the workspace
# rm(list=ls())
oct <- read.csv(file='C://Users//ckt//OneDrive - MUNI//Institutions//MU//ML Finance//2024//oct.csv')
ocr <- read.csv(file='C://Users//ckt//OneDrive - MUNI//Institutions//MU//ML Finance//2024//ocr.csv')
# For training sample
ocr$km2   <- ocr$km^2
ocr$km3   <- ocr$km^3
ocr$age2  <- ocr$age^2
ocr$age3  <- ocr$age^3
ocr$kmage <- ocr$km * ocr$age
ocr$kmkw  <- ocr$km * ocr$kw
ocr$kmkwage <- ocr$km * ocr$kw * ocr$age
# For testing sample
oct$km2   <- oct$km^2         
oct$km3   <- oct$km^3
oct$age2  <- oct$age^2
oct$age3  <- oct$age^3
oct$kmage <- oct$km * oct$age
oct$kmkw  <- oct$km * oct$kw
oct$kmkwage <- oct$km * oct$kw * oct$age
# Store results
results =            matrix(NA,nrow=10,ncol=5)
colnames(results)  = c('mse','mse_mcs','mae','mae_mcs','mpe')
rownames(results)  = c('LR','RF', 'XGB',
                       'KM',
                       'LR-KM','RF-KM', 'XGB-KM',
                       'LR-PC','RF-PC', 'XGB-PC')
loss.mse           = matrix(NA,nrow=dim(oct)[1],ncol=10)
colnames(loss.mse) = rownames(results)
loss.mae           = loss.mse

#############################
# Specification
features <- c('km','age','kw','trans','combi','allw','ambi','styl','rs','dsg','scr','diesel','lpg','hybrid','cng','km2','age2','kmage','kmkw','km3','age3','kmkwage')
specs    <- gen.fm(dep='price',x=features)

#############################
# Benchmark models
#############################
# LR
m0      <- lm(specs,data=ocr)
p0      <- predict(m0,new=oct)
# RF
rf.tree <- ranger(specs,data=ocr,num.trees=5000,mtry=6,min.node.size=10,max.depth=0)
p1      <- predict(rf.tree,data=oct)$predictions
# XGB
gb.tree <- gbm(specs,data=ocr,distribution='gaussian',n.trees=5000,interaction.depth=3,shrinkage=0.01)
p2      <- predict(gb.tree,newdata=oct)
#############################
# Evaluate benchmark models
#############################
p                    <- oct$price
loss.mse[,1]         <- (p-p0)^2
loss.mae[,1]         <- abs(p-p0)
results[1,c(1,3,5)]  <- c(mean(loss.mse[,1]),mean(loss.mae[,1]),mean(abs(p-p0)/p))

loss.mse[,2]         <- (p-p1)^2
loss.mae[,2]         <- abs(p-p1)
results[2,c(1,3,5)]  <- c(mean(loss.mse[,2]),mean(loss.mae[,2]),mean(abs(p-p1)/p))

loss.mse[,3]         <- (p-p2)^2
loss.mae[,3]         <- abs(p-p2)
results[3,c(1,3,5)]  <- c(mean(loss.mse[,3]),mean(loss.mae[,3]),mean(abs(p-p2)/p))
results

#############################
# Calculate distance matrix
#############################
# Select any number of features & any number of clusters
# We use min/max standardization for those two
zscore.train     <- ocr[,all.vars(specs)[-1]]
zscore.train$km  <- (max(ocr$km) - zscore.train$km)/(max(ocr$km)-min(zscore.train$km))
zscore.train$age <- (max(ocr$age) - zscore.train$age)/(max(ocr$age)-min(zscore.train$age))
# We also need to standardize data for testing dataset oct. Note the 'subtle' difference
zscore.test      <- oct[,all.vars(specs)[-1]]
zscore.test$km  <- (max(ocr$km) - zscore.test$km)/(max(ocr$km)-min(zscore.train$km))
zscore.test$age <- (max(ocr$age) - zscore.test$age)/(max(ocr$age)-min(zscore.train$age))
# The maximum is from training not testing here!

#############################
# Calculate K-Means
#############################
# Given the distance matrix - Find K clusters
K = 25
res.hw                  <- kmeans(zscore.train[,c('km','age')],centers=K,iter.max=100,nstart=2000,algorithm='Hartigan-Wong')
# Check number of observations in cluster
table(res.hw$cluster)
cluster.id              <- unique(res.hw$cluster)
price.cluster           <- matrix(NA,nrow=K,ncol=3)
colnames(price.cluster) <- c('mean','sd','observations')
for (i in 1:length(cluster.id)) {
  tmp               <- ocr$price[res.hw$cluster == cluster.id[i]]
  price.cluster[i,] <- c(mean(tmp),sd(tmp),length(tmp)) 
}
price.cluster[order(price.cluster[,1]),]
# There is some heterogeneity between clusters and prices -> perhaps we are able to identify
# characteristics of cheap/expensive cars better.
#----------------------------------------------------------------------------------------------------
# Feature engineering
# Perhaps we can create variables that will tell to which cluster the given car belongs to.
# Now let's create dummies corresponding to K-1 clusters, i.e. create new features. 1 if the observation belongs to the given cluster and 0 otherwise
clu.train = dummy_cols(res.hw$cluster)
head(clu.train)
clu.train = clu.train[,-1]
colnames(clu.train) = paste('hw',c(1:K),sep='')
head(clu.train)
# The 'clu.train' are our new features 0/1 that we can use to train a model
train1 = data.frame(ocr,clu.train)
#----------------------------------------------------------------------------------------------------
# Now we need to assign each observation from TESTING data to one of the clusters. How do I know to which cluster the given observation belongs?
# Where is the center of each cluster?
centroid <- res.hw$centers
head(centroid)
clu.test = matrix(0,nrow=dim(zscore.test)[1],ncol=K) # Number of clusters K
colnames(clu.test) <- colnames(clu.train)
for (i in 1:nrow(zscore.test)) {
  # We need to find the distance between given observation from testing data to each of the centroid
  # Combine centroids (they are just point in the feature-space with the given obseration from the training dataset)
  cld = rbind(centroid,zscore.test[i,c('km','age')])
  # Now find the distances between all those observations
  tmp = as.matrix(dist(cld,method='euclidean'))
  # 'tmp' is the distance matrix and LAST row are distances from each centroid to the i^{th} observation
  # tmp[dim(tmp)[1],1:K]
  # Find to which centroid the observation is closest to
  opt.clus = which(tmp[dim(tmp)[1],1:K]==min(tmp[dim(tmp)[1],1:K]))
  # Just add 1 there
  clu.test[i,opt.clus] = 1
}
# Check if we have observations in every cluster
apply(clu.test,2,mean)
# Note: there might be observations that do not belong to any cluster
# We create two training and testing datasets with either one or the second approach of assignment of an observation to a cluster
test1 = data.frame(oct,clu.test)

#############################
# Use K-Means clusters to predict car's price
# Recall
# From test1 we know which observation belongs to which cluster.
# We can set-up a rule
# If ProbD > 0.5 => defaulted loan
p3 <- c()
for (i in 1:dim(test1)[1]) p3[i] = price.cluster[which(clu.test[i,] == 1),1]

#############################
# Competing models
features <- c('km','age','kw','trans','combi','allw','ambi','styl','rs','dsg','scr','diesel','lpg','hybrid','cng','km2','age2','kmage','kmkw','km3','age3','kmkwage',
              paste('hw',1:(K-1),sep='')) # why K-1? -> multicollinearity
specs    <- gen.fm(dep='price',x=features)
# LR
m4      <- lm(specs,data=train1)
p4      <- predict(m4,new=test1)
# RF
rf.tree <- ranger(specs,data=train1,num.trees=5000,mtry=9,min.node.size=10,max.depth=0)
p5      <- predict(rf.tree,data=test1)$predictions
# XGB
gb.tree <- gbm(specs,data=train1,distribution='gaussian',n.trees=5000,interaction.depth=3,shrinkage=0.01)
p6      <- predict(gb.tree,newdata=test1)

#############################
# Evaluate competing models
#############################
loss.mse[,4]         <- (p-p3)^2
loss.mae[,4]         <- abs(p-p3)
results[4,c(1,3,5)]  <- c(mean(loss.mse[,4]),mean(loss.mae[,4]),mean(abs(p-p3)/p))

loss.mse[,5]         <- (p-p4)^2
loss.mae[,5]         <- abs(p-p4)
results[5,c(1,3,5)]  <- c(mean(loss.mse[,5]),mean(loss.mae[,5]),mean(abs(p-p4)/p))

loss.mse[,6]         <- (p-p5)^2
loss.mae[,6]         <- abs(p-p5)
results[6,c(1,3,5)]  <- c(mean(loss.mse[,6]),mean(loss.mae[,6]),mean(abs(p-p5)/p))

loss.mse[,7]         <- (p-p6)^2
loss.mae[,7]         <- abs(p-p6)
results[7,c(1,3,5)]  <- c(mean(loss.mse[,7]),mean(loss.mae[,7]),mean(abs(p-p6)/p))
results

#############################
# Extract principal components => use in predictive models
#############################
# Z-score scaling
features <- c('kw','trans','combi','allw','ambi','styl','rs','dsg','scr','diesel','lpg','hybrid','cng')
# Let's select all variables and use min-max standardization
NF       <- length(features)
mm.train <- ocr
mm.test  <- oct
for (i in 1:NF) {
  # Training dataset standardization
  x       <- ocr[,features[i]]
  max.val <- max(x)
  min.val <- min(x)
  x       <- (max(x) - x)/(max(x)-min(x))
  mm.train[,which(names(mm.train)==features[i])] <- x
  
  # Testing dataset standardization
  x       <- oct[,features[i]]
  # Note that next we are using max and min values from the TRAINING dataset
  x       <- (max.val - x)/(max.val-min.val)
  mm.test[,which(names(mm.train)==features[i])] <- x
}
# Check
apply(mm.train,2,summary)
apply(mm.test ,2,summary)
# Number of variables in sprof
m = prcomp(x=mm.train[,features])
str(m)
get_eigenvalue(m)
# Weights in the matrix
m$rotation
# Rows are variables and Columns are principal components
# Values are weights
# Extract principal components
pcc = as.matrix(mm.train[,features]) %*% m$rotation
head(pcc)
# Check - uncorrelated?
round(cor(cbind(pcc,mm.train)),2)
# Indeed! principal components are orthogonal.
# How to select number of components
fviz_eig(m)
# First 5 seem to be important
# Values above 1?
round(get_eig(m),2)
# First 9 seem to explain 99% of variance in the data - use that!
# Add to training dataset
mm.train = data.frame(mm.train,pcc[,1:9])
# Add to testing dataset
pcc = as.matrix(mm.test[,features]) %*% m$rotation
mm.test = data.frame(mm.test,pcc)

#############################
# Competing models
features <- c('km','age','km2','age2','kmage','kmkw','km3','age3','kmkwage',
              paste('PC',1:9,sep=''))
specs    <- gen.fm(dep='price',x=features)
# LR
lr      <- lm(specs,data=mm.train)
p7      <- predict(lr,new=mm.test)
# RF -> I used mtry=3 not 9 as we have fewer features
rf.tree <- ranger(specs,data=mm.train,num.trees=5000,mtry=4,min.node.size=10,max.depth=0)
p8      <- predict(rf.tree,data=mm.test)$predictions
# XGB
gb.tree <- gbm(specs,data=mm.train,distribution='gaussian',n.trees=5000,interaction.depth=3,shrinkage=0.01)
p9      <- predict(gb.tree,newdata=mm.test)

#############################
# Evaluate competing models
#############################
loss.mse[,8]         <- (p-p7)^2
loss.mae[,8]         <- abs(p-p7)
results[8,c(1,3,5)]  <- c(mean(loss.mse[,8]),mean(loss.mae[,8]),mean(abs(p-p7)/p))

loss.mse[,9]         <- (p-p8)^2
loss.mae[,9]         <- abs(p-p8)
results[9,c(1,3,5)]  <- c(mean(loss.mse[,9]),mean(loss.mae[,9]),mean(abs(p-p8)/p))

loss.mse[,10]         <- (p-p9)^2
loss.mae[,10]         <- abs(p-p9)
results[10,c(1,3,5)]  <- c(mean(loss.mse[,10]),mean(loss.mae[,10]),mean(abs(p-p9)/p))

results

library(MCS)
MCSprocedure(loss.mse)
MCSprocedure(loss.mae)