# Before we continue - run the codes from last seminar + your homework #---------------------------------------------------------------------------------------------------- # PAM - Partition around medoids #---------------------------------------------------------------------------------------------------- # Use PAM to make predictions directly K = 30 A <- Sys.time() pm = pam(zscore.train, k=K, metric = "euclidean", stand = TRUE, nstart=2) Sys.time()-A # 2min names(pm) dim(pm$medoids) pm$clustering clus_char <- table(pm$clustering,train.zopa$default) clus_char clus_char <- cbind(clus_char,clus_char[,2]/apply(clus_char,1,sum)) colnames(clus_char) <- c('NumberGood','NumberBad','ProbD') clus_char # Promising - correlation on the training data seems ok cor(clus_char[pm$clustering,3],train.zopa$default,method='kendall') # However, we need to classify each observation from training and testing datasets # to one of the clusters. # Same procedure as with K-means (we will use the second method) #---------------------------------------------------------------------------------------------------- # Feature engineering - again #---------------------------------------------------------------------------------------------------- # Extract centroid pam.centroid <- pm$medoids # Now we need to assign each observation from training data to one of the clusters. How do I know to which cluster the given observation belongs? # Where is the center of each cluster? In pam.centroid # Here we store information about to which cluster the given observation belongs clu.train = matrix(0,nrow=dim(zscore.train)[1],ncol=K) # Number of clusters K colnames(clu.train) <- paste('hw',c(1:K),sep='') for (i in 1:nrow(zscore.train)) { # We need to find the distance between given observation from testing data to each of the centroid # Combine centroids (they are just point in the feature-space with the given obseration from the training dataset) cld = rbind(pam.centroid,zscore.train[i,]) # Now find the distances between all those observations tmp = as.matrix(dist(cld,method='euclidean')) # 'tmp' is the distance matrix and LAST row are distances from each centroid to the i^{th} observation # Find to what degree/magnitude/percentage the given observation belongs to each cluster clu.train[i,] = (1/tmp[nrow(tmp),1:K])/sum((1/tmp[nrow(tmp),1:K])) } # We could use the following to train the model or to make predictions. # We will only use it to make predictions, just like in KM1 and/or KM2 train2 = data.frame(train.zopa,clu.train) #---------------------------------------------------------------------------------------------------- # We also need to create variables for the testing dataset #---------------------------------------------------------------------------------------------------- clu.test = matrix(0,nrow=dim(zscore.test)[1],ncol=K) # Number of clusters K colnames(clu.test) <- colnames(clu.train) for (i in 1:nrow(zscore.test)) { # We need to find the distance between given observation from testing data to each of the centroid # Combine centroids (they are just point in the feature-space with the given obseration from the training dataset) cld = rbind(centroid,zscore.test[i,]) # Now find the distances between all those observations tmp = as.matrix(dist(cld,method='euclidean')) # 'tmp' is the distance matrix and LAST row are distances from each centroid to the i^{th} observation # Just add 1 there clu.test[i,] = (1/tmp[nrow(tmp),1:K])/sum(1/tmp[nrow(tmp),1:K]) } # Testind dataset test2 = data.frame(test.zopa,clu.test) ################################## # Could we use PAM with Manhattan distances directly for prediction purposes? # Indeed ################################## # Indeed - recall head(clus_char) # From test1 we know which observation belongs to which cluster. pPAM <- c() for (i in 1:dim(test2)[1]) pPAM[i] = weighted.mean(clus_char[,3],w=clu.test[i,]) tmp <- confusionMatrix(as.factor((pPAM>0.5)*1),reference=true,positive='1') results[18,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)]) rocPAM <- roc(response=true,predictor=pPAM) results[18,5] <- as.numeric(auc(rocPAM)) results # Terrible ################################## # PCA # zopa # titanic # octavia # Load data & predictions from Seminar 10 First library(factoextra) ############################# # Extract principal components => use in predictive models ############################# # Z-score scaling names(train.zopa) # Let's select macro-economic environment at the time when the loan is being issues zscore.train <- scale(train.zopa[,169:196]) # We do the same with training dataset! zscore.test <- scale(test.zopa[,169:196]) # Number of variables in sprof m = prcomp(x=zscore.train) str(m) get_eigenvalue(m) # Weights in the matrix m$rotation # Rows are variables and Columns are principal components # Values are weights # Extract scores pcc = zscore.train %*% m$rotation head(pcc) # Check - uncorrelated? round(cor(cbind(pcc,zscore.train)),2) # Indeed! principal components are orthogonal. # How to select number of components fviz_eig(m) # Values above 1? round(get_eig(m),2) # Add to training dataset train3 = data.frame(train.zopa,pcc[,1:8]) # Add to testing dataset pcc = as.matrix(zscore.test) %*% m$rotation test3 = data.frame(test.zopa,pcc) ############################# # Estimate models - ############################# # PLM-PC # Specification with extracted principal components specs = gen.fm(dep='default',x=names(train3)[c(7,11:167,201,207:215)]) # Now spec without profitability measures but instead using 2 PCs # Probability linear model m0 = lm(specs, data=train3) p0 = predict(m0,new=test3) p0[p0<0] = min(p0[p0>0]) p0[p0>1] = max(p0[p0<1]) tmp <- confusionMatrix(as.factor((p0>0.5)*1),reference=true,positive='1') results[19,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)]) roc0 <- roc(response=true,predictor=p0) results[19,5] <- as.numeric(auc(roc0)) # Logistic regression m1 = glm(specs, data=train3, family='binomial') p1 = (1/(1+exp(-predict(m1, new=test3)))) tmp <- confusionMatrix(as.factor((p1>0.5)*1),reference=true,positive='1') results[20,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)]) roc1 <- roc(response=true,predictor=p1) results[20,5] <- as.numeric(auc(roc1)) # The LASSO x = as.matrix(train3[,all.vars(specs)[-1]]) newx = as.matrix(test3[,all.vars(specs)[-1]]) y = as.matrix(train3[,all.vars(specs)[1]]) lasso.lr = cv.glmnet(x=x,y=y,type.measure='deviance',nfolds=10,family='binomial',alpha=1) p21se = as.numeric((1/(1+exp(-predict(lasso.lr,newx=newx,s='lambda.1se'))))) tmp <- confusionMatrix(as.factor((p21se>0.5)*1),reference=true,positive='1') results[21,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)]) roc21se <- roc(response=true,predictor=p21se) results[21,5] <- as.numeric(auc(roc21se)) # Random forest rf.tree = ranger(specs,data=train3, num.trees=1000,mtry=9,min.node.size=5,max.depth=0) p3 = predict(rf.tree,data=test3)$predictions tmp <- confusionMatrix(as.factor((p3>0.5)*1),reference=true,positive='1') results[22,1:4] <- c(tmp$overall[c(1)],tmp$byClass[c(11,1,2)]) roc3 <- roc(response=true,predictor=p3) results[22,5] <- as.numeric(auc(roc3)) round(results,4) ############################# # Predict the price of a used car: # Benchmark: # LR, RF, XGB # Competing: # KM directly # LR-KM, RF-KM, XGB-KM # LR-PC, RF-PC, XGB-PC # Evaluate via MSE, MAE and MCS algorithm + mean absolute percentage error ############################# # First load training and testing data into the workspace # rm(list=ls()) oct <- read.csv(file='C://Users//ckt//OneDrive - MUNI//Institutions//MU//ML Finance//2024//oct.csv') ocr <- read.csv(file='C://Users//ckt//OneDrive - MUNI//Institutions//MU//ML Finance//2024//ocr.csv') # For training sample ocr$km2 <- ocr$km^2 ocr$km3 <- ocr$km^3 ocr$age2 <- ocr$age^2 ocr$age3 <- ocr$age^3 ocr$kmage <- ocr$km * ocr$age ocr$kmkw <- ocr$km * ocr$kw ocr$kmkwage <- ocr$km * ocr$kw * ocr$age # For testing sample oct$km2 <- oct$km^2 oct$km3 <- oct$km^3 oct$age2 <- oct$age^2 oct$age3 <- oct$age^3 oct$kmage <- oct$km * oct$age oct$kmkw <- oct$km * oct$kw oct$kmkwage <- oct$km * oct$kw * oct$age # Store results results = matrix(NA,nrow=10,ncol=5) colnames(results) = c('mse','mse_mcs','mae','mae_mcs','mpe') rownames(results) = c('LR','RF', 'XGB', 'KM', 'LR-KM','RF-KM', 'XGB-KM', 'LR-PC','RF-PC', 'XGB-PC') loss.mse = matrix(NA,nrow=dim(oct)[1],ncol=10) colnames(loss.mse) = rownames(results) loss.mae = loss.mse ############################# # Specification features <- c('km','age','kw','trans','combi','allw','ambi','styl','rs','dsg','scr','diesel','lpg','hybrid','cng','km2','age2','kmage','kmkw','km3','age3','kmkwage') specs <- gen.fm(dep='price',x=features) ############################# # Benchmark models ############################# # LR m0 <- lm(specs,data=ocr) p0 <- predict(m0,new=oct) # RF rf.tree <- ranger(specs,data=ocr,num.trees=5000,mtry=6,min.node.size=10,max.depth=0) p1 <- predict(rf.tree,data=oct)$predictions # XGB gb.tree <- gbm(specs,data=ocr,distribution='gaussian',n.trees=5000,interaction.depth=3,shrinkage=0.01) p2 <- predict(gb.tree,newdata=oct) ############################# # Evaluate benchmark models ############################# p <- oct$price loss.mse[,1] <- (p-p0)^2 loss.mae[,1] <- abs(p-p0) results[1,c(1,3,5)] <- c(mean(loss.mse[,1]),mean(loss.mae[,1]),mean(abs(p-p0)/p)) loss.mse[,2] <- (p-p1)^2 loss.mae[,2] <- abs(p-p1) results[2,c(1,3,5)] <- c(mean(loss.mse[,2]),mean(loss.mae[,2]),mean(abs(p-p1)/p)) loss.mse[,3] <- (p-p2)^2 loss.mae[,3] <- abs(p-p2) results[3,c(1,3,5)] <- c(mean(loss.mse[,3]),mean(loss.mae[,3]),mean(abs(p-p2)/p)) results ############################# # Calculate distance matrix ############################# # Select any number of features & any number of clusters # We use min/max standardization for those two zscore.train <- ocr[,all.vars(specs)[-1]] zscore.train$km <- (max(ocr$km) - zscore.train$km)/(max(ocr$km)-min(zscore.train$km)) zscore.train$age <- (max(ocr$age) - zscore.train$age)/(max(ocr$age)-min(zscore.train$age)) # We also need to standardize data for testing dataset oct. Note the 'subtle' difference zscore.test <- oct[,all.vars(specs)[-1]] zscore.test$km <- (max(ocr$km) - zscore.test$km)/(max(ocr$km)-min(zscore.train$km)) zscore.test$age <- (max(ocr$age) - zscore.test$age)/(max(ocr$age)-min(zscore.train$age)) # The maximum is from training not testing here! ############################# # Calculate K-Means ############################# # Given the distance matrix - Find K clusters K = 25 res.hw <- kmeans(zscore.train[,c('km','age')],centers=K,iter.max=100,nstart=2000,algorithm='Hartigan-Wong') # Check number of observations in cluster table(res.hw$cluster) cluster.id <- unique(res.hw$cluster) price.cluster <- matrix(NA,nrow=K,ncol=3) colnames(price.cluster) <- c('mean','sd','observations') for (i in 1:length(cluster.id)) { tmp <- ocr$price[res.hw$cluster == cluster.id[i]] price.cluster[i,] <- c(mean(tmp),sd(tmp),length(tmp)) } price.cluster[order(price.cluster[,1]),] # There is some heterogeneity between clusters and prices -> perhaps we are able to identify # characteristics of cheap/expensive cars better. #---------------------------------------------------------------------------------------------------- # Feature engineering # Perhaps we can create variables that will tell to which cluster the given car belongs to. # Now let's create dummies corresponding to K-1 clusters, i.e. create new features. 1 if the observation belongs to the given cluster and 0 otherwise clu.train = dummy_cols(res.hw$cluster) head(clu.train) clu.train = clu.train[,-1] colnames(clu.train) = paste('hw',c(1:K),sep='') head(clu.train) # The 'clu.train' are our new features 0/1 that we can use to train a model train1 = data.frame(ocr,clu.train) #---------------------------------------------------------------------------------------------------- # Now we need to assign each observation from TESTING data to one of the clusters. How do I know to which cluster the given observation belongs? # Where is the center of each cluster? centroid <- res.hw$centers head(centroid) clu.test = matrix(0,nrow=dim(zscore.test)[1],ncol=K) # Number of clusters K colnames(clu.test) <- colnames(clu.train) for (i in 1:nrow(zscore.test)) { # We need to find the distance between given observation from testing data to each of the centroid # Combine centroids (they are just point in the feature-space with the given obseration from the training dataset) cld = rbind(centroid,zscore.test[i,c('km','age')]) # Now find the distances between all those observations tmp = as.matrix(dist(cld,method='euclidean')) # 'tmp' is the distance matrix and LAST row are distances from each centroid to the i^{th} observation # tmp[dim(tmp)[1],1:K] # Find to which centroid the observation is closest to opt.clus = which(tmp[dim(tmp)[1],1:K]==min(tmp[dim(tmp)[1],1:K])) # Just add 1 there clu.test[i,opt.clus] = 1 } # Check if we have observations in every cluster apply(clu.test,2,mean) # Note: there might be observations that do not belong to any cluster # We create two training and testing datasets with either one or the second approach of assignment of an observation to a cluster test1 = data.frame(oct,clu.test) ############################# # Use K-Means clusters to predict car's price # Recall # From test1 we know which observation belongs to which cluster. # We can set-up a rule # If ProbD > 0.5 => defaulted loan p3 <- c() for (i in 1:dim(test1)[1]) p3[i] = price.cluster[which(clu.test[i,] == 1),1] ############################# # Competing models features <- c('km','age','kw','trans','combi','allw','ambi','styl','rs','dsg','scr','diesel','lpg','hybrid','cng','km2','age2','kmage','kmkw','km3','age3','kmkwage', paste('hw',1:(K-1),sep='')) # why K-1? -> multicollinearity specs <- gen.fm(dep='price',x=features) # LR m4 <- lm(specs,data=train1) p4 <- predict(m4,new=test1) # RF rf.tree <- ranger(specs,data=train1,num.trees=5000,mtry=9,min.node.size=10,max.depth=0) p5 <- predict(rf.tree,data=test1)$predictions # XGB gb.tree <- gbm(specs,data=train1,distribution='gaussian',n.trees=5000,interaction.depth=3,shrinkage=0.01) p6 <- predict(gb.tree,newdata=test1) ############################# # Evaluate competing models ############################# loss.mse[,4] <- (p-p3)^2 loss.mae[,4] <- abs(p-p3) results[4,c(1,3,5)] <- c(mean(loss.mse[,4]),mean(loss.mae[,4]),mean(abs(p-p3)/p)) loss.mse[,5] <- (p-p4)^2 loss.mae[,5] <- abs(p-p4) results[5,c(1,3,5)] <- c(mean(loss.mse[,5]),mean(loss.mae[,5]),mean(abs(p-p4)/p)) loss.mse[,6] <- (p-p5)^2 loss.mae[,6] <- abs(p-p5) results[6,c(1,3,5)] <- c(mean(loss.mse[,6]),mean(loss.mae[,6]),mean(abs(p-p5)/p)) loss.mse[,7] <- (p-p6)^2 loss.mae[,7] <- abs(p-p6) results[7,c(1,3,5)] <- c(mean(loss.mse[,7]),mean(loss.mae[,7]),mean(abs(p-p6)/p)) results ############################# # Extract principal components => use in predictive models ############################# # Z-score scaling features <- c('kw','trans','combi','allw','ambi','styl','rs','dsg','scr','diesel','lpg','hybrid','cng') # Let's select all variables and use min-max standardization NF <- length(features) mm.train <- ocr mm.test <- oct for (i in 1:NF) { # Training dataset standardization x <- ocr[,features[i]] max.val <- max(x) min.val <- min(x) x <- (max(x) - x)/(max(x)-min(x)) mm.train[,which(names(mm.train)==features[i])] <- x # Testing dataset standardization x <- oct[,features[i]] # Note that next we are using max and min values from the TRAINING dataset x <- (max.val - x)/(max.val-min.val) mm.test[,which(names(mm.train)==features[i])] <- x } # Check apply(mm.train,2,summary) apply(mm.test ,2,summary) # Number of variables in sprof m = prcomp(x=mm.train[,features]) str(m) get_eigenvalue(m) # Weights in the matrix m$rotation # Rows are variables and Columns are principal components # Values are weights # Extract principal components pcc = as.matrix(mm.train[,features]) %*% m$rotation head(pcc) # Check - uncorrelated? round(cor(cbind(pcc,mm.train)),2) # Indeed! principal components are orthogonal. # How to select number of components fviz_eig(m) # First 5 seem to be important # Values above 1? round(get_eig(m),2) # First 9 seem to explain 99% of variance in the data - use that! # Add to training dataset mm.train = data.frame(mm.train,pcc[,1:9]) # Add to testing dataset pcc = as.matrix(mm.test[,features]) %*% m$rotation mm.test = data.frame(mm.test,pcc) ############################# # Competing models features <- c('km','age','km2','age2','kmage','kmkw','km3','age3','kmkwage', paste('PC',1:9,sep='')) specs <- gen.fm(dep='price',x=features) # LR lr <- lm(specs,data=mm.train) p7 <- predict(lr,new=mm.test) # RF -> I used mtry=3 not 9 as we have fewer features rf.tree <- ranger(specs,data=mm.train,num.trees=5000,mtry=4,min.node.size=10,max.depth=0) p8 <- predict(rf.tree,data=mm.test)$predictions # XGB gb.tree <- gbm(specs,data=mm.train,distribution='gaussian',n.trees=5000,interaction.depth=3,shrinkage=0.01) p9 <- predict(gb.tree,newdata=mm.test) ############################# # Evaluate competing models ############################# loss.mse[,8] <- (p-p7)^2 loss.mae[,8] <- abs(p-p7) results[8,c(1,3,5)] <- c(mean(loss.mse[,8]),mean(loss.mae[,8]),mean(abs(p-p7)/p)) loss.mse[,9] <- (p-p8)^2 loss.mae[,9] <- abs(p-p8) results[9,c(1,3,5)] <- c(mean(loss.mse[,9]),mean(loss.mae[,9]),mean(abs(p-p8)/p)) loss.mse[,10] <- (p-p9)^2 loss.mae[,10] <- abs(p-p9) results[10,c(1,3,5)] <- c(mean(loss.mse[,10]),mean(loss.mae[,10]),mean(abs(p-p9)/p)) results library(MCS) MCSprocedure(loss.mse) MCSprocedure(loss.mae)