# title: "Seminar_3" # author: "Stefan Lyocsa" # date: "2024-10-09" # output: html_document ################################ # Content ## Data cleaning and manipulation. ## Separating data into training and testing. ## Summary statistics: ## + mean, ## + sd, ## + quantiles, ## + skewness, ## + kurtosis, ## + normality test, ## + correlation table. # Visualize data: Overlapping histograms, Ordered box-plots, x-y plots. # Estimate OLS models: # Given model specifications. # Your own model specifications. # Model prediction # Forecast evaluation: ## + Statistical loss function. ## + Model confidence set ################################ ################################ # Data cleaning and manipulation ################################ # Load the original dataset into the environment oc <- read.csv(file='C://Users//ckt//OneDrive - MUNI//Institutions//MU//ML Finance//2024//Week 3//octavia.csv',dec='.',sep=',') # Let's inspect the small data. head(oc,n=10) tail(oc,n=10) # It seems that in the 'fuel' column we have special characters. Let's get rid of that. # First we transform the values in the column to strings (text). oc$fuel <- as.character(oc$fuel) table(oc$fuel) # Now we create three dummy variables. One for each type of fuel: oc$diesel <- (oc$fuel == 'Diesel')*1 oc$petrol <- (oc$fuel == 'Benz\xedn')*1 oc$petgas <- (oc$fuel == 'Benz\xedn+Plyn')*1 head(oc,n=10) # It seems good now. We remove the fuel variable, but also the first column seems uninformative. So why have it? oc[,c('X','fuel')] = NULL head(oc,n=10) # I would also rename variables: names(oc)[c(4,5,6)] = c('power','region','transmission') names(oc) # We could transform year into age. The data were retrieved in 2022 so the age is 2022 - year oc$age = 2022-oc$year summary(oc$age) # But we notice that there are not many older cars. Perhaps we should winsorize: table(oc$age) oc$age[oc$age >= 18] = 18 table(oc$age) # The mileage is a little bit suspicious - why? summary(oc$km) # I also do not like the unnecessary large numbers oc$km = oc$km/1000 # Let's visualize the data using a simple histogram hist(oc$km,prob=T) # We can do better: hist(oc$km,breaks=20,prob=T,yaxt='n',xaxt='n',density=30,col=rgb(0.5,0.5,0.5,alpha=0.8),xlab=c(),ylab=c(),main='') axis(1,at=seq(from=0,to=max(oc$km),by=100),label=seq(from=0,to=max(oc$km),by=100),cex.axis=0.85) axis(2,at=seq(from=0,to=0.01,by=0.001),label=seq(from=0,to=0.01,by=0.001),cex.axis=0.85,las=2) legend('bottomright',bty='n',legend='Kilometers') legend('topleft',bty='n',legend='Density') # Relatively unused cars are perhaps unique and deserve a separate variable oc$km10 = (oc$km<10)*1 # Also notice that mileage is right-skewed so perhaps we should use a log-transform hist(log(oc$km)) # But we will not, the right-skewness just transformed into left-skewness. # Let's take a look at the power of the cars. summary(oc$power) table(oc$power) # It seems like there might be errors but also power levels. Let's create dummies for different ranges of car power. Here the domain knowledge might guide us to make more accurate power categories. 
# Let's take a look at the power of the cars.
summary(oc$power)
table(oc$power)
# It seems like there might be errors, but also distinct power levels. Let's
# create dummies for different ranges of car power. Domain knowledge might
# guide us towards more accurate power categories.
oc$power_lowest  <- (oc$power < 77)*1
oc$power_low     <- (oc$power >= 77 & oc$power < 85)*1
oc$power_mid     <- (oc$power >= 85 & oc$power < 103)*1
oc$power_high    <- (oc$power >= 103 & oc$power < 118)*1
oc$power_highest <- (oc$power >= 118)*1 # '>=' so that exactly 118 kW is not left uncategorized

# We could also create interaction terms:
oc$eng1 <- (oc$power == 77  & oc$diesel == 1)*1
oc$eng2 <- (oc$power == 81  & oc$diesel == 1)*1
oc$eng3 <- (oc$power == 85  & oc$diesel == 1)*1
oc$eng4 <- (oc$power == 103 & oc$diesel == 1)*1
oc$eng6 <- (oc$power == 110 & oc$diesel == 1)*1

# Now let's focus on the price - the key variable of interest.
summary(oc$price)
hist(oc$price, prob=FALSE, breaks=15)
# dev.off() # use this if you want to reset plot attributes (layout, text size, etc.)
hist(oc$price, breaks=15, prob=FALSE, yaxt='n', xaxt='n', ylim=c(0,180), xlim=c(0,40000),
     density=30, col=rgb(0.5,0.5,0.5,alpha=0.8), xlab=c(), ylab=c(), main='')
axis(1, at=seq(from=0,to=40000,by=5000), labels=seq(from=0,to=40000,by=5000), cex.axis=0.85)
axis(2, at=seq(from=0,to=180,by=30), labels=seq(from=0,to=180,by=30), cex.axis=0.85)
legend('bottomright', bty='n', legend='Price')
legend('topleft', bty='n', legend='Frequency') # prob=FALSE, so the y-axis shows counts, not a density

# For now, we let the price be what it is. Finalize the dataset and remove
# variables we no longer need ('fuel' was already removed above).
names(oc)
oc[,c('year','power','region')] <- NULL
names(oc)

#############################################
# Import new data from 2024
# Note: this replaces the 2022 data cleaned above; the 2024 file already
# contains the dummy variables used in the models below.
oc <- read.csv(file='C://Users//ckt//OneDrive - MUNI//Institutions//MU//ML Finance//2024//octavia2024.csv',
               dec='.', sep=',')
oc <- oc[,-1]
oc$km  <- oc$km/1000
oc$age <- 2024 - oc$year
oc$year <- NULL

#############################################
# Separating data into training and testing

# The split should be random. To make the analysis reproducible, we fix the
# initial state of the pseudo-random number generator.
set.seed(50)

# Now we randomly select 80% of the observations into the training dataset and
# the remaining 20% into the testing dataset.
N   <- dim(oc)[1]
idx <- sample(1:N, size=floor(0.8*N), replace=FALSE)
ocr <- oc[ idx,] # training data
NR  <- dim(ocr)[1]
oct <- oc[-idx,] # testing data
NT  <- dim(oct)[1]

# Let's save the data before we start the analysis:
write.csv(oct, file='C://Users//ckt//OneDrive - MUNI//Institutions//MU//ML Finance//2024//oct.csv')
write.csv(ocr, file='C://Users//ckt//OneDrive - MUNI//Institutions//MU//ML Finance//2024//ocr.csv')

################################
# Summary statistics
################################

# TASK
# Home assignment: Write a function() that will return 2 tables:
# * Descriptive statistics for given variables.
# * Correlation table with statistical significance.
# Find out if there are any two features that are excessively correlated (say above 0.95?).

NV <- dim(ocr)[2]
table1 <- matrix(NA, nrow=NV, ncol=11) # Mean, SD, min, 5%, 25%, Median, 75%, 95%, max, skew, kurt
rownames(table1) <- names(ocr)
colnames(table1) <- c("Mean","SD","min","5%","25%","Median","75%","95%","max","skew","kurt")
library(moments)
for (i in 1:NV) {
  x <- na.omit(ocr[,i]) # na.omit() has no 'na.rm' argument
  table1[i,] <- c(mean(x), sd(x),
                  quantile(x, probs=c(0,0.05,0.25,0.50,0.75,0.95,1)),
                  skewness(x), kurtosis(x))
}
table1 <- round(table1, 2)

# Correlation table: estimates in the upper triangle, p-values in the lower triangle.
table2 <- matrix(NA, nrow=NV, ncol=NV)
rownames(table2) <- names(ocr)
colnames(table2) <- names(ocr)
for (i in 1:(NV-1)) {
  for (j in (i+1):NV) {
    m <- cor.test(ocr[,i], ocr[,j], method='spearman')
    # The warnings: the dummy variables produce ties, so exact p-values cannot
    # be computed (method='kendall' is an alternative).
    table2[i,j] <- m$estimate
    table2[j,i] <- m$p.value
  }
}
table2 <- round(table2, 2)
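# To answer the question above, one option is to scan the upper triangle of
# table2 (which holds the correlation estimates) for excessive values. A
# minimal sketch, assuming the 0.95 cutoff suggested in the task:
high_corr <- which(abs(table2) > 0.95 & upper.tri(table2), arr.ind=TRUE)
if (nrow(high_corr) > 0) {
  data.frame(var1 = rownames(table2)[high_corr[,1]],
             var2 = colnames(table2)[high_corr[,2]],
             rho  = table2[high_corr])
} else {
  'No feature pair exceeds the 0.95 threshold.'
}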
################################
# Visualize data: Overlapping histograms, Ordered box-plots, x-y plots
################################

# Let's take a look at some examples of plots. Overlapping histograms?
par(mfrow=c(1, 1)) # layout of figures - (rows, columns)
par(cex = 1.1)
par(oma = c(2, 2.0, 1.0, 1.0))
par(tcl = -0.25)
par(mgp = c(2, 0.6, 0))
par(mar = c(2.0, 3.0, 1.5, 0.5))
hist(ocr$price[ocr$trans==1], breaks=15, prob=TRUE, xaxt='n',
     xlim=c(0,40000), density=10, col=rgb(0.85,0.5,0.05,alpha=0.9),
     xlab=c(), ylab=c(), main='', cex.axis=0.55,
     ylim=c(0,9.5^(-4))) # note: 9.5^(-4) evaluates to roughly 1.2e-4
hist(ocr$price[ocr$trans==0], breaks=15, prob=TRUE, add=TRUE, # 'ocr', not 'oc': we subset the training data
     col=rgb(0.15,0.85,0.85,alpha=0.9), density=10)
axis(1, at=seq(from=0,to=40000,by=5000), labels=seq(from=0,to=40000,by=5000), cex.axis=0.65)
lines(density(ocr$price[ocr$trans==1]), col=rgb(0.85,0.5,0.05), lwd=1.25, lty=2)
lines(density(ocr$price[ocr$trans==0]), col=rgb(0.15,0.85,0.85), lwd=1.25, lty=2)
legend('topright', bty='n',
       legend=c('Distribution of price\nautomatic shift\n','Distribution of price\nmanual shift'),
       col=c(rgb(0.85,0.5,0.05), rgb(0.15,0.85,0.85)), cex=0.75, lty=1)

# Separated box-plots?
boxplot(price~trans, data=ocr, pch=19, cex=0.35, yaxt='n', xlab='',
        ylab='Price of the car', xaxt='n', cex.lab=0.75)
axis(2, at=seq(from=0,to=40000,by=5000), labels=seq(from=0,to=40000,by=5000), cex.axis=0.65, las=2)
axis(1, at=c(1,2), labels=c('Manual shift','Automatic shift'), cex.axis=0.65)

# Separated x-y plots?
par(mfrow=c(1, 1))
par(cex = 1.1)
par(oma = c(2, 2.0, 1.0, 1.0))
par(tcl = -0.25)
par(mgp = c(2, 0.6, 0))
par(mar = c(3.0, 3.0, 1.5, 0.5))
plot(x=ocr$km[ocr$age > 9], y=ocr$price[ocr$age > 9], pch=19, cex=0.5,
     col='black', ylab='Price', xlab='Kilometers',
     ylim=c(0,max(ocr$price)), xlim=c(0,max(ocr$km)),
     cex.axis=0.85, cex.lab=0.95)
abline(lm(price~km, data=ocr[ocr$age > 9,]), lwd=1.5)
points(x=ocr$km[ocr$age <= 9], y=ocr$price[ocr$age <= 9], col='red', pch=19, cex=0.5)
abline(lm(price~km, data=ocr[ocr$age <= 9,]), lwd=1.5, col='red')
legend('topright', bty='n',
       legend=c('Dependence for older cars\n(9+ years)\n','Dependence for younger cars\n(9- years)'),
       col=c('black','red'), cex=0.75, lty=1)

################################
# Estimate OLS models
################################

# Start with the simplest of models. Note that the model is estimated on the
# training dataset!
m1 <- lm(price ~ km, data=ocr)
summary(m1)

# Let's try age, and then combine the two.
m2 <- lm(price ~ age, data=ocr)
summary(m2)
m3 <- lm(price ~ km + age, data=ocr)
summary(m3)

# Now let's add an interaction term.
m4 <- lm(price ~ km + age + I(km*age), data=ocr)
summary(m4)

# What if we add almost everything?
m5 <- lm(price ~ km + age + trans + combi + allw + ambi + styl + rs + dsg + scr +
           diesel + lpg + hybrid + cng + I(km*age), data=ocr)
summary(m5)
# We cannot include dummies for all 'fuel' types, otherwise we run into perfect collinearity.
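# Before turning to prediction, a quick in-sample comparison of the five
# specifications can be instructive. A minimal sketch (adjusted R-squared and
# AIC are my choice of criteria; good in-sample fit need not imply good
# out-of-sample forecast accuracy):
models <- list(m1=m1, m2=m2, m3=m3, m4=m4, m5=m5)
data.frame(adj.R2 = sapply(models, function(m) summary(m)$adj.r.squared),
           AIC    = sapply(models, AIC))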
################################
# Model prediction
################################

# Now we use models m1 to m5 to make predictions on the testing data. First,
# I'll create a matrix to store the true price along with all the predictions:
predictions <- matrix(NA, nrow=NT, ncol=5+1)
colnames(predictions) <- c('True', paste('p',1:5,sep=''))
predictions[,1] <- oct$price

# The predictions themselves, model by model.
p1 <- predict(m1, newdata=oct); summary(p1)
# Notice that the model predicted negative prices... is that possible? What to do?
# Here we replace negative predictions with the smallest positive prediction.
p1[p1 < 0] <- min(p1[p1 > 0]); summary(p1)
predictions[,2] <- p1
p2 <- predict(m2, newdata=oct); summary(p2)
p2[p2 < 0] <- min(p2[p2 > 0]); summary(p2)
predictions[,3] <- p2
p3 <- predict(m3, newdata=oct); summary(p3)
p3[p3 < 0] <- min(p3[p3 > 0]); summary(p3)
predictions[,4] <- p3
p4 <- predict(m4, newdata=oct); summary(p4)
p4[p4 < 0] <- min(p4[p4 > 0]); summary(p4)
predictions[,5] <- p4
p5 <- predict(m5, newdata=oct); summary(p5)
# Again:
p5[p5 < 0] <- min(p5[p5 > 0]); summary(p5)
predictions[,6] <- p5

################################
# Forecast evaluation
################################

# Statistical loss function
# We evaluate the forecasts using the mean squared error (MSE) loss function.
mse <- matrix(NA, nrow=NT, ncol=5)
colnames(mse) <- paste('m',1:5,sep='') # model names make the MCS output easier to read
for (i in 1:5) mse[,i] <- (predictions[,1] - predictions[,i+1])^2
apply(mse, 2, mean)

# We can also check a figure - predicted vs. true.
par(mfrow=c(1, 1))
par(cex = 1.1)
par(oma = c(2, 2.0, 1.0, 1.0))
par(tcl = -0.25)
par(mgp = c(2, 0.6, 0))
par(mar = c(3.0, 3.0, 1.5, 0.5))
plot(x=predictions[,1], y=predictions[,2], pch=19, cex=0.5,
     col='black', ylab='Predicted', xlab='True',
     cex.axis=0.85, cex.lab=0.95, ylim=range(predictions), xlim=range(predictions))
lines(x=c(0,45000), y=c(0,45000), lwd=2) # 45-degree line: perfect predictions
points(x=predictions[,1], y=predictions[,6], col='red', pch=19, cex=0.5)
legend('bottomright', bty='n', legend=c('Simple model\n','Best model'),
       col=c('black','red'), cex=0.75, pch=19)

# Statistical significance test
# Finally, we run the model confidence set procedure to test which of the
# models - under the MSE loss function - produced the most accurate forecasts.
library(MCS)
MCSprocedure(mse)

## TASK
# * Home assignment: Evaluate the forecasts using the mean absolute error.
# * Who provides the OLS model with the highest accuracy?
# * Does your model have a tendency to under- or over-predict?
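# A possible starting point for the tasks above - a sketch only, reusing the
# 'predictions' matrix (MAE for the first task; the mean forecast error as a
# simple check of under- vs. over-prediction for the last one):
mae <- matrix(NA, nrow=NT, ncol=5)
colnames(mae) <- paste('m',1:5,sep='')
for (i in 1:5) mae[,i] <- abs(predictions[,1] - predictions[,i+1])
apply(mae, 2, mean)
# A positive mean of (true - predicted) indicates a tendency to under-predict.
apply(predictions[,1] - predictions[,-1], 2, mean)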