# ==============================================================================
# --------------------------------- SEMINAR 4 ----------------------------------
# ==============================================================================

# install.packages("corrplot")
library(corrplot)
library(ggfortify)
library(dplyr)

# ----------------------------------- TASK 1 -----------------------------------

data <- read.csv("market.csv")
names(data) <- c("nace_R2", "market", "time", "state", "unit", "NA_item", "value")
str(data)
head(data)

unique(data$market) # specifies the different market segments
unique(data$state)  # specifies the country

# DATA CLEANING:
# we are interested only in the individual EU states, not in the summaries,
# so drop the rows with the EU summaries and rename the Germany value:
data <- data[data$state != "European Union - 28 countries (2013-2020)" &
             data$state != "Euro area - 19 countries (from 2015)", ]
data$state[data$state == "Germany (until 1990 former territory of the FRG)"] <- "Germany"

# keep only the variables we are interested in:
data <- data[, c("market", "state", "value")]

# ..................................... A ......................................
# Transform the data from long format to wide format. Use column GEO (indicates
# the state) as the key variable for each row. Use the values in column NACE R2
# LABEL (indicates the market segment) to identify the new columns. Use the
# values from column Value (the percentage of people working in the given segment).

?reshape
data.short <- reshape(data, idvar = "state", timevar = "market", direction = "wide")
# idvar     = unique identifier of the rows
# timevar   = identifier of the columns
# direction = wide format

# in the original table each row represents one distinct combination of
# market segment and country, now each row represents just one country

# rename the columns:
colnames(data.short)[2:12] <- c("Agriculture, forest", "Industry", "Manufacturing",
                                "Construction", "Wholesale and retail", "Information",
                                "Financial", "Real estate", "Science, Technical",
                                "Public administration", "Arts, entertainment")
head(data.short)

# rename the rows:
rownames(data.short) <- data.short$state
head(data.short)

# now we can drop the first column STATE (we have the same information in rownames)
data.short <- data.short[2:12]
head(data.short)

# ..................................... B ......................................
# create a correlation matrix using a built-in
# R function and store it into the variable M
# HINT: use the cor() and corrplot() functions

(M <- cor(data.short))
# correlation measures the linear dependence of two variables:
# countries with a high value of Manufacturing will TEND
# to have a high value of Industry etc.
# zero correlation does not imply independence of the variables
# high correlation does not imply proportionality
# it implies just a tendency in the data, not a rule
# (a small toy illustration of this follows after part C)

corrplot(M, method = "circle")
corrplot.mixed(M, lower.col = "black", diag = "n", tl.pos = "lt")
corrplot(M, order = "hclust", addrect = 4)

# ..................................... C ......................................
# create a single scatter plot for Industry and Manufacturing,
# create a scatter plot matrix for all segments

x <- data.short$Industry      # x axis
y <- data.short$Manufacturing # y axis
plot(x, y, col = "red", pch = 19, xlab = "Industry", ylab = "Manufacturing",
     main = "Industry vs. Manufacturing")
cor(x, y) # strong positive linear dependence

pairs(data.short, lower.panel = NULL, pch = 19, col = "red")
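# A small toy illustration (sketch, made-up data): correlation captures only
# LINEAR dependence - here y is a deterministic function of x, yet the sample
# correlation is (numerically) zero:
x.toy <- seq(-3, 3, by = 0.1)
y.toy <- x.toy^2                  # y is fully determined by x
cor(x.toy, y.toy)                 # approximately 0 despite perfect dependence
plot(x.toy, y.toy, col = "red", pch = 19,
     main = "Zero correlation, yet fully dependent")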
# ..................................... E ......................................
# Use PCA to analyze the data. Use the built-in R function prcomp and the
# autoplot function from the package ggfortify.

data.pca <- prcomp(data.short, scale = TRUE)
str(data.pca)
# rotation = eigenvectors
# x        = principal components
# sdev     = standard deviations of the principal components
#            (square roots of the eigenvalues)

summary(data.pca)
# Standard deviation     = sqrt(eigenvalues)
# Proportion of Variance = variance explained by each principal component
# Cumulative Proportion  = cumulative sum of the eigenvalues divided by their sum =
#   how much of the variance of the original data can be explained using the
#   first few principal components
# for example: the first 4 principal components explain 79.74% of the total variance

autoplot(data.pca)
autoplot(data.pca, data = data.short, label = TRUE, shape = FALSE,
         loadings = TRUE, loadings.label = TRUE)
# important markets in the states of eastern Europe:
#   agriculture, manufacturing and industry
# important markets in the states of western Europe:
#   science and technical
# these market segments divide our observations (countries) into an eastern
# and a western part = our data shows the difference in market structure
# between these two parts of Europe

head(data.pca$rotation)
# the eigenvectors specify the linear combination of all variables used to
# compute the principal components
max(data.pca$rotation[, "PC1"])
which.max(data.pca$rotation[, "PC1"])
# industry is the most important segment for the first principal component
# (strictly speaking, importance is measured by the absolute value of the loading)
# the coordinates of the first eigenvector specify the importance of each variable
# in the first principal component etc.

# ..................................... D ......................................
# FOR VOLUNTEERS
# Use PCA to analyze the data. Do not use the built-in R function prcomp.

# find the covariance matrix of the data:
M <- cov(data.short)

# the total variance s is the sum of the variances of all variables,
# it is also equal to the sum of the eigenvalues of the covariance matrix M:
(s <- sum(diag(M)))

# find the eigenvalues and eigenvectors of the covariance matrix:
s.eigen <- eigen(M)
plot(s.eigen$values, type = 'o', col = "red", pch = 19,
     xlab = 'Eigenvalue Number', ylab = 'Eigenvalue Size', main = 'Scree Graph')
# eigenvalue i = how much of the total variance can be explained by the
# ith principal component

# the first two principal components explain 78.6% of the total variance:
cumsum(s.eigen$values / s)
plot(cumsum(s.eigen$values / s), type = 'o', col = "red", pch = 19,
     xlab = 'Component', ylab = 'accounted variance', main = 'Total variance')

# first two eigenvectors:
c1 <- s.eigen$vectors[, 1]
c2 <- s.eigen$vectors[, 2]

# principal component = original data transformed by multiplication with an
# eigenvector = the data expressed in the new coordinates (of the eigenvectors)
# eigenvector = new coordinate axis

# the first principal component:
pc1 <- as.matrix(data.short) %*% c1
# the second:
pc2 <- as.matrix(data.short) %*% c2

plot(pc1, pc2, col = "red", pch = 19, xlim = c(-26, 16), ylim = c(-23, 7),
     xlab = "First component", ylab = "Second component")
text(pc1, pc2 + 1.5, labels = rownames(data.short), cex = 0.7, font = 2)
# the dependences between states and market segments can be seen very easily
# using just the first two principal components
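# Optional cross-check (a small sketch): the manually computed scores should
# agree with prcomp up to a sign flip of the eigenvectors, provided we compare
# like with like - prcomp centers the data before projecting, and the comparison
# below uses an UNSCALED prcomp because cov() above was computed on the raw
# (unscaled) variables:
centered   <- scale(data.short, center = TRUE, scale = FALSE)
manual.pc1 <- as.numeric(centered %*% s.eigen$vectors[, 1])
pca.unscaled <- prcomp(data.short, scale = FALSE)
head(cbind(manual = manual.pc1, prcomp = pca.unscaled$x[, 1]))
# the two columns should match, possibly with opposite signs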
# ----------------------------------- TASK 2 -----------------------------------
library(ggfortify)

# load the data:
load("customer_behaviour.RData")

# ..................................... A ......................................
# create a new column BIG containing the value 2 if money_spent is greater
# than 5000 and the value 3 otherwise:
data$big <- ifelse(data$money_spent > 5000, 2, 3)

# ..................................... B ......................................
# perform PCA using the built-in function (remember to SCALE your data),
# investigate the summary and store the summary object into a variable called s:
data.pca <- prcomp(data[, 1:5], scale = TRUE)
(s <- summary(data.pca))

# ..................................... C ......................................
# plot the cumulative proportion of the explained variance
# see the structure of the object s
importance <- s$importance[3, ]
plot(importance, col = "red", type = "b", pch = 19,
     xlab = "number of components", ylab = "cumulative proportion explained")
# to explain at least 90% of the total variability
# we need 4 principal components

# ..................................... D ......................................
s$rotation
# the rotation matrix contains the eigenvectors = the new coordinate axes,
# the principal components (the new data) are linear combinations of the
# original observations and these eigenvectors
# the numbers in the first eigenvector represent the importance of the
# different variables for the first principal component etc.
# the most important variables are: for PC1 = money_spent, for PC2 = mail_ads,
# for PC3 = web_visits (the maximal absolute values) etc.

# ..................................... E ......................................
# visualize the first two principal components (use the autoplot() function),
# use the colour input argument to color the data points by two colors
# according to the BIG variable
autoplot(data.pca, data = data, colour = data$big)
autoplot(data.pca, data = data, colour = data$big,
         loadings = TRUE, loadings.label = TRUE)

# ----------------------------------- TASK 3 -----------------------------------
# FOR VOLUNTEERS
library(jpeg)

# PLOT IMAGES:
par(mfrow = c(3, 3), mar = c(1, 1, 1, 1))
for (i in 1:9){
  img <- readJPEG(paste("0", "0", i, ".jpg", sep = ""), native = FALSE)
  plot(1:2, type = 'n', xlab = "", ylab = "", xaxt = 'n', yaxt = 'n', main = i)
  rasterImage(img, 1, 1, 2, 2)
}
par(mfrow = c(1, 1), mar = c(5.1, 4.1, 4.1, 2.1))

# LOAD THE IMAGES INTO THE SAME TABLE
data <- c()
for (i in 1:9){
  img <- readJPEG(paste("0", "0", i, ".jpg", sep = ""), native = FALSE)
  data <- rbind(data, c(img)) # c() flattens the image data by columns
}
# in each iteration we loaded one image in its matrix representation, reshaped
# it with c(img) into a vector BY COLUMNS, and added this vector to the final
# data matrix as a new row.
# columns = pixels of each figure -> variables
# rows    = figures               -> observations
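# A tiny sketch (toy matrix) of what c() does with a matrix: it flattens it
# BY COLUMNS (column-major order), which is exactly what the loop above relies on:
m.toy <- matrix(1:6, nrow = 2)  # 2 x 3 toy matrix
m.toy
c(m.toy)                        # 1 2 3 4 5 6 -> the columns are stacked one after another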
# PERFORM PCA
data.pca <- prcomp(data, scale = FALSE) # try scale = TRUE

# eigenvector visualization:
par(mfrow = c(3, 3), mar = c(1, 1, 1, 1))
for (i in 1:9){
  d <- data.pca$rotation[, i]                      # ith eigenvector = new coordinate axis
  d <- (d - min(d)) / (max(d) - min(d))            # rescale to [0, 1] for plotting
  m <- matrix(d, nrow = dim(img)[1], byrow = FALSE)
  plot(1:2, type = 'n', xlab = "", ylab = "", xaxt = 'n', yaxt = 'n')
  rasterImage(m, 1, 1, 2, 2)
}
par(mfrow = c(1, 1), mar = c(5.1, 4.1, 4.1, 2.1))

# plot the cumulative importances of the first few principal components:
s <- summary(data.pca)
plot(s$importance[3, ], col = "red", pch = 19,
     main = "Cumulative Proportion of variance", ylab = "proportion", type = 'o')

# find the number of principal components needed to
# explain at least 90% of the variability:
(total <- sum(s$importance[3, ] < 0.9) + 1)

# reconstruct the original data matrix using just the first "total" principal components
# (a small sanity check follows at the end of the script):
dd <- data.pca$x[, 1:total] %*% t(data.pca$rotation[, 1:total]) +
      rep(data.pca$center, each = 9)
# we added the mean of each variable back to each column
# (because of the default centering done by the prcomp built-in function)

# reconstruct the images using the first few principal components:
par(mfrow = c(3, 3), mar = c(1, 1, 1, 1))
for (i in 1:9){
  d <- dd[i, ]
  d <- (d - min(d)) / (max(d) - min(d))
  m <- matrix(d, nrow = dim(img)[1], byrow = FALSE)
  plot(1:2, type = 'n', xlab = "", ylab = "", xaxt = 'n', yaxt = 'n', main = i)
  rasterImage(m, 1, 1, 2, 2)
}
par(mfrow = c(1, 1), mar = c(5.1, 4.1, 4.1, 2.1))
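# Optional sanity check (a small sketch, using the objects defined above):
# reconstructing with ALL principal components should reproduce the original
# data matrix up to numerical rounding, because centered data = scores %*% t(rotation)
# when no components are dropped:
data.full <- data.pca$x %*% t(data.pca$rotation) + rep(data.pca$center, each = 9)
max(abs(data.full - data))  # should be very close to zero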