# ==============================================================================
# --------------------------------- SEMINAR 3 ----------------------------------
# ==============================================================================
# install.packages("corrplot")
library(corrplot)
library(ggfortify)
library(dplyr)

# ----------------------------------- TASK 1 -----------------------------------
data <- read.csv("market.csv")
str(data)
head(data)
unique(data$NACE_R2_LABEL)
unique(data$GEO)

# we are interested only in the individual EU states, not in the summaries:
# drop the rows with the EU summaries and rename the Germany value:
data <- data[ , ]                # FILL: keep only the rows of individual states
data$GEO[ ] <- "Germany"         # FILL: rows with the original Germany label

# pick only the variables we are interested in:
data <- data[, c("NACE_R2_LABEL", "GEO", "Value")]

# ..................................... A ......................................
# Transform the data from long format to wide format. Use column GEO (indicates
# the state) as the key variable for each row. Use the values in column
# NACE_R2_LABEL (indicates the market segment) to identify the new columns. Use
# the values from column Value (indicates the percentage of people working in
# the given segment).
?reshape
data.short <- reshape(data, idvar = "GEO", timevar = "NACE_R2_LABEL",
                      direction = "wide")

# rename the columns:
colnames(data.short)[2:12] <- c("Agriculture, forest", "Industry",
                                "Manufacturing", "Construction",
                                "Wholesale and retail", "Information",
                                "Financial", "Real estate",
                                "Science, Technical", "Public administration",
                                "Arts, entertainment")
head(data.short)

# rename the rows:
rownames(data.short) <- data.short$GEO
head(data.short)

# now we can drop the first column GEO (the same information is in the rownames)
data.short <- data.short[2:12]
head(data.short)

# ..................................... B ......................................
# Create a correlation matrix M using a built-in R function.
# HINT: use the cor() and corrplot() functions
(M <- cor(data.short))
corrplot(M, method = "circle")
corrplot.mixed(M, lower.col = "black", diag = "n", tl.pos = "lt")
corrplot(M, order = "hclust", addrect = 4)

# ..................................... C ......................................
# Create a single scatter plot for Industry and Manufacturing,
# then create a scatter plot matrix for all segments.
x <- data.short$Industry
y <- data.short$Manufacturing
plot(x, y, col = "red", pch = 19, xlab = "Industry", ylab = "Manufacturing",
     main = "Industry vs. Manufacturing")
cor(x, y)
pairs(data.short, lower.panel = NULL, pch = 19, col = "red")

# ..................................... E ......................................
# Use PCA to analyze the data. Use the built-in R function prcomp and the
# autoplot function from the package ggfortify.
# https://www.rdocumentation.org/packages/mixOmics/versions/6.3.2/topics/pca
data.pca <- prcomp(data.short)
data.pca
summary(data.pca)
# standard deviation = sqrt(eigenvalues)
autoplot(data.pca, data = data.short, label = T, shape = F,
         loadings = TRUE, loadings.label = TRUE)
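# Added cross-check (not part of the original tasks): the squared standard
# deviations reported by prcomp() are the eigenvalues of the covariance matrix,
# and the columns of data.pca$rotation are the corresponding eigenvectors
# (possibly with flipped signs). The name ev is introduced only for this check;
# it links part E to the manual PCA in part D below.
ev <- eigen(cov(data.short))
all.equal(data.pca$sdev^2, ev$values)   # should be TRUE (up to numerical error)
head(abs(data.pca$rotation[, 1:2]))     # loadings of PC1 and PC2 ...
head(abs(ev$vectors[, 1:2]))            # ... equal the eigenvectors up to sign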
# ..................................... D ......................................
# Use PCA to analyze the data. Do not use the built-in R function prcomp.
# find the covariance matrix of the data:
cov_matrix <- cov(data.short)

# the total variance s is the sum of the variances of all variables,
# it is also equal to the sum of the eigenvalues of cov_matrix:
(s <- sum(diag(cov_matrix)))

# find the eigenvalues and eigenvectors of cov_matrix:
s.eigen <- eigen(cov_matrix)
plot(s.eigen$values, type = 'o', col = "red", pch = 19,
     xlab = 'Eigenvalue Number', ylab = 'Eigenvalue Size', main = 'Scree Graph')

# eigenvalue i = how much of the total variance is explained by PC_i
# the first two principal components explain 78.6% of the total variance:
cumsum(s.eigen$values / s)
plot(cumsum(s.eigen$values / s), type = 'o', col = "red", pch = 19,
     xlab = 'Component', ylab = 'accounted variance', main = 'Total variance')

c1 <- s.eigen$vectors[, 1]
c2 <- s.eigen$vectors[, 2]

# principal component = original data transformed by multiplication with an
# eigenvector = data in the new coordinates (those of the eigenvectors)
# the first principal component (direction):
pc1 <- as.matrix(data.short) %*% c1
# the second:
pc2 <- as.matrix(data.short) %*% c2

plot(pc1, pc2, col = "red", pch = 19, xlim = c(-26, 16), ylim = c(-23, 7),
     xlab = "First component", ylab = "Second component")
text(pc1, pc2 + 1.5, labels = rownames(data.short), cex = 0.7, font = 2)
# the relationships between states and market segments can be seen very easily
# using just the first two principal components

# The following code is for volunteers: what does it compute? How?
pc1 <- (pc1 - mean(pc1)) / (max(pc1) - min(pc1))
pc2 <- (pc2 - mean(pc2)) / (max(pc2) - min(pc2))
xold1 <- c(1, rep(0, 10)) %*% c1
yold1 <- c(1, rep(0, 10)) %*% c2
xold2 <- c(0, 1, rep(0, 9)) %*% c1
yold2 <- c(0, 1, rep(0, 9)) %*% c2
xold3 <- c(0, 0, 1, rep(0, 8)) %*% c1
yold3 <- c(0, 0, 1, rep(0, 8)) %*% c2
plot(pc1, pc2, col = "red", pch = 19, xlim = c(-0.7, 0.6), ylim = c(-0.6, 0.75),
     xlab = "First component", ylab = "Second component")
text(pc1, pc2 + 0.05, labels = rownames(data.short), cex = 0.7, font = 2)
arrows(x0 = c(0, 0, 0), x1 = c(xold1, xold2, xold3),
       y0 = c(0, 0, 0), y1 = c(yold1, yold2, yold3),
       lwd = 1.5, col = "blue", length = 0.1)
text(c(xold1, xold2, xold3), c(yold1, yold2, yold3) + 0.05,
     labels = colnames(data.short)[1:3], cex = 0.7, font = 2, col = "blue")

# ----------------------------------- TASK 2 -----------------------------------
library(jpeg)
img <- readJPEG("001.jpg", native = FALSE)
par(mar = c(0, 0, 0, 0))
plot(1:2, type = "n", xlab = "", ylab = "", xaxt = 'n', yaxt = 'n')
rasterImage(img, 1, 1, 2, 2)

# columns (186 in total) -> variables
# rows (258 in total) -> observations
data.pca <- prcomp(img)
# prcomp returns at most min(#rows, #columns) eigenvectors of the covariance
# matrix and the corresponding principal components
# original data = PCs * "inverse" of the eigenvector matrix
# (the same as the transpose here, since the eigenvectors are orthonormal):
data.reconst <- data.pca$x %*% t(data.pca$rotation)
plot(1:2, type = 'n', xlab = "", ylab = "", xaxt = 'n', yaxt = 'n')
rasterImage(data.reconst, 1, 1, 2, 2)
# what is wrong?
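# Added hint (not in the original script): a quick way to investigate the
# question above, assuming a grayscale image so that img is a numeric matrix
# with values in [0, 1].
range(img)             # the original pixel values lie inside [0, 1]
range(data.reconst)    # the reconstructed values do not; rasterImage() needs [0, 1]
head(data.pca$center)  # prcomp() stored the column means it subtracted by default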
# check the center and scale inputs of prcomp()
?prcomp
# the data were centered
# there are two ways to correct that:

# 1) use the default center = T, scale = F and then add back the subtracted means
data.pca <- prcomp(img, center = T, scale = F)
d <- data.pca$x %*% t(data.pca$rotation) +
  matrix(rep(data.pca$center, each = ), ncol = )   # FILL
plot(1:2, type = 'n', xlab = "", ylab = "", xaxt = 'n', yaxt = 'n')
rasterImage(d, 1, 1, 2, 2)

# 2) use center = F, scale = F
data.pca <- prcomp(img, center = F, scale = F)
d <- data.pca$x %*% t(data.pca$rotation)
plot(1:2, type = 'n', xlab = "", ylab = "", xaxt = 'n', yaxt = 'n')
rasterImage(d, 1, 1, 2, 2)

# if the data were also scaled (center = T, scale = T):
data.pca <- prcomp(img, center = T, scale = T)
d <- (data.pca$x %*% t(data.pca$rotation)) *
  matrix(rep(data.pca$scale, each = 258), ncol = 186) +
  matrix(rep(data.pca$center, each = 258), ncol = 186)
plot(1:2, type = 'n', xlab = "", ylab = "", xaxt = 'n', yaxt = 'n')
rasterImage(d, 1, 1, 2, 2)

# we will work with the default: center = T, scale = F
data.pca <- prcomp(img)
# by default the variables are centered to zero first and then the PCA is done,
# so we will need to shift them back when reconstructing the image
s <- summary(data.pca)
par(mar = c(5.1, 4.1, 4.1, 2.1))
plot(s$importance[3, ], main = "Cumulative Proportion of variance",
     ylab = "proportion", type = 'o', col = "red", pch = 19)
# s$importance = standard deviations (sqrt of the eigenvalues) and the
#                (cumulative) proportions of variance
# rotation = eigenvectors = new coordinates
# x = principal components = new variables

# plot the eigenvectors (each image column gets a constant intensity equal to
# its loading in the given eigenvector):
par(mfrow = c(5, 5), mar = c(1, 1, 1, 1))
for (i in 1:25){
  d <- matrix(rep(data.pca$rotation[, i], each = nrow(img)), ncol = ncol(img),
              byrow = F)
  d <- (d - min(d)) / (max(d) - min(d))   # scale between 0 and 1
  plot(1:2, type = 'n', xlab = "", ylab = "", xaxt = 'n', yaxt = 'n', main = i)
  rasterImage(d, 1, 1, 2, 2)
}
par(mfrow = c(1, 1), mar = c(5.1, 4.1, 4.1, 2.1))

# choose the number of PCs that explain at least 90% of the variance:
(num_components <- sum(s$importance[3, ] < 0.9) + 1)

# the reconstructed image using just a few (the most important) components:
# original data x = PCs * (inverse of the eigenvector matrix)
d <- data.pca$x[, 1:num_components] %*% t(data.pca$rotation[, 1:num_components])
d <- d + rep(data.pca$center, each = 258)
# TRY ALSO (rescale to [0, 1] so that rasterImage() accepts values that end up
# slightly outside the range):
d <- (d - min(d)) / (max(d) - min(d))
par(mar = c(0, 0, 0, 0))
plot(1:2, type = 'n', xlab = "", ylab = "", xaxt = 'n', yaxt = 'n')
rasterImage(d, 1, 1, 2, 2)

# ORIGINAL IMAGE (using all principal components):
d <- data.pca$x %*% t(data.pca$rotation) + rep(data.pca$center, each = 258)
d <- (d - min(d)) / (max(d) - min(d))
plot(1:2, type = 'n', xlab = "", ylab = "", xaxt = 'n', yaxt = 'n')
rasterImage(d, 1, 1, 2, 2)

# ----------------------------------- TASK 3 -----------------------------------
data <- c()
for (i in 1:9){
  img <- readJPEG(paste(0, 0, i, ".jpg", sep = ""), native = FALSE)
  data <- rbind(data, c(img))   # c(img) flattens the image column by column
}
# columns = pixels of each figure -> variables
# rows = figures -> observations

par(mfrow = c(1, 1), mar = c(5.1, 4.1, 4.1, 2.1))
data.pca <- prcomp(data, scale = F)   # try scale = T
s <- summary(data.pca)
plot(s$importance[3, ], col = "red", pch = 19,
     main = "Cumulative Proportion of variance", ylab = "proportion", type = 'o')
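# Added aside (not in the original script): with only 9 images (rows) and many
# pixels (columns), prcomp() returns at most min(nrow, ncol) = 9 components,
# which is why the 3 x 3 grid below can show all of the eigenvectors.
dim(data)                  # 9 observations (images) x number-of-pixels variables
ncol(data.pca$rotation)    # number of eigenvectors returned (9 here)
length(data.pca$sdev)      # one standard deviation per component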
# plot the eigenvectors:
par(mfrow = c(3, 3), mar = c(1, 1, 1, 1))
for (i in 1:9){
  d <- data.pca$rotation[, i]             # eigenvector = new coordinate axis
  d <- (d - min(d)) / (max(d) - min(d))   # scale between 0 and 1
  m <- matrix(d, nrow = dim(img)[1])      # reshape back to the image dimensions
  plot(1:2, type = 'n', xlab = "", ylab = "", xaxt = 'n', yaxt = 'n')
  rasterImage(m, 1, 1, 2, 2)
}

# find the number of principal components needed to explain
# at least 80% of the variability:
(total <- sum(s$importance[3, ] < 0.8) + 1)
dd <- data.pca$x[, 1:total] %*% t(data.pca$rotation[, 1:total]) +
  rep(data.pca$center, each = 9)

# reconstruct the images using only the first few principal components:
par(mfrow = c(3, 3), mar = c(1, 1, 1, 1))
for (i in 1:9){
  d <- dd[i, ]
  d <- (d - min(d)) / (max(d) - min(d))
  m <- matrix(d, nrow = dim(img)[1], byrow = FALSE)
  plot(1:2, type = 'n', xlab = "", ylab = "", xaxt = 'n', yaxt = 'n', main = i)
  rasterImage(m, 1, 1, 2, 2)
}
par(mfrow = c(1, 1), mar = c(5.1, 4.1, 4.1, 2.1))
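# Added sketch (optional, not part of the assignment): measure how the
# reconstruction error of the 9 images drops as more components are kept.
# The names mse and rec are introduced only for this sketch.
mse <- sapply(seq_along(data.pca$sdev), function(k) {
  rec <- data.pca$x[, 1:k, drop = FALSE] %*%
    t(data.pca$rotation[, 1:k, drop = FALSE]) +
    rep(data.pca$center, each = nrow(data))
  mean((data - rec)^2)   # mean squared error over all pixels of all images
})
plot(mse, type = 'o', col = "red", pch = 19, xlab = "number of components",
     ylab = "MSE", main = "Reconstruction error")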