# ==============================================================================
# --------------------------------- SEMINAR 4 ----------------------------------
# ==============================================================================

# install.packages("corrplot")
library(corrplot)
library(ggfortify)
library(dplyr)

# ----------------------------------- TASK 1 -----------------------------------

data <- read.csv("market.csv")
names(data) <- c("nace_R2", "market", "time", "state", "unit", "NA_item", "value")
str(data)
head(data)

unique(data$market) # specifies the different market segments
unique(data$state)  # specifies the country

# DATA CLEANING:
# we are interested only in the individual EU states, not in the summaries,
# so drop the rows with the EU summaries and rename the Germany value:
data <- data[data$state != "European Union - 28 countries (2013-2020)" &
             data$state != "Euro area - 19 countries (from 2015)", ]
data$state[data$state == "Germany (until 1990 former territory of the FRG)"] <- "Germany"

# keep only the variables we are interested in:
data <- data[, c("market", "state", "value")]

# ..................................... A ......................................
# Transform the data from long format to wide format. Use column GEO (indicates
# the state) as the key variable for each row. Use the values in column NACE R2
# LABEL (indicates the market segment) to identify the new columns. Use the
# values from column Value (the percentage of people working in the given segment).

?reshape
data.short <- reshape(data, idvar = "state", timevar = "market", direction = "wide")
# idvar     = unique identifier of the rows
# timevar   = identifier of the columns
# direction = wide format

# in the original table each row represents one distinct combination of
# market segment and country, now each row represents just one country

# rename the columns:
colnames(data.short)[2:12] <- c("Agriculture, forest", "Industry", "Manufacturing",
                                "Construction", "Wholesale and retail", "Information",
                                "Financial", "Real estate", "Science, Technical",
                                "Public administration", "Arts, entertainment")
head(data.short)

# rename the rows:
rownames(data.short) <- data.short$state
head(data.short)

# now we can drop the first column STATE (we have the same information in rownames)
data.short <- data.short[2:12]
head(data.short)

# ..................................... B ......................................
# create a correlation matrix using a built-in
# R function and store it into the variable M
# HINT: use the cor() and corrplot() functions

(M <- cor(data.short))
# correlation measures the linear dependence of two variables:
# countries with a high value of Manufacturing will TEND
# to have a high value of Industry etc.
# zero correlation does not imply independence of the variables
# high correlation does not imply proportionality
# it implies just a tendency in the data, not a rule
# (a small toy illustration of this follows after part C)

corrplot(M, method = "circle")
corrplot.mixed(M, lower.col = "black", diag = "n", tl.pos = "lt")
corrplot(M, order = "hclust", addrect = 4)

# ..................................... C ......................................
# create a single scatter plot for Industry and Manufacturing,
# create a scatter plot matrix for all segments

x <- data.short$Industry      # x axis
y <- data.short$Manufacturing # y axis
plot(x, y, col = "red", pch = 19, xlab = "Industry", ylab = "Manufacturing",
     main = "Industry vs. Manufacturing")
cor(x, y) # strong positive linear dependence

pairs(data.short, lower.panel = NULL, pch = 19, col = "red")
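# A small toy illustration (sketch, made-up data): correlation captures only
# LINEAR dependence - here y is a deterministic function of x, yet the sample
# correlation is (numerically) zero:
x.toy <- seq(-3, 3, by = 0.1)
y.toy <- x.toy^2                  # y is fully determined by x
cor(x.toy, y.toy)                 # approximately 0 despite perfect dependence
plot(x.toy, y.toy, col = "red", pch = 19,
     main = "Zero correlation, yet fully dependent")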
# ..................................... E ......................................
# Use PCA to analyze the data. Use the built-in R function prcomp and the
# autoplot function from the package ggfortify.

data.pca <- prcomp(data.short, scale = TRUE)
str(data.pca)
# rotation = eigenvectors
# x        = principal components
# sdev     = standard deviations of the principal components
#            (square roots of the eigenvalues)

summary(data.pca)
# Standard deviation     = sqrt(eigenvalues)
# Proportion of Variance = variance explained by each principal component
# Cumulative Proportion  = cumulative sum of the eigenvalues divided by their sum =
#   how much of the variance of the original data can be explained using the
#   first few principal components
# for example: the first 4 principal components explain 79.74% of the total variance

autoplot(data.pca)
autoplot(data.pca, data = data.short, label = TRUE, shape = FALSE,
         loadings = TRUE, loadings.label = TRUE)
# important markets in the states of eastern Europe:
#   agriculture, manufacturing and industry
# important markets in the states of western Europe:
#   science and technical
# these market segments divide our observations (countries) into an eastern
# and a western part = our data shows the difference in market structure
# between these two parts of Europe

head(data.pca$rotation)
# the eigenvectors specify the linear combination of all variables used to
# compute the principal components
max(data.pca$rotation[, "PC1"])
which.max(data.pca$rotation[, "PC1"])
# industry is the most important segment for the first principal component
# (strictly speaking, importance is measured by the absolute value of the loading)
# the coordinates of the first eigenvector specify the importance of each variable
# in the first principal component etc.

# ..................................... D ......................................
# FOR VOLUNTEERS
# Use PCA to analyze the data. Do not use the built-in R function prcomp.

# find the covariance matrix of the data:
M <- cov(data.short)

# the total variance s is the sum of the variances of all variables,
# it is also equal to the sum of the eigenvalues of the covariance matrix M:
(s <- sum(diag(M)))

# find the eigenvalues and eigenvectors of the covariance matrix:
s.eigen <- eigen(M)
plot(s.eigen$values, type = 'o', col = "red", pch = 19,
     xlab = 'Eigenvalue Number', ylab = 'Eigenvalue Size', main = 'Scree Graph')
# eigenvalue i = how much of the total variance can be explained by the
# ith principal component

# the first two principal components explain 78.6% of the total variance:
cumsum(s.eigen$values / s)
plot(cumsum(s.eigen$values / s), type = 'o', col = "red", pch = 19,
     xlab = 'Component', ylab = 'accounted variance', main = 'Total variance')

# first two eigenvectors:
c1 <- s.eigen$vectors[, 1]
c2 <- s.eigen$vectors[, 2]

# principal component = original data transformed by multiplication with an
# eigenvector = the data expressed in the new coordinates (of the eigenvectors)
# eigenvector = new coordinate axis

# the first principal component:
pc1 <- as.matrix(data.short) %*% c1
# the second:
pc2 <- as.matrix(data.short) %*% c2

plot(pc1, pc2, col = "red", pch = 19, xlim = c(-26, 16), ylim = c(-23, 7),
     xlab = "First component", ylab = "Second component")
text(pc1, pc2 + 1.5, labels = rownames(data.short), cex = 0.7, font = 2)
# the dependences between states and market segments can be seen very easily
# using just the first two principal components
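# Optional cross-check (a small sketch): the manually computed scores should
# agree with prcomp up to a sign flip of the eigenvectors, provided we compare
# like with like - prcomp centers the data before projecting, and the comparison
# below uses an UNSCALED prcomp because cov() above was computed on the raw
# (unscaled) variables:
centered   <- scale(data.short, center = TRUE, scale = FALSE)
manual.pc1 <- as.numeric(centered %*% s.eigen$vectors[, 1])
pca.unscaled <- prcomp(data.short, scale = FALSE)
head(cbind(manual = manual.pc1, prcomp = pca.unscaled$x[, 1]))
# the two columns should match, possibly with opposite signs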
# ----------------------------------- TASK 2 -----------------------------------
library(ggfortify)

# load the data:
load("customer_behaviour.RData")

# ..................................... A ......................................
# create a new column BIG containing the value 2 if money_spent is greater
# than 5000 and the value 3 otherwise:
data$big <- ifelse(data$money_spent > 5000, 2, 3)

# ..................................... B ......................................
# perform PCA using the built-in function (remember to SCALE your data),
# investigate the summary and store the summary object into a variable called s:
data.pca <- prcomp(data[, 1:5], scale = TRUE)
(s <- summary(data.pca))

# ..................................... C ......................................
# plot the cumulative proportion of the explained variance
# see the structure of the object s
importance <- s$importance[3, ]
plot(importance, col = "red", type = "b", pch = 19,
     xlab = "number of components", ylab = "cumulative proportion explained")
# to explain at least 90% of the total variability
# we need 4 principal components

# ..................................... D ......................................
s$rotation
# the rotation matrix contains the eigenvectors = the new coordinate axes,
# the principal components (the new data) are linear combinations of the
# original observations and these eigenvectors
# the numbers in the first eigenvector represent the importance of the
# different variables for the first principal component etc.
# the most important variables are: for PC1 = money_spent, for PC2 = mail_ads,
# for PC3 = web_visits (the maximal absolute values) etc.

# ..................................... E ......................................
# visualize the first two principal components (use the autoplot() function),
# use the colour input argument to color the data points by two colors
# according to the BIG variable
autoplot(data.pca, data = data, colour = data$big)
autoplot(data.pca, data = data, colour = data$big,
         loadings = TRUE, loadings.label = TRUE)

# ----------------------------------- TASK 3 -----------------------------------
# FOR VOLUNTEERS
library(jpeg)

# PLOT IMAGES:
par(mfrow = c(3, 3), mar = c(1, 1, 1, 1))
for (i in 1:9){
  img <- readJPEG(paste("0", "0", i, ".jpg", sep = ""), native = FALSE)
  plot(1:2, type = 'n', xlab = "", ylab = "", xaxt = 'n', yaxt = 'n', main = i)
  rasterImage(img, 1, 1, 2, 2)
}
par(mfrow = c(1, 1), mar = c(5.1, 4.1, 4.1, 2.1))

# LOAD THE IMAGES INTO THE SAME TABLE
data <- c()
for (i in 1:9){
  img <- readJPEG(paste("0", "0", i, ".jpg", sep = ""), native = FALSE)
  data <- rbind(data, c(img)) # c() flattens the image data by columns
}
# in each iteration we loaded one image in its matrix representation, reshaped
# it with c(img) into a vector BY COLUMNS, and added this vector to the final
# data matrix as a new row.
# columns = pixels of each figure -> variables
# rows    = figures               -> observations
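# A tiny sketch (toy matrix) of what c() does with a matrix: it flattens it
# BY COLUMNS (column-major order), which is exactly what the loop above relies on:
m.toy <- matrix(1:6, nrow = 2)  # 2 x 3 toy matrix
m.toy
c(m.toy)                        # 1 2 3 4 5 6 -> the columns are stacked one after another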
# PERFORM PCA
data.pca <- prcomp(data, scale = FALSE) # try scale = TRUE

# eigenvector visualization:
par(mfrow = c(3, 3), mar = c(1, 1, 1, 1))
for (i in 1:9){
  d <- data.pca$rotation[, i]                      # ith eigenvector = new coordinate axis
  d <- (d - min(d)) / (max(d) - min(d))            # rescale to [0, 1] for plotting
  m <- matrix(d, nrow = dim(img)[1], byrow = FALSE)
  plot(1:2, type = 'n', xlab = "", ylab = "", xaxt = 'n', yaxt = 'n')
  rasterImage(m, 1, 1, 2, 2)
}
par(mfrow = c(1, 1), mar = c(5.1, 4.1, 4.1, 2.1))

# plot the cumulative importances of the first few principal components:
s <- summary(data.pca)
plot(s$importance[3, ], col = "red", pch = 19,
     main = "Cumulative Proportion of variance", ylab = "proportion", type = 'o')

# find the number of principal components needed to
# explain at least 90% of the variability:
(total <- sum(s$importance[3, ] < 0.9) + 1)

# reconstruct the original data matrix using just the first "total" principal components
# (a small sanity check follows at the end of the script):
dd <- data.pca$x[, 1:total] %*% t(data.pca$rotation[, 1:total]) +
      rep(data.pca$center, each = 9)
# we added the mean of each variable back to each column
# (because of the default centering done by the prcomp built-in function)

# reconstruct the images using the first few principal components:
par(mfrow = c(3, 3), mar = c(1, 1, 1, 1))
for (i in 1:9){
  d <- dd[i, ]
  d <- (d - min(d)) / (max(d) - min(d))
  m <- matrix(d, nrow = dim(img)[1], byrow = FALSE)
  plot(1:2, type = 'n', xlab = "", ylab = "", xaxt = 'n', yaxt = 'n', main = i)
  rasterImage(m, 1, 1, 2, 2)
}
par(mfrow = c(1, 1), mar = c(5.1, 4.1, 4.1, 2.1))
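# Optional sanity check (a small sketch, using the objects defined above):
# reconstructing with ALL principal components should reproduce the original
# data matrix up to numerical rounding, because centered data = scores %*% t(rotation)
# when no components are dropped:
data.full <- data.pca$x %*% t(data.pca$rotation) + rep(data.pca$center, each = 9)
max(abs(data.full - data))  # should be very close to zero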