library(fda) library(ggplot2) library(plyr) #### Canadian Weather temp <- CanadianWeather$dailyAv[,,1] precip <- CanadianWeather$dailyAv[,,2] daytime <- (1:365) - 0.5 day5 <- seq(0, 365, 5) dayrng <- c(0, 365) knots <- day5 norder <- 4 nbasis <- length(knots) + norder - 2 bbasis <- create.bspline.basis(dayrng,nbasis,norder,knots) curv.Lfd <- int2Lfd(2) lambda <- 1e6 curv.fdPar <- fdPar(bbasis, curv.Lfd, lambda) lambdas <- 10^seq(-4, 4, by = 0.5) # lambdas to look over mean.gcv <- rep(0, length(lambdas)) # store mean gcv for(ilam in 1:length(lambdas)){ curv.fdPari <- curv.fdPar curv.fdPari$lambda <- lambdas[ilam] tempSmoothi <- smooth.basis(daytime, temp, curv.fdPari) mean.gcv[ilam] <- mean(tempSmoothi$gcv) } # Lets select the lowest of these and smooth best <- which.min(mean.gcv) lambdabest <- lambdas[best] curv.fdPar$lambda <- lambdabest tempSmooth <- smooth.basis(daytime,temp,curv.fdPar) # Plot smooth data plot(tempSmooth$fd) # ggplot # fdobjSmootheval <- eval.fd(fdobj = tempSmooth$fd, evalarg = daytime) # # dfs <- data.frame(daytime = daytime, temp = c(fdobjSmootheval), precip = c(precip), # place = rep(CanadianWeather$place, each = 365), region = rep(CanadianWeather$region, each = 365)) # # x11() # p0 <- ggplot (data = dfs, aes (x = daytime, y = temp, colour = place)) # p0 <- p0 + geom_line () # p0 <- p0 + labs (x = "Days", y = "Temperature") # p0 <- p0 + theme_bw () # print (p0) # # x11() # p0 <- ggplot (data = dfs, aes (x = daytime, y = temp, colour = region, group = place)) # p0 <- p0 + geom_line () # p0 <- p0 + labs (x = "Days", y = "Temperature") # p0 <- p0 + theme_bw () # print (p0) ################################################################################ ### EXPLORATORY DATA ANALYSIS #### ################################################################################ tempfd <- tempSmooth$fd ### Mean mtempfd <- mean.fd(tempfd) plot(mtempfd) # ggplot # fdm <- eval.fd(fdobj = mtempfd, evalarg = daytime) # colnames(fdm) <- NULL # df <- data.frame(dfs, fdmean = fdm) # # x11() # p0 <- ggplot (data = df, aes (x = daytime, y = temp, group = place)) # p0 <- p0 + geom_line (color = "grey", size = 0.5) # p0 <- p0 + geom_line (aes(y = fdmean), color = 2, size = 1.5) # p0 <- p0 + labs (x = "Days", y = "Mean Curve") # p0 <- p0 + theme_bw () # print (p0) ### Variance temp.varbifd <- var.fd(tempfd) temp.var <- eval.bifd(daytime, daytime, temp.varbifd) plot(daytime, diag(temp.var), type = "l") # ggplot # df <- data.frame(dfs, fdmean = fdm, fdvar = diag(temp.var)) # # x11() # p0 <- ggplot (data = df, aes (x = daytime, y = fdvar)) # p0 <- p0 + geom_line (color = 2, size = 1.5) # p0 <- p0 + labs (x = "Days", y = "Variance Curve") # p0 <- p0 + theme_bw () # print (p0) ### Covariance contour(daytime, daytime, temp.var) # Mostly high variance in the winter, low in summer. Let's have a look at # correlation. In this case, evaluation arguments go in with the function call # ggplot # df <- merge(daytime, daytime) # df <- data.frame(df, fdcov = c(temp.var), fdcor = c(temp.cor)) # # x11() # p0 <- ggplot (data = df, aes (x, y, z = fdcov)) # p0 <- p0 + geom_raster(aes(fill = fdcov)) # p0 <- p0 + geom_contour (colour = "white") # p0 <- p0 + labs (x = "Days", y = "Days") # p0 <- p0 + coord_fixed (ratio = 1) # p0 <- p0 + theme_bw () # print (p0) ### Corelation temp.cor <- cor.fd(daytime, tempfd) filled.contour(daytime, daytime, temp.cor) # Here we see high correlation between Summer and Winter temperatures, but # much less in the spring and fall (although spring to fall correlation is still # high). # ggplot # x11() # p0 <- ggplot (data = df, aes (x, y, z = fdcor)) # p0 <- p0 + geom_raster(aes(fill = fdcor)) # p0 <- p0 + geom_contour (colour = "white") # p0 <- p0 + labs (x = "Days", y = "Days") # p0 <- p0 + coord_fixed (ratio = 1) # p0 <- p0 + theme_bw () # print (p0) ################################################################################ ### FUNCTIONAL PRINCIPAL COMPONENTS #### ################################################################################ tempPCA <- pca.fd(tempfd, nharm = 6) # Here we can look at proportion of variance explained: plot(tempPCA$varprop,type='b') # Looks like we could have stopped at 3. # Looking at the principal components: plot(tempPCA$harmonics[1:3]) # 1 Looks like over-all temperature. # 2 Looks like Summer vs Winter # 3 Is Spring vs Fall. ## But let's plot these plot(tempPCA, harm = 1:3) # ggplot # df <- data.frame(Components = 1:6, varprop = cumsum(tempPCA$varprop)) # # x11() # p0 <- ggplot (data = df, aes (x = Components, y = varprop)) # p0 <- p0 + geom_line () # p0 <- p0 + geom_point () # p0 <- p0 + labs (x = "Components", y = "Variable Explained") # p0 <- p0 + scale_x_continuous (breaks = 1:6) # p0 <- p0 + theme_bw () # print (p0) ## Looking at the principal components: # fdobjPCAeval <- eval.fd(fdobj = tempPCA$harmonics[1:3], evalarg = daytime) # df.comp <- data.frame(daytime = daytime, harmonics = c(fdobjPCAeval), component = factor(rep(1:3, each = 365))) # # x11() # p0 <- ggplot (data = df.comp, aes (x = daytime, y = harmonics, color = component, linetype = component)) # p0 <- p0 + geom_line () # p0 <- p0 + labs (x = "Days", y = "Main Components") # p0 <- p0 + theme_bw () # print (p0) # 1a. Scree-plot # day3 = seq(0,365,3) # fdobjPCAeval3 <- eval.fd(fdobj = tempPCA$harmonics[1:4], evalarg = day3) # # df.comp3 <- data.frame(daytime = day3, harmonics = c(fdobjPCAeval3), component = factor(rep(1:4, each = length(day3)))) # fdm3 <- c(eval.fd(fdobj = mtempfd, evalarg = day3)) # df.pokus3 <- data.frame(df.comp3, m = fdm3) # df.pv3 <- ddply(df.pokus3, .(component), mutate, m1 = m + 2*sqrt(tempPCA$values[component])*harmonics, # m2 = m - 2*sqrt(tempPCA$values[component])*harmonics, # pov = paste0("Component ", component,", Percentage of Variability = ", round(100*tempPCA$varprop[component], 1))) # # x11() # p0 <- ggplot (data = df.pv3, aes (x = daytime, y = m, color = component)) # p0 <- p0 + geom_line () # p0 <- p0 + geom_point (aes(y = m1), pch = "+", cex = 3) # p0 <- p0 + geom_point (aes(y = m2), pch = "-", cex = 3) # p0 <- p0 + labs (x = "Days", y = "Harmonic") # p0 <- p0 + theme_bw () # p0 <- p0 + facet_wrap(~ pov, nrow = 2) # p0 <- p0 + guides(colour = FALSE) # print (p0) # 1b. Score distribution plot(tempPCA$scores[, 1:2]) # dfsc <- data.frame(sc1 = tempPCA$scores[, 1], sc2 = tempPCA$scores[, 2], place = colnames(temp)) # # x11() # p0 <- ggplot (dfsc, aes(x = sc1, y = sc2, fill = place, label = place)) # p0 <- p0 + geom_hline(yintercept = 0, linetype = "dashed") # p0 <- p0 + geom_vline(xintercept = 0, linetype = "dashed") # p0 <- p0 + geom_label(fontface = "bold", col = "white") # p0 <- p0 + labs (x = "Score 1", y = "Score 2") # p0 <- p0 + guides(fill = guide_legend(title = "Place", ncol = 1)) # p0 <- p0 + theme_bw () # print (p0) # dfsc <- data.frame(sc1 = tempPCA$scores[, 1], sc2 = tempPCA$scores[, 2], place = colnames(temp), region = CanadianWeather$region) # # x11() # p0 <- ggplot (dfsc, aes(x = sc1, y = sc2, fill = region, label = place)) # p0 <- p0 + geom_hline(yintercept = 0, linetype = "dashed") # p0 <- p0 + geom_vline(xintercept = 0, linetype = "dashed") # p0 <- p0 + geom_label(fontface = "bold", col = "white") # p0 <- p0 + labs (x = "Score 1", y = "Score 2") # p0 <- p0 + guides(fill = guide_legend(title = "Region", ncol = 1)) # p0 <- p0 + theme_bw () # print (p0) # 2. PCA and Smoothing pca.fdPar <- fdPar(bbasis, curv.Lfd, 1e4) tempPCAsmooth <- pca.fd(tempfd, nharm = 6, harmfdPar = pca.fdPar) # Here we can look at proportion of variance explained: plot(tempPCAsmooth$varprop, type='b') # Looking at the principal components: plot(tempPCAsmooth$harmonics[1:3]) plot(tempPCA, harm = 1:3) # ggplot # fdobjPCAsmootheval <- eval.fd(fdobj = tempPCAsmooth$harmonics[1:3], evalarg = daytime) # df <- data.frame(daytime = daytime, harmonics = c(fdobjPCAsmootheval), component = factor(rep(1:3, each = 365))) # # x11() # p0 <- ggplot (data = df, aes (x = daytime, y = harmonics, color = component, linetype = component)) # p0 <- p0 + geom_line () # p0 <- p0 + labs (x = "Days", y = "Main Components") # p0 <- p0 + theme_bw () # print (p0) # 3. PCA and Reconstructing Data mtempfd <- mean.fd(tempfd) fdm <- eval.fd(fdobj = mtempfd, evalarg = daytime) colnames(fdm) <- NULL scores <- tempPCAsmooth$scores PCs <- eval.fd(fdobj = tempPCAsmooth$harmonics, evalarg = daytime) # plot reconstructed data with k harmonics k <- 3 matplot(fdm%*%matrix(rep(1, 35), c(1, 35)) + PCs[, 1:k] %*% t(scores[, 1:k]), type = "l") # ggplot # x11() # for (k in 1:6){ # df <- data.frame(dfs, reconstruction = rep(fdm, times = 35) + c(PCs[, 1:k] %*% t(scores[, 1:k]))) # p0 <- ggplot (data = df, aes (x = daytime, y = temp, group = place)) # p0 <- p0 + geom_line (color = "grey", size = 0.5) # p0 <- p0 + geom_line (aes(y = reconstruction, color = place)) # p0 <- p0 + labs (x = "Days", y = "Reconstructed Curves") # p0 <- p0 + theme_bw () # print (p0) # # dev.copy2pdf (file = paste0("reconpca", k, "_canad.pdf"), width = 12, height = 7) # } # just one # df <- data.frame(dfs[1:365, ], reconstruction = fdm) # p0 <- ggplot (data = df, aes (x = daytime, y = temp, group = place)) # p0 <- p0 + geom_line (color = "grey", size = 0.5) # p0 <- p0 + geom_line (aes(y = reconstruction, color = place)) # p0 <- p0 + labs (x = "Days", y = "Reconstructed Curve") # p0 <- p0 + scale_y_continuous(limits = c(-16, 20)) # p0 <- p0 + theme_bw () # print (p0) # x11() # for (k in 1:6){ # df <- data.frame(dfs[1:365, ], reconstruction = fdm + c(PCs[, 1:k] %*% t(scores[, 1:k]))[1:365]) # p0 <- ggplot (data = df, aes (x = daytime, y = temp, group = place)) # p0 <- p0 + geom_line (color = "grey", size = 0.5) # p0 <- p0 + geom_line (aes(y = reconstruction, color = place)) # p0 <- p0 + labs (x = "Days", y = "Reconstructed Curve") # p0 <- p0 + scale_y_continuous(limits = c(-16, 20)) # p0 <- p0 + theme_bw () # print (p0) # # dev.copy2pdf (file = paste0("recononepca", k, "_canad.pdf"), width = 12, height = 7) # } # # 4. VARIMAX rotPCA <- varmx.pca.fd(tempPCAsmooth) # Here we can look at proportion of variance explained: plot(rotPCA$varprop, type='b') # Looking at the principal components: plot(rotPCA$harmonics[1:3]) plot(rotPCA, harm = 1:3) # ggplot # day3 = seq(0,365,3) # fdm3 <- c(eval.fd(fdobj = mtempfd, evalarg = day3)) # scores <- rotPCA$scores # PCs <- eval.fd(fdobj = rotPCA$harmonics, evalarg = day3) # # rotPCAeval <- eval.fd(fdobj = rotPCA$harmonics[1:4], evalarg = day3) # # df.rotcomp <- data.frame(daytime = day3, harmonics = c(rotPCAeval), component = factor(rep(1:4, each = length(day3)))) # # df.pokus <- data.frame(df.rotcomp, m = fdm3) # df.rotpv <- ddply(df.pokus, .(component), mutate, m1 = m + 20*harmonics, # m2 = m - 20*harmonics, # pov = paste0("Rotated Component ", component,", Percentage of Variability = ", round(100*rotPCA$varprop[component], 1))) # # x11() # p0 <- ggplot (data = df.rotpv, aes (x = daytime, y = m, color = component)) # p0 <- p0 + geom_line () # p0 <- p0 + geom_point (aes(y = m1), pch = "+", cex = 3) # p0 <- p0 + geom_point (aes(y = m2), pch = "-", cex = 3) # p0 <- p0 + labs (x = "Days", y = "Rotated Harmonics") # p0 <- p0 + theme_bw () # p0 <- p0 + facet_wrap(~ pov, nrow = 2) # p0 <- p0 + guides(colour = FALSE) # print (p0) ####################### # Problems # ####################### ## Pinch Force Data # a) PCA data x11() matplot (pinchtime, pinch, type = "l", lty = 1, cex=2, col = 1, lwd = 1, xlab = "Seconds", ylab = "Force (N)") #x <- pinchtime x <- 0:150 y <- pinch rangeval <- range(x) bbasis <- create.bspline.basis(rangeval = rangeval, norder = 4, breaks = x) curv.Lfd <- int2Lfd(2) lambdas <- exp(-10:10) gcvs <- rep(0, length(lambdas)) for(i in 1:length(lambdas)){ pPar <- fdPar(bbasis, curv.Lfd, lambdas[i]) gcvs[i] <- mean(smooth.basis(x, y, pPar)$gcv) } best <- which.min(gcvs) lambda <- lambdas[best] pPar <- fdPar(bbasis, curv.Lfd, lambda) pinchSmooth <- smooth.basis(x, y, pPar) plot(pinchSmooth$fd) # b) Smoothed PCA analysis lambdas <- exp(1:5) CVmat <- matrix(0, length(lambdas), 20) for(i in 1:length(lambdas)){ tfdPar <- fdPar(bbasis, curv.Lfd, lambdas[i]) for(j in 1:20){ tpca <- pca.fd(pinchSmooth$fd[-j], nharm = 2, harmfdPar = tfdPar, centerfns = TRUE) tpfd <- pinchSmooth$fd[j] - tpca$meanfd tharmvals <- eval.fd(x, tpca$harmonics) tpvals <- eval.fd(x, tpfd) CVmat[i,j] <- mean(lm(tpvals ~ tharmvals - 1)$res^2) } } CV <- apply(CVmat, 1, mean) ## Handwriting Data ?handwrit ### Medfly Data load("medfly.Rdata") lifetime <- as.numeric(medfly$lifetime) eggcount <- medfly$eggcount days <- 1:26