library(fda) library(ggplot2) library(plyr) data(CanadianWeather) temp <- CanadianWeather$dailyAv[,,1] precip <- CanadianWeather$dailyAv[,,2] daytime <- (1:365)-0.5 day5 <- seq(0,365,5) dayrng <- c(0,365) #### B-spline bases with knots every 5 days breaks <- day5 norder <- 4 #nbasis <- length(knots) + norder - 2 bbasis <- create.bspline.basis(dayrng, norder = norder, breaks = breaks) curv.Lfd <- int2Lfd(2) curv.fdPar <- fdPar(bbasis,curv.Lfd) ## CV lambdas <- 10^seq(-4, 4, by = 0.5) # lambdas to look over mean.gcv <- rep(0, length(lambdas)) # store mean gcv for(ilam in 1:length(lambdas)){ # Set lambda curv.fdPari <- curv.fdPar curv.fdPari$lambda <- lambdas[ilam] # Smooth tempSmoothi <- smooth.basis(daytime, temp, curv.fdPari) # Record average gcv mean.gcv[ilam] <- mean(tempSmoothi$gcv) } best <- which.min(mean.gcv) lambdabest <- lambdas[best] curv.fdPar$lambda <- lambdabest tempSmooth <- smooth.basis(daytime, temp, curv.fdPar) tempfd <- tempSmooth$fd fdobjSmootheval <- eval.fd(fdobj = tempfd, evalarg = daytime) # Regression model with scalar response # Classical linear regression model p <- 5 # number of predictors month.time <- seq(1, 365, length.out = p) X <- t(eval.fd(month.time, tempfd)) y <- log10(apply(daily$precav, 2, sum)) lm.mod <- lm(y ~ X) summary(lm.mod) x11() yhat1.0 <- lm.mod$fitted.values # Assesing quality of fit # a) by SSE n <- length(y) yres1.0 <- y - yhat1.0 SSE1.0 <- sum(yres1.0^2) SSE0 <- sum((y - mean(y))^2) RSQ1.0 <- (SSE0-SSE1.0)/SSE0 Fratio1.0 <- ((SSE0-SSE1.0)/p)/(SSE1.0/(n - p - 1)) # b) by predicted values plot(y, yhat1.0) abline(0,1) # c) by CV ytesthat <- numeric(35) for (ind in 1:35){ ytrain <- y[-ind] Xtrain <- X[-ind, ] ytest <- y[ind] Xtest <- X[ind, ] lm.mod <- lm(ytrain ~ Xtrain) ytesthat[ind] <- coefficients(lm.mod)%*%c(1, Xtest) } plot(y, ytesthat) abline(0,1) # FDA approach # Annual precipitation y <- log10(apply(daily$precav, 2, sum)) tempbasis <- create.fourier.basis(dayrng, 65) tempSmooth <- smooth.basis(daytime, daily$tempav, tempbasis) tempfd <- tempSmooth$fd # 1. Low-Dimensional Regression Coefficient Function # setting covariates templist <- vector("list", 2) templist[[1]] <- rep(1, 35) templist[[2]] <- tempfd # setting betas conbasis <- create.constant.basis(c(0, 365)) betabasis <- create.fourier.basis(c(0, 365), 5) # dim of basis for beta = 5 betalist <- vector("list", 2) betalist[[1]] <- conbasis betalist[[2]] <- betabasis # regression fRegressList1 <- fRegress(y, templist, betalist) betaestlist <- fRegressList1$betaestlist tempbetafd <- betaestlist[[2]]$fd yhat1 <- fRegressList1$yhatfdobj # plot(tempbetafd, xlab="Day", # ylab="Beta for temperature") # Confidence Intervals resid1 <- y - yhat1 SigmaEps <- sum(resid1^2)/(35 - fRegressList1$df) SigmaE <- SigmaEps*diag(rep(1,35)) y2cMap <- tempSmooth$y2cMap stderrList <- fRegress.stderr(fRegressList1, y2cMap, SigmaE) betafd <- betaestlist[[2]]$fd # same as tempbetafd betastderrList <- stderrList$betastderrlist betastderrfd <- betastderrList[[2]] plot(betafd, xlab="Day", ylab="Temperature Reg. Coeff.", ylim=c(-1.9e-3,2e-03), lwd=2) lines(betafd+2*betastderrfd, lty=2, lwd=1) lines(betafd-2*betastderrfd, lty=2, lwd=1) # ggplot # Beta1 <- eval.fd(daytime, betafd) # pom1 <- eval.fd(daytime, betastderrfd) # df1.CI <- data.frame(Day = daytime, Beta = Beta1, # lower = Beta1 - 1.96*pom1, upper = Beta1 + 1.96*pom1) # # p <- ggplot(df1.CI, aes(x = Day, y = Beta1)) # p <- p + geom_line() # p <- p + geom_hline(aes(yintercept = 0), linetype = "dashed") # p <- p + geom_line(aes(y = lower), linetype = "dashed") # p <- p + geom_line(aes(y = upper), linetype = "dashed") # p <- p + labs (x = "Days", y = "Beta for Temperature") # p <- p + theme_bw() # p # points(month.time, lm.mod$coefficients[-1]) # Assesing quality of fit yhat1 <- fRegressList1$yhatfdobj yres1 <- y - yhat1 degfr <- fRegressList1$df n <- length(yhat1) SSE1 <- sum(yres1^2) SSE0 <- sum((y - mean(y))^2) RSQ1 <- (SSE0-SSE1)/SSE0 Fratio1 <- ((SSE0-SSE1)/(degfr - 1))/(SSE1/(n - degfr)) # 2. Estimate Using a Roughness Penalty Lcoef <- c(0,(2*pi/365)^2,0) harmaccelLfd <- vec2Lfd(Lcoef, c(0,365)) betabasis2 <- create.fourier.basis(c(0, 365), 35) lambda <- 10^12.5 betafdPar <- fdPar(betabasis2, harmaccelLfd, lambda) betalist[[2]] <- betafdPar # Choosing lambda loglam <- seq(5, 15, 0.5) nlam <- length(loglam) SSE.CV <- matrix(0, nlam, 1) for (ilam in 1:nlam) { lambda <- 10^loglam[ilam] betalisti <- betalist betafdPar2 <- betalisti[[2]] betafdPar2$lambda <- lambda betalisti[[2]] <- betafdPar2 fRegi <- fRegress.CV(y, templist, betalisti) SSE.CV[ilam] <- fRegi$SSE.CV # OR # fRegi <- fRegress(y, templist, betalisti) # SSE.CV[ilam] <- fRegi$OCV } # plot CV plot(loglam, SSE.CV, type = "b") lines(loglam, SSE.CV, col = "red") # ggplot # df.CV <- data.frame(lambda = loglam, CV = SSE.CV) # # p <- ggplot(df.CV, aes(x = lambda, y = CV)) # p <- p + geom_line() # p <- p + geom_point() # p <- p + labs (x = "Log10 of lambda", y = "CV score") # p <- p + theme_bw() # p best <- which.min(SSE.CV) lambdabest <- 10^loglam[best] lambda <- lambdabest betafdPar <- fdPar(betabasis2, harmaccelLfd, lambda) betalist[[2]] <- betafdPar # regression fRegressList2 <- fRegress(y, templist, betalist) betaestlist2 <- fRegressList2$betaestlist yhat2 <- fRegressList2$yhatfdobj # vykresleni pro konkretni lambda 10^12.5 (vyslo nejlepsi) # Confidence Intervals resid2 <- y - yhat2 SigmaEps <- sum(resid2^2)/(35 - fRegressList2$df) SigmaE <- SigmaEps*diag(rep(1,35)) y2cMap <- tempSmooth$y2cMap stderrList <- fRegress.stderr(fRegressList2, y2cMap, SigmaE) betafd2 <- betaestlist2[[2]]$fd # same as tempbetafd2 betastderrList <- stderrList$betastderrlist betastderrfd <- betastderrList[[2]] # plotting plot(betafd2, xlab = "Day", ylab = "Temperature Reg. Coeff.", ylim = c(-1.9e-3,2e-03), lwd = 2) lines(betafd2 + 2*betastderrfd, lty = 2, lwd = 1) lines(betafd2 - 2*betastderrfd, lty = 2, lwd = 1) # ggplot # Beta2 <- eval.fd(daytime, betafd2) # pom2 <- eval.fd(daytime, betastderrfd) # df2.CI <- data.frame(Day = daytime, Beta = Beta2, # lower = Beta2 - 1.96*pom2, upper = Beta2 + 1.96*pom2) # # p <- ggplot(df2.CI, aes(x = Day, y = Beta)) # p <- p + geom_line() # p <- p + geom_hline(aes(yintercept = 0), linetype = "dashed") # p <- p + geom_line(aes(y = lower), linetype = "dashed") # p <- p + geom_line(aes(y = upper), linetype = "dashed") # p <- p + labs (x = "Days", y = "Beta for Temperature") # p <- p + theme_bw() # p # Assesing quality of fit yhat2 <- fRegressList2$yhatfdobj yres2 <- y - yhat2 degfr <- fRegressList2$df n <- length(yhat2) SSE2 <- sum(yres2^2) SSE0 <- sum((y - mean(y))^2) RSQ2 <- (SSE0-SSE2)/SSE0 Fratio2 <- ((SSE0-SSE2)/(degfr - 1))/(SSE2/(n - degfr)) # just constant beta betalist[[2]] <- fdPar(conbasis) fRegressList0 <- fRegress(y, templist, betalist) betaestlist <- fRegressList0$betaestlist yhat2.0 <- fRegressList0$yhatfdobj degfr <- fRegressList0$df n <- length(yhat2.0) SSE2.0 <- sum((y - yhat2.0)^2) RSQ2.0 <- (SSE0 - SSE2.0)/SSE0 Fratio2.0 <- ((SSE0-SSE2.0)/(degfr - 1))/(SSE2.0/(n - degfr)) # 3. Scalar Response Models by Functional Principal Components daybasis365 <- create.fourier.basis(c(0, 365), 365) lambda <- 1e6 tempfdPar <- fdPar(daybasis365, harmaccelLfd, lambda) tempfd <- smooth.basis(daytime, daily$tempav, tempfdPar)$fd lambda <- 1e0 tempfdPar <- fdPar(daybasis365, harmaccelLfd, lambda) tempPCA <- pca.fd(tempfd, 4, tempfdPar) harmonics <- tempPCA$harmonics pcamodel <- lm(y ~ tempPCA$scores) pcacoefs <- summary(pcamodel)$coef # Variance proportion (staci 2 komponenty) cumsum(tempPCA$varprop) betafd3 <- pcacoefs[2, 1]*harmonics[1] + pcacoefs[3,1]*harmonics[2] + pcacoefs[4,1]*harmonics[3] # Confidence Intervals coefvar <- pcacoefs[,2]^2 betavar <- coefvar[2]*harmonics[1]^2 + coefvar[3]*harmonics[2]^2 + coefvar[4]*harmonics[3]^2 plot(betafd3, xlab="Day", ylab="Regression Coef.", ylim=c(-6e-4,1.2e-03), lwd=2) lines(betafd3 + 2*sqrt(betavar), lty=2, lwd=1) lines(betafd3 - 2*sqrt(betavar), lty=2, lwd=1) # ggplot # Beta3 <- c(eval.fd(daytime, betafd3)) # pom3 <- c(sqrt(eval.fd(daytime, betavar))) # df3.CI <- data.frame(Day = daytime, Beta = c(Beta3), # lower = Beta3 - 1.96*pom3, upper = Beta3 + 1.96*pom3) # # p <- ggplot(df3.CI, aes(x = Day, y = Beta)) # p <- p + geom_line() # p <- p + geom_hline(aes(yintercept = 0), linetype = "dashed") # p <- p + geom_line(aes(y = lower), linetype = "dashed") # p <- p + geom_line(aes(y = upper), linetype = "dashed") # p <- p + labs (x = "Days", y = "Beta for Temperature") # p <- p + theme_bw() # p # regression # yhat3 <- inprod(tempfd, betafd3) # inner product XI <- cbind(1, tempPCA$scores) yhat3 <- XI%*%pcacoefs[,1] # Assesing quality of fit yres3 <- y - yhat3 degfr <- 5 n <- length(yhat2) SSE3 <- sum(yres3^2) SSE0 <- sum((y - mean(y))^2) RSQ3 <- (SSE0 - SSE3)/SSE0 Fratio3 <- ((SSE0-SSE3)/(degfr - 1))/(SSE3/(n - degfr)) # 4. Nonparametric regression source("npfda.R") X <- t(eval.fd(daytime, tempfd)) ytesthat <- numeric(35) for (ind in 1:35){ ytrain <- y[-ind] Xtrain <- X[-ind, ] ytest <- y[ind] Xtest <- X[ind, ] result.cv1 <- funopare.kernel.cv(ytrain, Xtrain, Xtest, q = 0, range.grid = dayrng, nknot = min(c(length(daytime)-4)%/%2, 20)) ytesthat[ind] <- result.cv1$Predicted.values } # plot(y, ytesthat) # abline(0,1) # srovnani fitu df.fits.srov <- data.frame(y = c(y), yhat = c(yhat1, yhat2, yhat3, ytesthat), Type = factor(rep(c("1. Basis expansion", "2. Roughness Penalty", "3. Functional PCA", "4. Nonparametric"), each = length(y)))) p <- ggplot(df.fits.srov, aes(x = y, y = yhat, colour = Type)) p <- p + geom_point(size = 2) p <- p + geom_abline(slope = 1, intercept = 0) p <- p + labs (x = "y", y = "Estimates of y") p <- p + theme_bw() p ####################### # Problems # ####################### ### Medfly Data # 0) From the previous lesson load("medfly.Rdata") lifetime <- as.numeric(medfly$lifetime) days <- 1:26 rangeval <- range(days) bbasis <- create.bspline.basis(rangeval, norder = 4, breaks = days) curv.Lfd <- int2Lfd(2) lambda <- 54.6 mPar <- fdPar(bbasis, curv.Lfd, lambda) medfd <- smooth.basis(days, medfly$eggcount, mPar) plot(medfd) # PCA medpca <- pca.fd(medfd$fd, nharm = 6) cumsum(medpca$varprop) # 3 components are needed plot(medpca$harmonics[1:3])