FRUTALES # Load PASWR library from the menu and Type satfruit library(PASWR) attach(satfruit) #File dimension and variables names dim(satfruit) names(satfruit) # Scatterplots pairs(satfruit[,c(17:10)]) pairs(satfruit[,c(17,9:3)]) #Descriptive analysis summary(satfruit) # show fruits histograms in each small area library(lattice) histogram(~OBS|SArea,as.table=TRUE) # show classified fruits histograms in each small area histogram(~FR|SArea,as.table=TRUE) # Use boxplot to show the observed fruits variability per areas. # Use barplot to show the observed fruits total per area #and their standard errors par(mfrow=c(2,2)) boxplot(split(OBS,SArea),col="blue",main="Observed Fruits") boxplot(split(FR,SArea),col="yellow",main="Classified Fruits") medias<-sapply(split(OBS,SArea),mean) des.e<-sapply(split(OBS,SArea),sd) ee<-des.e/sqrt(table(SArea)) #se of the mean tabla<-rbind(medias,ee) barplot(tabla,col=c("blue","red"),ylab="OBS",xlab="SArea", legend=rownames(tabla),main="Observed Fruits Means") medias<-sapply(split(FR,SArea),mean) des.e<-sapply(split(FR,SArea),sd) ee<-des.e/sqrt(table(SArea)) tabla<-rbind(medias,ee) barplot(tabla,col=c("yellow","red"),ylab="FR",xlab="SArea", legend=rownames(tabla),main="Classified Fruits Means") # Marginal correlation of OBS with the rest of variables round(cor(satfruit[,17], satfruit[,3:16]),2) # Linear regression of OBS vs. FR per small areas and coefficients of determination par(mfrow=c(2,2)) r2=summary(lm(satfruit$OBS~satfruit$FR))$r.squared r=sqrt(r2) plot(satfruit$FR, satfruit$OBS,main=expression(paste(plain(R^2),plain("=0.82 ")))) abline(lsfit(satfruit$FR,satfruit$OBS),lwd=2,col=1) datos.R63=satfruit[satfruit$SArea=="R63",] cor(datos.R63$OBS,datos.R63$FR) plot(datos.R63$FR,datos.R63$OBS,main=expression(paste(plain(R^2),plain("=1 ")))) abline(lsfit(datos.R63$FR,datos.R63$OBS),col="green",lwd=2) datos.R67=satfruit[satfruit$SArea=="R67",] cor(datos.R67$OBS,datos.R67$FR) plot(datos.R67$FR,datos.R67$OBS,main=expression(paste(plain(R^2),plain("=0.89 ")))) abline(lsfit(datos.R67$FR,datos.R67$OBS),col="blue",lwd=2) datos.R68=satfruit[satfruit$SArea=="R68",] cor(datos.R68$OBS,datos.R68$FR) plot(datos.R68$FR,datos.R68$OBS,main=expression(paste(plain(R^2),plain("=0.73 ")))) abline(lsfit(datos.R68$FR,datos.R68$OBS),col="red",lwd=2) # To make simultaneously graphics of linear regression and robust regression panel.scatreg=function(x,y) {panel.xyplot(x,y) panel.abline(lm(y~x),col=1,lwd=2) panel.abline(lqs(y~x),col=3,lty=3,lwd=2)} xyplot(FR~OBS|SArea,as.table=T,panel=panel.scatreg) xyplot(FR~OBS,as.table=T,panel=panel.scatreg) # Fit a linear regression of OBS versus the rest of numerical variables # in the same order as they are recorded in the file model.A<-lm(OBS~WH+BA+NAR+COR+SF+VI+PS+ES+AF+CO+AR+AL+OL+FR) summary.aov(model.A) # Selection of explanatory variables using leaps: library(leaps) a<-leaps(cbind(WH,BA,NAR,COR,SF,VI,PS,ES,AF,CO,AR,AL,OL,FR),OBS,method="adjr2") which(a$adjr2==max(a$adjr2)) a$which[81,] model.B<-lm(OBS~SF+PS+ES+AF+CO+AR+AL+OL+FR) summary.aov(model.B) summary(model.B)$r.squared #compute R^2 for the selected model summary(model.B)$adj.r.squared AIC(model.B) # AIC criterion AIC(model.B, k = log(nrow(satfruit))) #BIC criterion # Selection of explanatory variables using step: step(model.A) model.C<-lm(OBS~PS+AL+OL+FR) summary.aov(model.C) AIC(model.C) AIC(model.C, k = log(nrow(satfruit))) #BIC criterion # Default diagnostic plots with Model (C) par(mfrow=c(2,2)) plot(model.C) # Hat values, residuals, observed versus fitted Model (C) a<-model.C iden<-function(y, a = 3, c=0.05) { n <- length(y) oy <- order(abs(y)) b<-y*c which <- oy[(n - a + 1):n] text(seq(1:n)[which], y[which]+b[which], as.character(which)) list(y=y,b=b)} par(mfrow=c(2,2),pty="s") plot(hatvalues(a),type="h",xlab="",ylim=c(0,1), ylab="diagonales de la matriz hat") X<-model.matrix(a) abline(h=2*(ncol(X))/nrow(X)) iden(hatvalues(a)) title("a) Elementos diagonales \n de la matriz hat") plot(rstandard(a),type="n",xlab="",ylab="r_i") text(rstandard(a)) title("b) Standardized Residuals") plot(rstudent(a),type="n",xlab="",ylab="r_i^*") text(rstudent(a)) abline(h=qt(0.025, a$df.residual-1)) abline(h=qt(0.975,a$df.residual-1)) title("c) Studentized Residuals") prediccion<-predict.lm(a) plot(prediccion, satfruit$OBS,xlab="fitted values",ylab="y",type="n", main="d) Observados vs. ajustados") abline(0,1) text(prediccion, satfruit$OBS) # Cook distance and Dffits par(mfrow=c(2,2)) cd.C<-cooks.distance(a) plot(cd.C, ylab="Cooks Distance", ylim=c(0,12)) iden(cd.C, a=1) crit.value<-qf(0.5, ncol(X), nrow(X)-ncol(X)) abline(h=crit.value, lty=2) dffits.modelC<-dffits(a) plot(dffits.modelC, ylab="Dffits", ylim=c(-10,2)) iden(dffits.modelC, a=3) crit.value<-2*sqrt(ncol(X)/nrow(X)) abline(h=c(-crit.value, crit.value), lty=2) #DFbetas par(mfrow=c(2,2)) dfbetas.modelC<-dfbetas(a) plot(dfbetas.modelC[,2], ylab="dfbetas[,2]", ylim=c(-1,1)) iden(dfbetas.modelC[,2], a=3) crit.value<-2/sqrt(nrow(X)) abline(h=c(-crit.value, crit.value), lty=2) plot(dfbetas.modelC[,3], ylab="dfbetas[,3]", ylim=c(-1,1)) iden(dfbetas.modelC[,3], a=3) crit.value<-2/sqrt(nrow(X)) abline(h=c(-crit.value, crit.value), lty=2) plot(dfbetas.modelC[,4], ylab="dfbetas[,4]", ylim=c(-1,1)) iden(dfbetas.modelC[,4], a=3) crit.value<-2/sqrt(nrow(X)) abline(h=c(-crit.value, crit.value), lty=2) plot(dfbetas.modelC[,5], ylab="dfbetas[,5]", ylim=c(-1,1)) iden(dfbetas.modelC[,5], a=3) crit.value<-2/sqrt(nrow(X)) abline(h=c(-crit.value, crit.value), lty=2) # Checking for normality with Shapiro Wilk test shapiro.test(residuals(model.C)) # Checking for heteroscedasticity library(lmtest) bptest(model.C) # Introducing SArea as a new explanatory variable to reduce possible spatial autocorrelation model.A1<-lm(OBS~WH+BA+NAR+COR+SF+VI+PS+ES+AF+CO+AR+AL+OL+FR+SArea) model.D<-step(model.A1) formula(model.D) # Anova of model D summary.aov(model.D) # Confidence intervals of explanatory variables confint(model.D) # R2, R2a, AIC, and BIC of Model (D) summary(model.D)$r.squared summary(model.D)$adj.r.squared AIC(model.D) AIC(model.D, k = log(nrow(satfruit))) # Checking the significance of PS and AL drop1(model.D,test="Chisq") drop1(model.D,test="F") # Checking simultaneously the significance of PS and AL library(car) confidence.ellipse(model.D, Scheffe=TRUE) points(0,0,pch=20) abline(v=confint(model.D)[2,],lty=2) abline(h=confint(model.D)[3,],lty=2) # Drop out variables PS and AL . New model E model.E<-lm(OBS~FR+SArea) summary.aov(model.E) # Default diagnostics par(mfrow=c(2,2)) plot(model.E) # Checking for normality with Shapiro Wilk test shapiro.test(residuals(model.E)) # Checking for heteroscedasticity library(lmtest) bptest(model.E) # Hat values, residuals, observed versus fitted Model (E) a<-model.E iden<-function(y, a = 3, c=0.05) { n <- length(y) oy <- order(abs(y)) b<-y*c which <- oy[(n - a + 1):n] text(seq(1:n)[which], y[which]+b[which], as.character(which)) list(y=y,b=b)} par(mfrow=c(2,2),pty="s") plot(hatvalues(a),type="h",xlab="",ylim=c(0,1), ylab="diagonales de la matriz hat") X<-model.matrix(a) abline(h=2*(ncol(X))/nrow(X)) iden(hatvalues(a)) title("a) Diagonal Elements of the Hat matrix") plot(rstandard(a),type="n",xlab="",ylab="r_i") text(rstandard(a)) title("b) Standardized Residuals") plot(rstudent(a),type="n",xlab="",ylab="r_i^*") text(rstudent(a)) abline(h=qt(0.025, a$df.residual-1)) abline(h=qt(0.975,a$df.residual-1)) title("c) Studentized Residuals") prediccion<-predict.lm(a) plot(prediccion, satfruit$OBS,xlab="fitted values",ylab="y",type="n", main="d) Observed versus Fitted Values") abline(0,1) text(prediccion, satfruit$OBS) # Cook distance and Dffits Model E par(mfrow=c(2,2)) cd.E<-cooks.distance(a) plot(cd.E, ylab="Cooks Distance", ylim=c(0,12)) iden(cd.E, a=1) crit.value<-qf(0.5, ncol(X), nrow(X)-ncol(X)) abline(h=crit.value, lty=2) dffits.modelE<-dffits(a) plot(dffits.modelE, ylab="Dffits", ylim=c(-1,1)) iden(dffits.modelE, a=3) crit.value<-2*sqrt(ncol(X)/nrow(X)) abline(h=c(-crit.value, crit.value), lty=2) #DFbetas Model E par(mfrow=c(2,2)) dfbetas.modelE<-dfbetas(a) plot(dfbetas.modelE[,2], ylab="dfbetas[,2]", ylim=c(-1,1)) iden(dfbetas.modelE[,2], a=3) crit.value<-2/sqrt(nrow(X)) abline(h=c(-crit.value, crit.value), lty=2) plot(dfbetas.modelE[,3], ylab="dfbetas[,3]", ylim=c(-1,1)) iden(dfbetas.modelE[,3], a=3) crit.value<-2/sqrt(nrow(X)) abline(h=c(-crit.value, crit.value), lty=2) plot(dfbetas.modelE[,4], ylab="dfbetas[,4]", ylim=c(-1,1)) iden(dfbetas.modelE[,4], a=3) crit.value<-2/sqrt(nrow(X)) abline(h=c(-crit.value, crit.value), lty=2) #Drop out the 46 record of Model (E) satfruit1<-satfruit[-46,] dim(satfruit1) detach(satfruit) attach(satfruit1) model.F<-lm(OBS ~ FR + SArea) #Default diagnostics plots Model (F) par(mfrow=c(2,2)) plot(model.F) # Hat values, residuals, observed versus fitted Model (F) a<-model.F par(mfrow=c(2,2),pty="s") plot(hatvalues(a),type="h",xlab="",ylim=c(0,1), ylab="diagonales de la matriz hat") X<-model.matrix(a) abline(h=2*(ncol(X))/nrow(X)) iden(hatvalues(a)) title("a) Diagonal Elements of the Hat matrix") plot(rstandard(a),type="n",xlab="",ylab="r_i") text(rstandard(a)) title("b) Standardized Residuals") plot(rstudent(a),type="n",xlab="",ylab="r_i^*") text(rstudent(a)) abline(h=qt(0.025, a$df.residual-1)) abline(h=qt(0.975,a$df.residual-1)) title("c) Studentized Residuals") prediccion<-predict.lm(a) plot(prediccion, satfruit1$OBS, xlab="fitted values", ylab="y",type="n", main="d) Observed versus Fitted Values") abline(0,1) text(prediccion, satfruit1$OBS) # Cook distance and Dffits Model F par(mfrow=c(2,2)) cd.F<-cooks.distance(a) plot(cd.F, ylab="Cooks Distance", ylim=c(0,12)) iden(cd.F, a=1) crit.value<-qf(0.5, ncol(X), nrow(X)-ncol(X)) abline(h=crit.value, lty=2) dffits.modelF<-dffits(a) plot(dffits.modelF, ylab="Dffits", ylim=c(-1,1)) iden(dffits.modelF, a=3) crit.value<-2*sqrt(ncol(X)/nrow(X)) abline(h=c(-crit.value, crit.value), lty=2) #DFbetas Model F par(mfrow=c(2,2)) dfbetas.modelF<-dfbetas(a) plot(dfbetas.modelF[,2], ylab="dfbetas[,2]", ylim=c(-1,1)) iden(dfbetas.modelF[,2], a=3) crit.value<-2/sqrt(nrow(X)) abline(h=c(-crit.value, crit.value), lty=2) plot(dfbetas.modelF[,3], ylab="dfbetas[,3]", ylim=c(-1,1)) iden(dfbetas.modelF[,3], a=3) crit.value<-2/sqrt(nrow(X)) abline(h=c(-crit.value, crit.value), lty=2) plot(dfbetas.modelF[,4], ylab="dfbetas[,4]", ylim=c(-1,1)) iden(dfbetas.modelF[,4], a=3) crit.value<-2/sqrt(nrow(X)) abline(h=c(-crit.value, crit.value), lty=2) # Checking for normality with Shapiro Wilk test shapiro.test(residuals(model.F)) # Checking for heteroscedasticity library(lmtest) bptest(model.F) # Confidence intervals Model F confint(model.F) # estimate \beta_1 in square meters summary(model.F)$coef[2,1]*10000 # Predictions summary(model.F)$coef[1,1]+summary(model.F)$coef[2,1]*(97044.28) #R63 summary(model.F)$coef[1,1]+summary(model.F)$coef[2,1]*4878603.43+summary(model.F)$coef[3,1] summary(model.F)$coef[1,1]+summary(model.F)$coef[2,1]*2883488.24+summary(model.F)$coef[4,1] # Simpler FR.pob<-c(97044.28, 4878603.43, 2883488.24) SArea.pob<-c("R63","R67","R68") newdata<-data.frame(FR.pob, SArea.pob) names(newdata)<-c("FR", "SArea") predict(model.F, newdata) # Coefficients of the regression lines for every area Model F contrasts(satfruit1$SArea) coef(model.F) coef.R63<-cbind(coef(model.F)[1],coef(model.F)[2]) coef.R67<-cbind(coef(model.F)[1]+coef(model.F)[3], coef(model.F)[2]) coef.R68<-cbind(coef(model.F)[1]+coef(model.F)[4], coef(model.F)[2]) # Regression lines per area par(mfrow=c(2,2)) satfruit1.R63=satfruit1[satfruit1$SArea=="R63",] plot(satfruit1.R63$FR,satfruit1.R63$OBS,main="R63") abline(coef.R63,col="green",lwd=2) satfruit1.R67=satfruit1[satfruit1$SArea=="R67",] plot(satfruit1.R67$FR,satfruit1.R67$OBS,main="R67") abline(coef.R67,col="blue",lwd=2) satfruit1.R68=satfruit1[satfruit1$SArea=="R68",] plot(satfruit1.R68$FR,satfruit1.R68$OBS,main="R68") abline(coef.R68,col="red",lwd=2) # Observed versus predicted GLOBAL Model F par(mfrow=c(1,1)) prediccion<-predict.lm(model.F) plot(satfruit1$OBS,prediccion,ylab="fitted values",xlab="y",type="n", main="Fitted vs. Observed") abline(0,1,col="red",lwd=2) text(satfruit1$OBS,prediccion) # Direct Estimation means.AREAS<-sapply(split(satfruit$OBS,satfruit$SArea),mean) TOTAL.AREAS<-means.AREAS*c(119,703, 564) #Model prediction TOTAL.AREASPRED<-predict(model.F, newdata) #Comparison of Direct estimation versus Model Prediction resumen<-rbind(TOTAL.AREAS,TOTAL.AREASPRED) row.names(resumen)<-c("Direct Est.","Model Prediction") barplot(resumen/10000,legend=rownames(resumen),main="Direct estimates and Predicted Fruits Totals in ha.",beside=TRUE,col=c(3,4))