FRUTALES

 
# Load PASWR library from the menu and Type satfruit



library(PASWR)

attach(satfruit) 

#File dimension and variables names

dim(satfruit)

names(satfruit)


# Scatterplots

pairs(satfruit[,c(17:10)])
pairs(satfruit[,c(17,9:3)])


#Descriptive analysis

summary(satfruit)

# show fruits histograms in each small area

library(lattice)
histogram(~OBS|SArea,as.table=TRUE)

# show classified fruits histograms in each small area

histogram(~FR|SArea,as.table=TRUE)

# Use boxplot  to show the observed fruits variability per areas.

# Use barplot to show the observed fruits total per area 
#and their standard errors



par(mfrow=c(2,2))
boxplot(split(OBS,SArea),col="blue",main="Observed Fruits")
boxplot(split(FR,SArea),col="yellow",main="Classified Fruits")

medias<-sapply(split(OBS,SArea),mean) 
des.e<-sapply(split(OBS,SArea),sd)
ee<-des.e/sqrt(table(SArea)) #se of the mean
tabla<-rbind(medias,ee)
barplot(tabla,col=c("blue","red"),ylab="OBS",xlab="SArea",
legend=rownames(tabla),main="Observed Fruits Means")



medias<-sapply(split(FR,SArea),mean) 
des.e<-sapply(split(FR,SArea),sd)
ee<-des.e/sqrt(table(SArea))
tabla<-rbind(medias,ee)
barplot(tabla,col=c("yellow","red"),ylab="FR",xlab="SArea",
legend=rownames(tabla),main="Classified Fruits Means")


# Marginal correlation of OBS with the rest of variables


round(cor(satfruit[,17], satfruit[,3:16]),2)

# Linear regression of OBS vs. FR per small areas and coefficients of determination

par(mfrow=c(2,2))
 r2=summary(lm(satfruit$OBS~satfruit$FR))$r.squared
 r=sqrt(r2)
 plot(satfruit$FR, satfruit$OBS,main=expression(paste(plain(R^2),plain("=0.82 "))))
 abline(lsfit(satfruit$FR,satfruit$OBS),lwd=2,col=1)

 datos.R63=satfruit[satfruit$SArea=="R63",]
 cor(datos.R63$OBS,datos.R63$FR)
 plot(datos.R63$FR,datos.R63$OBS,main=expression(paste(plain(R^2),plain("=1 "))))
 abline(lsfit(datos.R63$FR,datos.R63$OBS),col="green",lwd=2)


 datos.R67=satfruit[satfruit$SArea=="R67",]
 cor(datos.R67$OBS,datos.R67$FR)
  plot(datos.R67$FR,datos.R67$OBS,main=expression(paste(plain(R^2),plain("=0.89 "))))
 abline(lsfit(datos.R67$FR,datos.R67$OBS),col="blue",lwd=2)


 datos.R68=satfruit[satfruit$SArea=="R68",]
 cor(datos.R68$OBS,datos.R68$FR)
 plot(datos.R68$FR,datos.R68$OBS,main=expression(paste(plain(R^2),plain("=0.73 "))))
 abline(lsfit(datos.R68$FR,datos.R68$OBS),col="red",lwd=2)


# To make simultaneously graphics of linear regression and robust regression

panel.scatreg=function(x,y) {panel.xyplot(x,y)
panel.abline(lm(y~x),col=1,lwd=2)
panel.abline(lqs(y~x),col=3,lty=3,lwd=2)}
xyplot(FR~OBS|SArea,as.table=T,panel=panel.scatreg)
xyplot(FR~OBS,as.table=T,panel=panel.scatreg)

# Fit a linear regression of OBS versus the rest of numerical variables
# in the same order as they are recorded in the file

model.A<-lm(OBS~WH+BA+NAR+COR+SF+VI+PS+ES+AF+CO+AR+AL+OL+FR)
summary.aov(model.A)

# Selection of explanatory variables using leaps:

library(leaps)
a<-leaps(cbind(WH,BA,NAR,COR,SF,VI,PS,ES,AF,CO,AR,AL,OL,FR),OBS,method="adjr2")

which(a$adjr2==max(a$adjr2))

a$which[81,]

model.B<-lm(OBS~SF+PS+ES+AF+CO+AR+AL+OL+FR)
summary.aov(model.B)
summary(model.B)$r.squared  #compute R^2 for the selected model
summary(model.B)$adj.r.squared
AIC(model.B) # AIC criterion
AIC(model.B, k = log(nrow(satfruit)))  #BIC criterion


# Selection of explanatory variables using step:

step(model.A)
model.C<-lm(OBS~PS+AL+OL+FR)
summary.aov(model.C)
AIC(model.C)
AIC(model.C, k = log(nrow(satfruit)))  #BIC criterion


# Default diagnostic plots with Model (C)

par(mfrow=c(2,2))
plot(model.C)



# Hat values, residuals, observed versus fitted Model (C)

a<-model.C
iden<-function(y, a = 3, c=0.05)
{
        n <- length(y)
        oy <- order(abs(y))
        b<-y*c
        which <- oy[(n - a + 1):n]
        text(seq(1:n)[which], y[which]+b[which], as.character(which))
list(y=y,b=b)}

 par(mfrow=c(2,2),pty="s")
 plot(hatvalues(a),type="h",xlab="",ylim=c(0,1),
 ylab="diagonales de la matriz hat")
 X<-model.matrix(a)
 abline(h=2*(ncol(X))/nrow(X))
 iden(hatvalues(a))
 title("a) Elementos
 diagonales \n de la matriz hat")

 plot(rstandard(a),type="n",xlab="",ylab="r_i")
 text(rstandard(a))
 title("b) Standardized Residuals")

 plot(rstudent(a),type="n",xlab="",ylab="r_i^*")
 text(rstudent(a))
 abline(h=qt(0.025, a$df.residual-1))
 abline(h=qt(0.975,a$df.residual-1))
 title("c) Studentized Residuals")

 prediccion<-predict.lm(a)
 plot(prediccion, satfruit$OBS,xlab="fitted values",ylab="y",type="n",
 main="d) Observados vs. ajustados")
 abline(0,1)
 text(prediccion, satfruit$OBS)

# Cook distance and Dffits

par(mfrow=c(2,2))
cd.C<-cooks.distance(a)
plot(cd.C, ylab="Cooks Distance", ylim=c(0,12))
iden(cd.C, a=1)
crit.value<-qf(0.5, ncol(X), nrow(X)-ncol(X))
abline(h=crit.value, lty=2)

dffits.modelC<-dffits(a)
plot(dffits.modelC, ylab="Dffits", ylim=c(-10,2))
iden(dffits.modelC, a=3)
crit.value<-2*sqrt(ncol(X)/nrow(X))
abline(h=c(-crit.value, crit.value), lty=2)


#DFbetas


par(mfrow=c(2,2))
dfbetas.modelC<-dfbetas(a)
plot(dfbetas.modelC[,2], ylab="dfbetas[,2]", ylim=c(-1,1))
iden(dfbetas.modelC[,2], a=3)
crit.value<-2/sqrt(nrow(X))
abline(h=c(-crit.value, crit.value), lty=2)

 
plot(dfbetas.modelC[,3], ylab="dfbetas[,3]", ylim=c(-1,1))
iden(dfbetas.modelC[,3], a=3)
crit.value<-2/sqrt(nrow(X))
abline(h=c(-crit.value, crit.value), lty=2)

 
plot(dfbetas.modelC[,4], ylab="dfbetas[,4]", ylim=c(-1,1))
iden(dfbetas.modelC[,4], a=3)
crit.value<-2/sqrt(nrow(X))
abline(h=c(-crit.value, crit.value), lty=2)


plot(dfbetas.modelC[,5], ylab="dfbetas[,5]", ylim=c(-1,1))
iden(dfbetas.modelC[,5], a=3)
crit.value<-2/sqrt(nrow(X))
abline(h=c(-crit.value, crit.value), lty=2)


# Checking for normality with Shapiro Wilk test

shapiro.test(residuals(model.C))


# Checking for heteroscedasticity

library(lmtest)
bptest(model.C)


# Introducing SArea as a new explanatory variable to reduce possible spatial autocorrelation

model.A1<-lm(OBS~WH+BA+NAR+COR+SF+VI+PS+ES+AF+CO+AR+AL+OL+FR+SArea)
model.D<-step(model.A1)
formula(model.D)


# Anova of model D

summary.aov(model.D)

# Confidence intervals of explanatory variables

confint(model.D)

# R2, R2a, AIC, and  BIC of Model (D)

summary(model.D)$r.squared
summary(model.D)$adj.r.squared
AIC(model.D)
AIC(model.D, k = log(nrow(satfruit)))
 

# Checking the significance of PS and AL

drop1(model.D,test="Chisq")
drop1(model.D,test="F")


# Checking simultaneously the significance of PS and AL

library(car)
confidence.ellipse(model.D, Scheffe=TRUE)
points(0,0,pch=20)
abline(v=confint(model.D)[2,],lty=2)
abline(h=confint(model.D)[3,],lty=2)

# Drop out variables PS and AL . New model E

model.E<-lm(OBS~FR+SArea)
summary.aov(model.E)


# Default diagnostics

par(mfrow=c(2,2))
plot(model.E)


# Checking for normality with Shapiro Wilk test

shapiro.test(residuals(model.E))


# Checking for heteroscedasticity

library(lmtest)
bptest(model.E)


# Hat values, residuals, observed versus fitted Model (E)

a<-model.E
iden<-function(y, a = 3, c=0.05)
{
        n <- length(y)
        oy <- order(abs(y))
        b<-y*c
        which <- oy[(n - a + 1):n]
        text(seq(1:n)[which], y[which]+b[which], as.character(which))
list(y=y,b=b)}

 par(mfrow=c(2,2),pty="s")
 plot(hatvalues(a),type="h",xlab="",ylim=c(0,1),
 ylab="diagonales de la matriz hat")
 X<-model.matrix(a)
 abline(h=2*(ncol(X))/nrow(X))
 iden(hatvalues(a))
 title("a) Diagonal Elements  of the Hat matrix")

 plot(rstandard(a),type="n",xlab="",ylab="r_i")
 text(rstandard(a))
 title("b) Standardized Residuals")

 plot(rstudent(a),type="n",xlab="",ylab="r_i^*")
 text(rstudent(a))
 abline(h=qt(0.025, a$df.residual-1))
 abline(h=qt(0.975,a$df.residual-1))
 title("c) Studentized Residuals")

 prediccion<-predict.lm(a)
 plot(prediccion, satfruit$OBS,xlab="fitted values",ylab="y",type="n",
 main="d) Observed versus Fitted Values")
 abline(0,1)
 text(prediccion, satfruit$OBS)

# Cook distance and Dffits Model E

par(mfrow=c(2,2))
cd.E<-cooks.distance(a)
plot(cd.E, ylab="Cooks Distance", ylim=c(0,12))
iden(cd.E, a=1)
crit.value<-qf(0.5, ncol(X), nrow(X)-ncol(X))
abline(h=crit.value, lty=2)

dffits.modelE<-dffits(a)
plot(dffits.modelE, ylab="Dffits", ylim=c(-1,1))
iden(dffits.modelE, a=3)
crit.value<-2*sqrt(ncol(X)/nrow(X))
abline(h=c(-crit.value, crit.value), lty=2)


#DFbetas Model E


par(mfrow=c(2,2))
dfbetas.modelE<-dfbetas(a)
plot(dfbetas.modelE[,2], ylab="dfbetas[,2]", ylim=c(-1,1))
iden(dfbetas.modelE[,2], a=3)
crit.value<-2/sqrt(nrow(X))
abline(h=c(-crit.value, crit.value), lty=2)

 
plot(dfbetas.modelE[,3], ylab="dfbetas[,3]", ylim=c(-1,1))
iden(dfbetas.modelE[,3], a=3)
crit.value<-2/sqrt(nrow(X))
abline(h=c(-crit.value, crit.value), lty=2)

 
plot(dfbetas.modelE[,4], ylab="dfbetas[,4]", ylim=c(-1,1))
iden(dfbetas.modelE[,4], a=3)
crit.value<-2/sqrt(nrow(X))
abline(h=c(-crit.value, crit.value), lty=2)

  

#Drop out the 46 record of Model (E)

satfruit1<-satfruit[-46,]
dim(satfruit1)
detach(satfruit)
attach(satfruit1)
model.F<-lm(OBS ~ FR + SArea)
 
#Default diagnostics plots  Model (F)


par(mfrow=c(2,2))
plot(model.F)
 
# Hat values, residuals, observed versus fitted Model (F)

a<-model.F

 par(mfrow=c(2,2),pty="s")
 plot(hatvalues(a),type="h",xlab="",ylim=c(0,1),
 ylab="diagonales de la matriz hat")
 X<-model.matrix(a)
 abline(h=2*(ncol(X))/nrow(X))
 iden(hatvalues(a))
 title("a) Diagonal Elements  of the Hat matrix")

 plot(rstandard(a),type="n",xlab="",ylab="r_i")
 text(rstandard(a))
 title("b) Standardized Residuals")

 plot(rstudent(a),type="n",xlab="",ylab="r_i^*")
 text(rstudent(a))
 abline(h=qt(0.025, a$df.residual-1))
 abline(h=qt(0.975,a$df.residual-1))
 title("c) Studentized Residuals")

 prediccion<-predict.lm(a)
 plot(prediccion, satfruit1$OBS, xlab="fitted values", ylab="y",type="n",
 main="d) Observed versus Fitted Values")
 abline(0,1)
 text(prediccion, satfruit1$OBS)

# Cook distance and Dffits Model F

par(mfrow=c(2,2))
cd.F<-cooks.distance(a)
plot(cd.F, ylab="Cooks Distance", ylim=c(0,12))
iden(cd.F, a=1)
crit.value<-qf(0.5, ncol(X), nrow(X)-ncol(X))
abline(h=crit.value, lty=2)

dffits.modelF<-dffits(a)
plot(dffits.modelF, ylab="Dffits", ylim=c(-1,1))
iden(dffits.modelF, a=3)
crit.value<-2*sqrt(ncol(X)/nrow(X))
abline(h=c(-crit.value, crit.value), lty=2)


#DFbetas Model F


par(mfrow=c(2,2))
dfbetas.modelF<-dfbetas(a)
plot(dfbetas.modelF[,2], ylab="dfbetas[,2]", ylim=c(-1,1))
iden(dfbetas.modelF[,2], a=3)
crit.value<-2/sqrt(nrow(X))
abline(h=c(-crit.value, crit.value), lty=2)

 
plot(dfbetas.modelF[,3], ylab="dfbetas[,3]", ylim=c(-1,1))
iden(dfbetas.modelF[,3], a=3)
crit.value<-2/sqrt(nrow(X))
abline(h=c(-crit.value, crit.value), lty=2)

 
plot(dfbetas.modelF[,4], ylab="dfbetas[,4]", ylim=c(-1,1))
iden(dfbetas.modelF[,4], a=3)
crit.value<-2/sqrt(nrow(X))
abline(h=c(-crit.value, crit.value), lty=2)

  

# Checking for normality with Shapiro Wilk test

shapiro.test(residuals(model.F))


# Checking for heteroscedasticity

library(lmtest)
bptest(model.F)

 
 


# Confidence intervals Model F

confint(model.F)


# estimate \beta_1 in square meters

summary(model.F)$coef[2,1]*10000

# Predictions


summary(model.F)$coef[1,1]+summary(model.F)$coef[2,1]*(97044.28) #R63

summary(model.F)$coef[1,1]+summary(model.F)$coef[2,1]*4878603.43+summary(model.F)$coef[3,1]

summary(model.F)$coef[1,1]+summary(model.F)$coef[2,1]*2883488.24+summary(model.F)$coef[4,1]
 

   # Simpler

FR.pob<-c(97044.28, 4878603.43, 2883488.24)
SArea.pob<-c("R63","R67","R68")
newdata<-data.frame(FR.pob, SArea.pob)
names(newdata)<-c("FR", "SArea")
predict(model.F, newdata)

 

# Coefficients of the regression lines for every area Model F

contrasts(satfruit1$SArea)

coef(model.F) 

coef.R63<-cbind(coef(model.F)[1],coef(model.F)[2])

coef.R67<-cbind(coef(model.F)[1]+coef(model.F)[3], coef(model.F)[2])

coef.R68<-cbind(coef(model.F)[1]+coef(model.F)[4], coef(model.F)[2])


# Regression lines per area 

par(mfrow=c(2,2))

satfruit1.R63=satfruit1[satfruit1$SArea=="R63",]
plot(satfruit1.R63$FR,satfruit1.R63$OBS,main="R63")
abline(coef.R63,col="green",lwd=2)


satfruit1.R67=satfruit1[satfruit1$SArea=="R67",]
plot(satfruit1.R67$FR,satfruit1.R67$OBS,main="R67")
abline(coef.R67,col="blue",lwd=2)


satfruit1.R68=satfruit1[satfruit1$SArea=="R68",]
plot(satfruit1.R68$FR,satfruit1.R68$OBS,main="R68")
abline(coef.R68,col="red",lwd=2)


# Observed versus predicted GLOBAL Model F

par(mfrow=c(1,1))
 prediccion<-predict.lm(model.F)
 plot(satfruit1$OBS,prediccion,ylab="fitted values",xlab="y",type="n",
 main="Fitted vs. Observed")
 abline(0,1,col="red",lwd=2)
 text(satfruit1$OBS,prediccion)


# Direct Estimation

means.AREAS<-sapply(split(satfruit$OBS,satfruit$SArea),mean)
TOTAL.AREAS<-means.AREAS*c(119,703, 564)

#Model prediction

TOTAL.AREASPRED<-predict(model.F, newdata)

#Comparison of Direct estimation versus Model Prediction

resumen<-rbind(TOTAL.AREAS,TOTAL.AREASPRED)

row.names(resumen)<-c("Direct Est.","Model Prediction")
barplot(resumen/10000,legend=rownames(resumen),main="Direct estimates
and Predicted Fruits Totals in ha.",beside=TRUE,col=c(3,4))