Data from 1991-1995 are contained in 05-1-galton-x.csv (05-1-galton-x.csv), Although the book says the
data is from HistData: Data Sets from the History of Statistics and Data Visualization, 2018 (https://cran.rproject.org/web/packages/HistData/index.html),
I have actually used this version of Galton’s Height Data
(http://www.randomservices.org/random/data/Galton.html)
galton<-read.csv("05-1-galton-x.csv",header=TRUE) # read csv file into dataframe ga
lton
attach(galton) #uncomment if/while necessary
summary(galton)
## Family Father Mother Gender
## Length:898 Min. :62.00 Min. :58.00 Length:898
## Class :character 1st Qu.:68.00 1st Qu.:63.00 Class :character
## Mode :character Median :69.00 Median :64.00 Mode :character
## Mean :69.23 Mean :64.08
## 3rd Qu.:71.00 3rd Qu.:65.50
## Max. :78.50 Max. :70.50
## Height Kids
## Min. :56.00 Min. : 1.000
## 1st Qu.:64.00 1st Qu.: 4.000
## Median :66.50 Median : 6.000
## Mean :66.76 Mean : 6.136
## 3rd Qu.:69.70 3rd Qu.: 8.000
## Max. :79.00 Max. :15.000
# summary statistics
# need means for unique fathers and mothers - identify first mention of each family
Unique.Fathers=numeric()
Unique.Mothers=numeric()
nunique=1 # number of unique families
Unique.Fathers[1] = Father[1]
Unique.Mothers[1] = Mother[1]
for(i in 2:length(Family))
{
if(Family[i] != Family[i-1]){
nunique=nunique+1
Unique.Fathers[nunique]=Father[i]
Unique.Mothers[nunique]=Mother[i]
}
}
length(Unique.Fathers)
## [1] 197
summary(Unique.Fathers)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 62.00 68.00 69.50 69.35 71.00 78.50
05-1-galton-heights-x.knit ﬁle:///home/mojmird/ownCloud%20-%20Mojm%C3%ADr...
1 z 6 10. 03. 23 10:39
sd(Unique.Fathers)
## [1] 2.622034
length(Unique.Mothers)
## [1] 197
summary(Unique.Mothers)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 58.00 62.70 64.00 63.98 65.50 70.50
sd(Unique.Mothers)
## [1] 2.355607
Son = Height[Gender=="M"]
length(Son)
## [1] 465
summary(Son)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 60.00 67.50 69.20 69.23 71.00 79.00
sd(Son)
## [1] 2.631594
Daughter = Height[Gender=="F"]
length(Daughter)
## [1] 433
summary(Daughter)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 56.00 62.50 64.00 64.11 65.50 70.50
sd(Daughter)
05-1-galton-heights-x.knit ﬁle:///home/mojmird/ownCloud%20-%20Mojm%C3%ADr...
2 z 6 10. 03. 23 10:39
## [1] 2.37032
Figure 5.1 (page 124) Linear regression of sons’ on fathers’
heights
# Heights of fathers of sons
FatherS = Father[Gender=="M"]
fit <- lm(Son ~ FatherS) # linear regression data in fit
Predicted <- predict(fit) # Get the predicted values
summary(fit)
##
## Call:
## lm(formula = Son ~ FatherS)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.3774 -1.4968 0.0181 1.6375 9.3987
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.25891 3.38663 11.30 <2e-16 ***
## FatherS 0.44775 0.04894 9.15 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.424 on 463 degrees of freedom
## Multiple R-squared: 0.1531, Adjusted R-squared: 0.1513
## F-statistic: 83.72 on 1 and 463 DF, p-value: < 2.2e-16
FatherS.j <- jitter(FatherS, factor=5)
Son.j <- jitter(Son, factor=5)
xlims=ylims=c(55,80)
par(mfrow=c(1,1), mar=c(4,4,2,0), pty="s") # square plot
plot(FatherS.j, Son.j, xlim=xlims,ylim=ylims,cex=0.7,
xlab="father's height (inches)",ylab="son's height (inches)" , col="gray68")
lines(c(xlims[1],xlims[2]),c(xlims[1],xlims[2]),lty=2 )
lines(Predicted~FatherS,lwd=2)
05-1-galton-heights-x.knit ﬁle:///home/mojmird/ownCloud%20-%20Mojm%C3%ADr...
3 z 6 10. 03. 23 10:39
Now in ggplot
library(ggplot2)
# create new data frame with exact and jittered, and predcted values
Males = cbind.data.frame(FatherS,FatherS.j,Son,Son.j,Predicted)
p <- ggplot(Males, aes(x=FatherS, y=Son)) # initial plot object
p <- p + geom_point(x=FatherS.j,y=Son.j,shape= 1) # defines scatter type plot
p <- p + labs(x="Father's height (inches)", y= "Son's height (inches)") # adds x an
d y axis labels
p <- p + theme(legend.position="none")#, legend.box = "horizontal") # removes the l
egend
p <- p + expand_limits(x = c(55,80),y = c(55,80)) # expand the axis limits
p <- p + geom_line(aes(FatherS,Predicted),size=1.5) # add previously fitted linear
regression line
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
05-1-galton-heights-x.knit ﬁle:///home/mojmird/ownCloud%20-%20Mojm%C3%ADr...
4 z 6 10. 03. 23 10:39
p <- p + geom_abline(slope=1, linetype="dashed") # line to represent equality betwe
en son and father height
# select single data points by CSV datarow numbers
pointA=c(137)
pointB=c(28)
# plot residual line and end points for selectedpointA
p <- p + geom_point(aes(x=FatherS.j[pointA], y = Predicted[pointA]), shape = 1)
p <- p + geom_point(aes(x=FatherS.j[pointA], y = Son.j[pointA]), shape = 1)
p <- p + geom_segment(linetype="dashed", size=1, colour="purple",aes(x=FatherS.j[po
intA],y=Son.j[pointA],xend = FatherS.j[pointA], yend = Predicted[pointA])) #p <- p
+ p
# plot residual line and end points for pointB
p <- p + geom_point(aes(x=FatherS.j[pointB], y = Predicted[pointB]), shape = 1)
p <- p + geom_point(aes(x=FatherS.j[pointB], y = Son.j[pointB]), shape = 1)
p <- p + geom_segment(linetype="dashed", size=1, colour="purple",aes(x=FatherS.j[po
intB],y=Son.j[pointB],xend = FatherS.j[pointB], yend = Predicted[pointB]))
p #displays the result
Figure 5.1 Scatter of heights of 465 fathers and sons from Galton’s data (many fathers are repeated since
they have multiple sons). A jitter has been added to separate the points, and the diagonal dashed line
represents exact equality between son and father’s heights. The solid line is the standard ‘best-ﬁt’ line. Each
point gives rise to a ‘residual’ (dashed line), which is the size of the error were we to use the line to predict a
son’s height from his father’s.
05-1-galton-heights-x.knit ﬁle:///home/mojmird/ownCloud%20-%20Mojm%C3%ADr...
5 z 6 10. 03. 23 10:39
05-1-galton-heights-x.knit ﬁle:///home/mojmird/ownCloud%20-%20Mojm%C3%ADr...
6 z 6 10. 03. 23 10:39