#Prevzato z knihy: Andersen, P. K., Skovgaard, L. T. (2010) Regression with Linear Predictors, Springer.\

###   Example concerning body mass index and vitamin D status.
###   
###   The data presented here come from a large study on vitamin D status in
###   four European countries, conducted by Rikke Andersen,
###   Fodevaredirektoratet, Denmark. The data provide age, body mass index
###   and vitamin D status for 420 women from 4 European countries.  Body
###   mass index (BMI) is a height corrected weight measure defined as
###   weight (in kilos) divided by height (in meters) squared.  Vitamin D
###   status is given via a measurement of 25-hydroxy-vitamin D (25OHD) in
###   serum and among the questions to be addressed using these data is
###   whether the 25OHD-level depends on BMI and on age.
###   
###   Variable list:
###   
###   age                    age of the individual
###   
###   bmi                    body mass index
###   
###   country                country of residence
###                          1: Denmark
###                          2: Finland
###                          4: Ireland
###                          6: Poland
###   
###   category               1: girls
###                          2: women
###   
###   vitd		       the level of vitamin D, serum-25 hydroxi
###   		       S25OHD
###   
###   sunexp		       sun exposure
###   		       1: avoid sun
###   		       2: sometimes
###   		       3: prefer sun
###   
###   vitdintake	       Vitamin D intake


####
#priprava dat
vitaminD <- read.csv2("http://staff.pubhealth.ku.dk/~linearpredictors/datafiles/VitaminD.csv",
                      sep = ";",
                      dec = ".",
                      header = TRUE,
                      colClasses =
                      ## "country","category","vitd", "age","bmi","sunexp","vitdintake"
                      c("factor", "factor","numeric","numeric","numeric","factor","numeric"),
                      na.strings="."
                      )


vitaminD <- within(vitaminD, {
	country <- factor(country,levels=c(1,2,4,6),labels=c("Denmark","Finland","Ireland","Poland"))
	category <- factor(category,levels=c(1,2),labels=c("girls","women"))	
	sunexp <- factor(sunexp,levels=c(1,2,3),labels=c("avoid sun","sometimes","prefer sun"))	
})


## omitting individuals considered underweight
vitaminD <- vitaminD[vitaminD$bmi >= 18.5,]





####
#women
women <- subset(vitaminD, category == "women")

## number of women from each country
table(women$country)

## countrywise medians of variables
#aplikovat podle statu
countrymedians <- function(x)tapply(x, women$country, median)
#aplikovat jednotlive na vybrane sloupce
apply(subset(women, select = c(vitd, age, bmi, vitdintake)), 2, countrymedians) 

#Polsko je ponekud specificke - malo vitaminu, hodne BMI





## Irish women
irlwomen <- subset(vitaminD, vitaminD$country == "Ireland" & vitaminD$category == "women")
#irlwomen <- women

###





#A priori overeni normality rezidui na jednoduchem modelu


## Factor with levels corresponding to normal resp. overweight
## 1 = greater or equal to 25
irlwomen$bmigroup2 <- cut(irlwomen$bmi, c(0,25,Inf), right = F, labels = 0:1) 


## Linear regression
model <- lm(vitd ~ bmi,
            data = irlwomen)

par(mfrow=c(2,2))

resrange <- c(-30,60)
qqnorm(model$residuals[irlwomen$bmigroup2 == 0],
       ylab = "Residual, normal weight women",
       xlab = "Normal quantiles",
       main = "",
       ylim = resrange)
abline(0, sqrt(var(model$residuals[irlwomen$bmigroup2 == 0])), lty = "21")
  
qqnorm(model$residuals[irlwomen$bmigroup2 == 1],
       ylab = "Residual, overweight women",
       xlab = "Normal quantiles",
       main = "",
       ylim = resrange)
abline(0, sqrt(var(model$residuals[irlwomen$bmigroup2 == 1])), lty = "21")

qqnorm(model$residuals,
       ylab = "Residual, all women",
       xlab = "Normal quantiles",
       main = "",
       ylim = resrange)
abline(0, sqrt(var(model$residuals)), lty = "21")

#prozkoumani rozlozeni rezidui
hist(model$residuals,
     freq = FALSE,
     yaxp = c(0,0.04,4),
     xlim =  c(-60,60),
     breaks = seq(-57.5, 57.5, 5),
     main = "",
     xlab = "Residual"
     )
box()
curve(dnorm(x,
            mean = mean(model$residuals),
            sd = sd(model$residuals)),
      add = TRUE
      )
      
      
      


## Linear regression - TRANSFORMATION
model <- lm(log10(vitd) ~ bmi,
            data = irlwomen)

par(mfrow=c(2,2))

resrange <- c(-0.4,0.4)
qqnorm(model$residuals[irlwomen$bmigroup2 == 0],
       ylab = "Residual, normal weight women",
       xlab = "Normal quantiles",
       main = "",
       ylim = resrange)
abline(0, sqrt(var(model$residuals[irlwomen$bmigroup2 == 0])), lty = "21")

qqnorm(model$residuals[irlwomen$bmigroup2 == 1],
       ylab = "Residual, overweight women",
       xlab = "Normal quantiles",
       main = "",
       ylim = resrange)
abline(0, sqrt(var(model$residuals[irlwomen$bmigroup2 == 1])), lty = "21")

qqnorm(model$residuals,
       ylab = "Residual, all women",
       xlab = "Normal quantiles",
       main = "",
       ylim = resrange)
abline(0, sqrt(var(model$residuals)), lty = "21")

hist(model$residuals,
     freq = FALSE,
     breaks = seq(-0.4,0.4,length.out = 10),
     main = "",
     xlab = "Residual"
     )
box()
curve(dnorm(x,
            mean = mean(model$residuals),
            sd = sd(model$residuals)),
      add = TRUE
      )
      
#POUZIJEME TEDY PRO MODELOVANI LOGARITMICKOU TRANSFORMACI      






################
#adjustace STATEM

## Poland is used as reference group in the linear regressions
## marginalni a adjustovane koeficienty

model1 <- lm(log10(vitd) ~ relevel(country, ref = "Poland"), data = women)
summary(model1)
round(confint(model1), digits = 3)

model2 <- lm(log10(vitd) ~ relevel(country, ref = "Poland") + bmi, data = women)
summary(model2)
round(confint(model2), digits = 3)

drop1(model2,test="F")
#porad jsme to moc nevyresili, staty skryvaji hodne variability





##################################
#prediktory - marginalni vztah

#spojite
par(mfrow = c(2,2))
scatter.smooth(log10(women$vitd) ~ women$bmi, 
               degree = 1,
               xlab = "BMI",
               ylab = expression(paste(log[10],"(vitamin D)")))

scatter.smooth(log10(women$vitd) ~ women$age, 
               degree = 1,
               xlab = "Age",
               ylab = expression(paste(log[10],"(vitamin D)")))

scatter.smooth(log10(women$vitd) ~ women$vitdintake, 
               degree = 1,
               xlab = "Vitamin D intake",
               ylab = expression(paste(log[10],"(vitamin D)")))

scatter.smooth(log10(women$vitd) ~ log10(women$vitdintake),
               degree = 1,
               xlab = expression(paste(log[10],"(vitamin D intake)")),
               ylab = expression(paste(log[10],"(vitamin D)")))
               
               
            
#kategorialni          
par(mfrow = c(1,2))
boxplot(log10(vitd)~sunexp,
				xlab="Sun habits",
				ylab="log10 Vitamin D",
				data=women)
boxplot(bmi~sunexp,
				xlab="Sun habits",
				ylab="BMI",
				data=women)
				
## dve kategorialni
t <- table(women$sunexp, women$country, dnn = c("Sun habits", "Country"))
addmargins(t, margin = 1)
## the same table, but now as percentages rather than counts
apply(t, 2, function(x)x/sum(x)*100)
#GRAFICKY KONTINGENCNI TABULKA
par(mfrow = c(1,1))
par(mar=c(5.1, 4.1, 4.1, 13.1), xpd=TRUE)
barplot(prop.table(table(women$sunexp, women$country),2),legend.text=c("Avoid","Sometimes","Prefer"),args.legend=list(x="topright",inset=c(-0.3,0)))

par(mar=c(5, 4, 4, 2) + 0.1, xpd=FALSE)


#################################
## ROZSIRENY MODEL


## poland as reference level
women$country <- relevel(women$country, ref="Poland")
## prefer sun sometimes (sunexp = 2) as reference level
women$sunexp <- relevel(women$sunexp, ref="sometimes")

## Linear regression on all variables
model1 <- lm(log10(vitd) ~ bmi + age + log10(vitdintake) + sunexp + country, data = women)
summary(model1)

## removing age
model2 <- lm(log10(vitd) ~ bmi + log10(vitdintake) + sunexp + country, data = women)
summary(model2)

drop1(model2,test="F")

## removing sun habits
model3 <- lm(log10(vitd) ~ bmi + log10(vitdintake) + country, data = women)
summary(model3)


drop1(model3,test="F")
#porad jsme to moc nevyresili, staty skryvaji hodne variability





#################################
## ANALYZA REZIDUI

#abecedni poradi
women$country <- relevel(women$country, ref="Ireland")
women$country <- relevel(women$country, ref="Finland")
women$country <- relevel(women$country, ref="Denmark")

## attaching residuals to dataset
women$residuals <- model3$residuals

## split into countrywise datasets
countries <- split(women, women$country)

levels <- c("Denmark", "Finland", "Ireland", "Poland")
countrypch <- c(4, 2, 19, 21)

par(mfrow = c(3,2))
## Residuals vs country  
stripchart(residuals ~ relevel(country, ref = "Denmark"),
           data = women,
           vertical = T,
           xlim = c(0.5,4.5),
           ylim = c(-0.9, 0.5),
           yaxp = c(-0.8,0.4,3),
           method = "jitter",
           xlab = "Country",
           ylab = "Residual",
           group.names = levels,
           pch = countrypch)

abline(h = 0, lty = "22")


#APLIKACE LOKALNIHO VYHLAZOVANI LOESS PRO VETSI PREHLEDNOST


## Residuals vs bmi, smoothers countrywise
scatter.smooth(women$residuals ~ women$bmi,
               pch = countrypch[women$country],
               ylim = c(-0.9, 0.5),
               yaxp = c(-0.8,0.4,3),
               xlab = "BMI",
               ylab = "Residual",
               degree = 1
               )

## Residuals vs vitamin D intake, and smoother
scatter.smooth(residuals(model3) ~ log10(women$vitdintake),
               pch = countrypch[women$country],
               degree = 1,
               ylim = c(-0.9, 0.5),
               yaxp = c(-0.8,0.4,3),
               xlab = expression(paste(log[10], "(vitamin D intake)")),
               ylab = "Residual"
               )

## residuals vs sun exposure
stripchart(countries[["Denmark"]]$residuals ~ countries[["Denmark"]]$sunexp,
           vertical = T,
           xlim = c(0.5,3.5),
           ylim = c(-0.9, 0.5),
           yaxp = c(-0.8,0.4,3),
           xlab = "Sun exposure",
           ylab = "Residual",
           method = "jitter",
           pch = 4)

stripchart(countries[["Finland"]]$residuals ~ countries[["Finland"]]$sunexp,
           vertical = T,
           method = "jitter",
           add = T,
           pch = 2)
stripchart(countries[["Ireland"]]$residuals ~ countries[["Ireland"]]$sunexp,
           vertical = T,
           method = "jitter",
           add = T,
           pch = 19)
stripchart(countries[["Poland"]]$residuals ~ countries[["Poland"]]$sunexp,
           vertical = T,
           method = "jitter",
           add = T,
           pch = 21)

abline(h = 0, lty = "11")

## residuals vs age
scatter.smooth(women$residuals ~ women$age,
               pch = countrypch[women$country],
               ylim = c(-0.9, 0.5),
               yaxp = c(-0.8,0.4,3),
               xlab = "Age",
               ylab = "Residual"
               )

## Residuals vs predicted values of log10(Vitamin D)
plot(residuals(model3) ~ predict(model3),
     pch = countrypch[women$country],
     xlab = expression(paste("Predicted value of ", log[10], "(vitamin D)")),
     ylab = "Residual",
     ylim = c(-0.9, 0.5),
     yaxp = c(-0.8,0.4,3)
     )
abline(h = 0, lty = "11")


#velka rezidua
women[women$residuals < -0.5,]



#################################
## HLEDANI INTERAKCI


### country-vitdintake

par(mfrow = c(1,2))
## log10(vitaminD) vs log10(vitdintake)
## smoothers countrywise
plot(log10(vitd) ~ log10(vitdintake),
     pch = countrypch[country],
     data = women,
     xlab = expression(paste(log[10], "(vitamin D intake)")),
     ylab = expression(paste(log[10], "(vitamin D)")),
     col = "grey"
     )

## for smoothing on a subdataset
f <- function(d){
  newvitdintake <- seq(min(d$vitdintake), max(d$vitdintake), length.out = 100)
  lines(log10(newvitdintake),
        predict(loess(log10(vitd) ~ log10(vitdintake), data = d),
                data.frame(vitdintake = newvitdintake))
        )
}
lapply(countries, f)


## Residuals vs bmi
## smoothers countrywise
plot(residuals(model3) ~ log10(vitdintake),
     pch = countrypch[country],
     data = women,
     ylim = c(-0.9, 0.5),
     yaxp = c(-0.8,0.4,3),
     xlab = expression(paste(log[10], "(vitamin D intake)")),
     ylab = "Residual",
     col = "grey"
     )

## for smoothing on a subdataset
f <- function(d){
  newvitdintake <- seq(min(d$vitdintake), max(d$vitdintake), length.out = 100)
  lines(log10(newvitdintake),
        predict(loess(residuals ~ log10(vitdintake), data = d),
                data.frame(vitdintake = newvitdintake))
        )
}

lapply(countries, f)




### country-sunexp

median.logintake <- median(log10(women$vitdintake))
median.intake <- median((women$vitdintake))
median.bmi <- median(women$bmi)

## linear regression, where bmi and log10(vitamin D intake) are centered around the medians,
## resulting in more interpretable parameters
model4 <- lm(log10(vitd) ~ I(bmi-median.bmi) + country:I(log10(vitdintake)-median.logintake) + country:sunexp + country-1,
            data = women)

summary(model4)

drop1(model4,test="F")

## graf interakci

# predpoved pro ruzne moznosti
	newdata <- expand.grid(sunexp = factor(c("avoid sun","sometimes","prefer sun"),levels=c("avoid sun","sometimes","prefer sun")),
	                       country = factor(c("Denmark","Finland","Ireland","Poland")),
	                       bmi = median.bmi,
	                       vitdintake = median(women$vitdintake))
	newdata <- data.frame(newdata)
newdata$pred <- predict(model4, newdata = newdata)


# vykresleni predpovedi
countries <<- split(newdata, newdata$country)

par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE)
stripchart(newdata$pred ~ newdata$sunexp,
           xlim = c(0.75,3.25),
           vertical = T,
           col = "white", ## not plotting
           group.names = c("Avoid", "Sometimes", "Prefer"),
           xlab = "Sun exposure",
           ylab = expression(paste("Predicted ", log[10], "(vitamin D)")))

points(countries[["Denmark"]]$pred ~ countries[["Denmark"]]$sunexp,
       type = "b",
       pch = 4)  
points(countries[["Finland"]]$pred ~ countries[["Finland"]]$sunexp,
       type = "b",
       pch = 2)
points(countries[["Ireland"]]$pred ~ countries[["Ireland"]]$sunexp,
       type = "b",
       pch = 19)
points(countries[["Poland"]]$pred ~ countries[["Poland"]]$sunexp,
       type = "b",
       pch = 21)
       
legend("topright", c("Denmark","Finland","Ireland","Poland"), pch=c(4,2,19,21),inset=c(-0.3,0))






#VLIVNA POZOROVANI

plot(cooks.distance(model4) ~ women$bmi,
     xlab = "BMI",
     ylab = "Cook's distance",
     pch = countrypch[women$country])
legend("topright", c("Denmark","Finland","Ireland","Poland"), pch=c(4,2,19,21))

women[cooks.distance(model4)>0.1,]




##### FINALNI MODEL

model <- lm(log10(vitd) ~ country + bmi + country:log10(vitdintake) -1 ,
            data = women)
summary(model)

## Effect of vitamin D intake
print(b <- confint(model))

## Effect of doubling the intake 
2^model$coef[6:9]

## and confidence intervals obtained by transformation of CI for b
2^b[6:9,]