#Prevzato z knihy: Andersen, P. K., Skovgaard, L. T. (2010) Regression with Linear Predictors, Springer.\ ### Example concerning body mass index and vitamin D status. ### ### The data presented here come from a large study on vitamin D status in ### four European countries, conducted by Rikke Andersen, ### Fodevaredirektoratet, Denmark. The data provide age, body mass index ### and vitamin D status for 420 women from 4 European countries. Body ### mass index (BMI) is a height corrected weight measure defined as ### weight (in kilos) divided by height (in meters) squared. Vitamin D ### status is given via a measurement of 25-hydroxy-vitamin D (25OHD) in ### serum and among the questions to be addressed using these data is ### whether the 25OHD-level depends on BMI and on age. ### ### Variable list: ### ### age age of the individual ### ### bmi body mass index ### ### country country of residence ### 1: Denmark ### 2: Finland ### 4: Ireland ### 6: Poland ### ### category 1: girls ### 2: women ### ### vitd the level of vitamin D, serum-25 hydroxi ### S25OHD ### ### sunexp sun exposure ### 1: avoid sun ### 2: sometimes ### 3: prefer sun ### ### vitdintake Vitamin D intake #### #priprava dat vitaminD <- read.csv2("http://staff.pubhealth.ku.dk/~linearpredictors/datafiles/VitaminD.csv", sep = ";", dec = ".", header = TRUE, colClasses = ## "country","category","vitd", "age","bmi","sunexp","vitdintake" c("factor", "factor","numeric","numeric","numeric","factor","numeric"), na.strings="." ) vitaminD <- within(vitaminD, { country <- factor(country,levels=c(1,2,4,6),labels=c("Denmark","Finland","Ireland","Poland")) category <- factor(category,levels=c(1,2),labels=c("girls","women")) sunexp <- factor(sunexp,levels=c(1,2,3),labels=c("avoid sun","sometimes","prefer sun")) }) ## omitting individuals considered underweight vitaminD <- vitaminD[vitaminD$bmi >= 18.5,] #### #women women <- subset(vitaminD, category == "women") ## number of women from each country table(women$country) ## countrywise medians of variables countrymedians <- function(x)tapply(x, women$country, median) apply(subset(women, select = c(vitd, age, bmi, vitdintake)), 2, countrymedians) #Polsko je ponekud specificke - malo vitaminu, hodne BMI ## Irish women irlwomen <- subset(vitaminD, vitaminD$country == "Ireland" & vitaminD$category == "women") ### #A priori overeni normality rezidui na jednoduchem modelu ## Factor with levels corresponding to normal resp. overweight ## 1 = greater or equal to 25 irlwomen$bmigroup2 <- cut(irlwomen$bmi, c(0,25,Inf), right = F, labels = 0:1) ## Linear regression model <- lm(vitd ~ bmi, data = irlwomen) par(mfrow=c(2,2)) resrange <- c(-30,60) qqnorm(model$residuals[irlwomen$bmigroup2 == 0], ylab = "Residual, normal weight women", xlab = "Normal quantiles", main = "", ylim = resrange) abline(0, sqrt(var(model$residuals[irlwomen$bmigroup2 == 0])), lty = "21") qqnorm(model$residuals[irlwomen$bmigroup2 == 1], ylab = "Residual, overweight women", xlab = "Normal quantiles", main = "", ylim = resrange) abline(0, sqrt(var(model$residuals[irlwomen$bmigroup2 == 1])), lty = "21") qqnorm(model$residuals, ylab = "Residual, all women", xlab = "Normal quantiles", main = "", ylim = resrange) abline(0, sqrt(var(model$residuals)), lty = "21") #prozkoumani rozlozeni rezidui hist(model$residuals, freq = FALSE, yaxp = c(0,0.04,4), xlim = c(-60,60), breaks = seq(-57.5, 57.5, 5), main = "", xlab = "Residual" ) box() curve(dnorm(x, mean = mean(model$residuals), sd = sd(model$residuals)), add = TRUE ) ## Linear regression - TRANSFORMATION model <- lm(log10(vitd) ~ bmi, data = irlwomen) par(mfrow=c(2,2)) resrange <- c(-0.4,0.4) qqnorm(model$residuals[irlwomen$bmigroup2 == 0], ylab = "Residual, normal weight women", xlab = "Normal quantiles", main = "", ylim = resrange) abline(0, sqrt(var(model$residuals[irlwomen$bmigroup2 == 0])), lty = "21") qqnorm(model$residuals[irlwomen$bmigroup2 == 1], ylab = "Residual, overweight women", xlab = "Normal quantiles", main = "", ylim = resrange) abline(0, sqrt(var(model$residuals[irlwomen$bmigroup2 == 1])), lty = "21") qqnorm(model$residuals, ylab = "Residual, all women", xlab = "Normal quantiles", main = "", ylim = resrange) abline(0, sqrt(var(model$residuals)), lty = "21") hist(model$residuals, freq = FALSE, breaks = seq(-0.4,0.4,length.out = 10), main = "", xlab = "Residual" ) box() curve(dnorm(x, mean = mean(model$residuals), sd = sd(model$residuals)), add = TRUE ) #POUZIJEME TEDY PRO MODELOVANI LOGARITMICKOU TRANSFORMACI ################ #adjustace STATEM ## Poland is used as reference group in the linear regressions ## marginalni a adjustovane koeficienty model1 <- lm(log10(vitd) ~ relevel(country, ref = "Poland"), data = women) summary(model1) round(confint(model1), digits = 3) model2 <- lm(log10(vitd) ~ relevel(country, ref = "Poland") + bmi, data = women) summary(model2) round(confint(model2), digits = 3) drop1(model2,test="F") #porad jsme to moc nevyresili, staty skryvaji hodne variability ################################## #prediktory - marginalni vztah #spojite par(mfrow = c(2,2)) scatter.smooth(log10(women$vitd) ~ women$bmi, degree = 1, xlab = "BMI", ylab = expression(paste(log[10],"(vitamin D)"))) scatter.smooth(log10(women$vitd) ~ women$age, degree = 1, xlab = "Age", ylab = expression(paste(log[10],"(vitamin D)"))) scatter.smooth(log10(women$vitd) ~ women$vitdintake, degree = 1, xlab = "Vitamin D intake", ylab = expression(paste(log[10],"(vitamin D)"))) scatter.smooth(log10(women$vitd) ~ log10(women$vitdintake), degree = 1, xlab = expression(paste(log[10],"(vitamin D intake)")), ylab = expression(paste(log[10],"(vitamin D)"))) #kategorialni par(mfrow = c(1,2)) boxplot(log10(vitd)~sunexp, xlab="Sun habits", ylab="log10 Vitamin D", data=women) boxplot(bmi~sunexp, xlab="Sun habits", ylab="BMI", data=women) ## dve kategorialni t <- table(women$sunexp, women$country, dnn = c("Sun habits", "Country")) addmargins(t, margin = 1) ## the same table, but now as percentages rather than counts apply(t, 2, function(x)x/sum(x)*100) #GRAFICKY KONTINGENCNI TABULKA par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE) barplot(prop.table(table(women$sunexp, women$country),2),legend.text=c("Avoid","Sometimes","Prefer"),args.legend=list(x="topright",inset=c(-0.3,0))) ################################# ## ROZSIRENY MODEL ## poland as reference level women$country <- relevel(women$country, ref="Poland") ## prefer sun sometimes (sunexp = 2) as reference level women$sunexp <- relevel(women$sunexp, ref="sometimes") ## Linear regression on all variables model1 <- lm(log10(vitd) ~ bmi + age + log10(vitdintake) + sunexp + country, data = women) summary(model1) ## removing age model2 <- lm(log10(vitd) ~ bmi + log10(vitdintake) + sunexp + country, data = women) summary(model2) drop1(model2,test="F") ## removing sun habits model3 <- lm(log10(vitd) ~ bmi + log10(vitdintake) + country, data = women) summary(model3) drop1(model3,test="F") #porad jsme to moc nevyresili, staty skryvaji hodne variability ################################# ## ANALYZA REZIDUI #abecedni poradi women$country <- relevel(women$country, ref="Ireland") women$country <- relevel(women$country, ref="Finland") women$country <- relevel(women$country, ref="Denmark") ## attaching residuals to dataset women$residuals <- model3$residuals ## split into countrywise datasets countries <- split(women, women$country) levels <- c("Denmark", "Finland", "Ireland", "Poland") countrypch <- c(4, 2, 19, 21) par(mfrow = c(3,2)) ## Residuals vs country stripchart(residuals ~ relevel(country, ref = "Denmark"), data = women, vertical = T, xlim = c(0.5,4.5), ylim = c(-0.9, 0.5), yaxp = c(-0.8,0.4,3), method = "jitter", xlab = "Country", ylab = "Residual", group.names = levels, pch = countrypch) abline(h = 0, lty = "22") ## Residuals vs bmi, smoothers countrywise scatter.smooth(women$residuals ~ women$bmi, pch = countrypch[women$country], ylim = c(-0.9, 0.5), yaxp = c(-0.8,0.4,3), xlab = "BMI", ylab = "Residual", degree = 1 ) ## Residuals vs vitamin D intake, and smoother scatter.smooth(residuals(model3) ~ log10(women$vitdintake), pch = countrypch[women$country], degree = 1, ylim = c(-0.9, 0.5), yaxp = c(-0.8,0.4,3), xlab = expression(paste(log[10], "(vitamin D intake)")), ylab = "Residual" ) ## residuals vs sun exposure stripchart(countries[["Denmark"]]$residuals ~ countries[["Denmark"]]$sunexp, vertical = T, xlim = c(0.5,3.5), ylim = c(-0.9, 0.5), yaxp = c(-0.8,0.4,3), xlab = "Sun exposure", ylab = "Residual", method = "jitter", pch = 4) stripchart(countries[["Finland"]]$residuals ~ countries[["Finland"]]$sunexp, vertical = T, method = "jitter", add = T, pch = 2) stripchart(countries[["Ireland"]]$residuals ~ countries[["Ireland"]]$sunexp, vertical = T, method = "jitter", add = T, pch = 19) stripchart(countries[["Poland"]]$residuals ~ countries[["Poland"]]$sunexp, vertical = T, method = "jitter", add = T, pch = 21) abline(h = 0, lty = "11") ## residuals vs age scatter.smooth(women$residuals ~ women$age, pch = countrypch[women$country], ylim = c(-0.9, 0.5), yaxp = c(-0.8,0.4,3), xlab = "Age", ylab = "Residual" ) ## Residuals vs predicted values of log10(Vitamin D) plot(residuals(model3) ~ predict(model3), pch = countrypch[women$country], xlab = expression(paste("Predicted value of ", log[10], "(vitamin D)")), ylab = "Residual", ylim = c(-0.9, 0.5), yaxp = c(-0.8,0.4,3) ) abline(h = 0, lty = "11") #velka rezidua women[women$residuals < -0.5,] ################################# ## HLEDANI INTERAKCI ### country-vitdintake par(mfrow = c(1,2)) ## log10(vitaminD) vs log10(vitdintake) ## smoothers countrywise plot(log10(vitd) ~ log10(vitdintake), pch = countrypch[country], data = women, xlab = expression(paste(log[10], "(vitamin D intake)")), ylab = expression(paste(log[10], "(vitamin D)")), col = "grey" ) ## for smoothing on a subdataset f <- function(d){ newvitdintake <- seq(min(d$vitdintake), max(d$vitdintake), length.out = 100) lines(log10(newvitdintake), predict(loess(log10(vitd) ~ log10(vitdintake), data = d), data.frame(vitdintake = newvitdintake)) ) } lapply(countries, f) ## Residuals vs bmi ## smoothers countrywise plot(residuals(model3) ~ log10(vitdintake), pch = countrypch[country], data = women, ylim = c(-0.9, 0.5), yaxp = c(-0.8,0.4,3), xlab = expression(paste(log[10], "(vitamin D intake)")), ylab = "Residual", col = "grey" ) ## for smoothing on a subdataset f <- function(d){ newvitdintake <- seq(min(d$vitdintake), max(d$vitdintake), length.out = 100) lines(log10(newvitdintake), predict(loess(residuals ~ log10(vitdintake), data = d), data.frame(vitdintake = newvitdintake)) ) } lapply(countries, f) ### country-sunexp median.logintake <- median(log10(women$vitdintake)) median.intake <- median((women$vitdintake)) median.bmi <- median(women$bmi) ## linear regression, where bmi and log10(vitamin D intake) are centered around the medians, ## resulting in more interpretable parameters model4 <- lm(log10(vitd) ~ I(bmi-median.bmi) + country:I(log10(vitdintake)-median.logintake) + country:sunexp + country-1, data = women) summary(model4) drop1(model4,test="F") ## graf interakci # predpoved pro ruzne moznosti newdata <- expand.grid(sunexp = factor(c("avoid sun","sometimes","prefer sun"),levels=c("avoid sun","sometimes","prefer sun")), country = factor(c("Denmark","Finland","Ireland","Poland")), bmi = median.bmi, vitdintake = median(women$vitdintake)) newdata <- data.frame(newdata) newdata$pred <- predict(model4, newdata = newdata) # vykresleni predpovedi countries <<- split(newdata, newdata$country) par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE) stripchart(newdata$pred ~ newdata$sunexp, xlim = c(0.75,3.25), vertical = T, col = "white", ## not plotting group.names = c("Avoid", "Sometimes", "Prefer"), xlab = "Sun exposure", ylab = expression(paste("Predicted ", log[10], "(vitamin D)"))) points(countries[["Denmark"]]$pred ~ countries[["Denmark"]]$sunexp, type = "b", pch = 4) points(countries[["Finland"]]$pred ~ countries[["Finland"]]$sunexp, type = "b", pch = 2) points(countries[["Ireland"]]$pred ~ countries[["Ireland"]]$sunexp, type = "b", pch = 19) points(countries[["Poland"]]$pred ~ countries[["Poland"]]$sunexp, type = "b", pch = 21) legend("topright", c("Denmark","Finland","Ireland","Poland"), pch=c(4,2,19,21),inset=c(-0.3,0)) #VLIVNA POZOROVANI plot(cooks.distance(model4) ~ women$bmi, xlab = "BMI", ylab = "Cook's distance", pch = countrypch[women$country]) legend("topright", c("Denmark","Finland","Ireland","Poland"), pch=c(4,2,19,21)) women[cooks.distance(model4)>0.1,] ##### FINALNI MODEL model <- lm(log10(vitd) ~ country + bmi + country:log10(vitdintake) -1 , data = women) summary(model) ## Effect of vitamin D intake print(b <- confint(model)) ## Effect of doubling the intake 2^model$coef[6:9] ## and confidence intervals obtained by transformation of CI for b 2^b[6:9,]