# LOGISTICKA REGRESE


#nacteni souboru
heartdisease <- read.table("heartdisease.txt", header=TRUE,sep=",")

###   A retrospective sample of males in a heart-disease high-risk region
###   of the Western Cape, South Africa. There are roughly two controls per
###   case of CHD. Many of the CHD positive men have undergone blood
###   pressure reduction treatment and other programs to reduce their risk
###   factors after their CHD event. In some cases the measurements were
###   made after these treatments. These data are taken from a larger
###   dataset, described in  Rousseauw et al, 1983, South African Medical
###   Journal. 
###   
###   sbp		systolic blood pressure
###   tobacco		cumulative tobacco (kg)
###   ldl		low densiity lipoprotein cholesterol
###   adiposity
###   famhist		family history of heart disease (Present, Absent)
###   typea		type-A behavior
###   obesity
###   alcohol		current alcohol consumption
###   age		age at onset
###   chd		response, coronary heart disease

# jednoduchy prehled promennych
pairs(heartdisease)

##


# jednoduchy model se spojitou promennou
model1 <- glm(chd ~ tobacco,
             family = binomial,
             data = heartdisease)

summary(model1)

#odhad ODDS RATIO
exp(coef(model1)[2])


## znazorneni modelu
newTob <- data.frame(tobacco=seq(0,40,.1))
predicted.probability <- predict(model1,newTob,type="resp")

plot(predicted.probability ~ newTob$tobacco,type="l",
	xlab="Tobacco",
	ylab="Prob(CHD)",
	ylim=c(0,1))

points(heartdisease$tobacco,heartdisease$chd)

# princip Goodness of Fit

#kategorizace prediktoru
Tob.group <- cut(heartdisease$tobacco,c(-1,2,4,6,8,10,15,20,100))
tb <- table(Tob.group, heartdisease$chd)
rel.freq <- prop.table(tb,1)[,2]
#relativni frekvence vysledku v kategoriich
points(rel.freq~c(1,3,5,7,9,12.5,17.5,25),pch=5)

## Goodness of Fit statistic

#library(Design) # v knihovne Franka Harrella
library(rms) # v knihovne Franka Harrella
des.model1 <- lrm(chd ~ tobacco, x=TRUE, y=TRUE, data=heartdisease)
resid(des.model1, 'gof')

## OVERENI PREDPOKLADU MODELU

#zakladni rezidua ve zobecnenem linearnim modelu

plot(residuals(model1, type = "pearson") ~ heartdisease$tobacco,
     xlab = "Tobacco",
     ylab = "Pearson residual"
     )

#radeji s vyhlazenim
scatter.smooth(residuals(model1, type = "pearson") ~ heartdisease$tobacco,
     xlab = "Tobacco",
     ylab = "Pearson residual"
     )

scatter.smooth(residuals(model1, type = "deviance") ~ heartdisease$tobacco,
     xlab = "Tobacco",
     ylab = "Deviance residual"
     )

#pakove body
plot(hatvalues(model1)~fitted(model1),
     xlab = "Predicted probability",
     ylab = "Leverage"
)
points(fitted(model1)[heartdisease$chd==1],hatvalues(model1)[heartdisease$chd==1], col="red")
#bez ohledu na skutečný výsledek, váha pozorování s predikovanou vysokou nebo nízkou pravděpodobností je nízká

#Cookovy vzdalenosti
plot(cooks.distance(model1)~fitted(model1),
		xlab = "Predicted probability",
    ylab = "Cookova vzdálenost"
)
points(fitted(model1)[heartdisease$chd==1],cooks.distance(model1)[heartdisease$chd==1], col="red")

heartdisease[cooks.distance(model1) > 0.05,]
boxplot(heartdisease$tobacco~heartdisease$chd)


## kategorialni prediktor

tobacco.kat <- cut(heartdisease$tobacco,c(-1,0,Inf),right=T,labels=c("Nekurak","Kurak"))

model2 <- glm(chd ~ tobacco.kat,
             family = binomial,
             data = heartdisease)

summary(model2)

## kategorialni + spojity prediktor

model3 <- glm(chd ~ tobacco.kat+sbp,
             family = binomial,
             data = heartdisease)

summary(model3)


## analyza deviance, se statistickym testem

# srovnani modelu a submodelu

anova(model2,model3,test="Chisq")