# Načtení balíčků a dat --------------------------------------------------

library(betaDelta)
library(car)
library(broom)
library(tidyverse)

theme_set(
  theme_classic(base_size = 15)
)


df <- read_rds("https://is.muni.cz/go/ccaz4r")


# Odhad regresních modelů ---------------------------------------------

# Budeme predikovat depresivitu na základě pohlaví, věku, průměru známek,
# důvěry ve vrstevníky/rodiče a optimismu


# Výběr proměnných a vyčištění dat
df_clean <- df %>% 
  select(id, skola, trida, deprese, pohlavi, vekr, 
         gpa, duv_v, duv_r, selfe, optim) %>% 
  drop_na()

# Odhad prvního modelu pomocí funkce lm() jako "linear model"
fit_1 <- lm(deprese ~ pohlavi + vekr, data = df_clean)

fit_1 # pouze rovnice a koeficienty
summary(fit_1) # Celkový přehled modelu

# Odhad druhého modelu
fit_2 <- lm(deprese ~ pohlavi + vekr + gpa + duv_v + duv_r + selfe + optim, 
             data = df_clean) 
summary(fit_2) 

# Přírůstek R2
summary(fit_1)$r.squared
summary(fit_2)$r.squared
summary(fit_2)$r.squared - summary(fit_1)$r.squared

# Srovnání obou modelů pomocí funkce anova()
# nulovou hypotézou je nulový přírůstek vysvětleného rozptylu
anova(fit_1, fit_2)

# Standardizované regresní koeficienty
betaDelta::BetaDelta(fit_2, alpha = .05)

# Pro základní grafickou diagnostiku modelu stačí na fit-objekt použít funkci
# plot()
plot(fit_2)

plot(fit_2, which = 1, id.n = 10) # Residuals vs. fitted
plot(fit_2, which = 2, id.n = 10) # QQ-plot of residuals
plot(fit_2, which = 3, id.n = 10) # Absolute residuals vs. fitted
plot(fit_2, which = 4, id.n = 10) # Cook's distances
plot(fit_2, which = 5, id.n = 10) # Residuals vs. leverage
plot(fit_2, which = 6, id.n = 10) # Cook's distances vs leverage


# Uložení reziduí
residuals(fit_2) %>% 
  head()

res <- tibble(resid = residuals(fit_2),
              std_resid = as.double(scale(resid)))

res 

# Histogram reziduí
res %>% 
  ggplot() +
  geom_histogram(aes(x = std_resid,
                     y = after_stat(density)),
                 color = "black", fill = "grey") +
  stat_function(fun = dnorm,
                color = "red",
                linewidth = 1) +
  coord_cartesian(xlim = c(-3.5, 3.5))

# Q-Q graf reziduí
res %>% 
  ggplot(aes(sample = std_resid)) +
  geom_qq_line(color = "red",
               linewidth = 1) +
  geom_qq(size = 3,
          alpha = .2)


# Funkce balíčku car -----------------------------------------

# Residual plots (vs predictor values)
car::residualPlots(fit_2)

# Component+residual plots
car::crPlots(fit_2)

# Added-variable, also called partial-regression plots
car::avPlots(fit_2)

# variance inflation factor
car::vif(fit_2)

# Durbin-watson test (autokorelace reziduí)
car::durbinWatsonTest(fit_2)


# Funkce balíčku broom ----------------------------------------

# balíček broom nabízí funkce tidy() a glance() pro zobrazení modelu
# v jedné tibble
# tidy() extrahuje informace o koeficientech modelu
# glace() o celkovém fitu modelu
broom::tidy(fit_2, conf.int = TRUE, conf.level = 0.95)

broom::glance(fit_1)
broom::glance(fit_2)

bind_rows(
  m1 = broom::glance(fit_1),
  m2 = broom::glance(fit_2),
  .id = "model"
)


df_aug <- broom::augment(fit_2, data = df_clean)
head(df_aug)


# Výběr pozorování s nejvyšší Cookovou vzdáleností
df_aug %>% 
  slice_max(order_by = .cooksd, n = 10) %>% 
  print(width = Inf)

# Výběr pozorování s nejvyšším reziduem
df_aug %>% 
  slice_max(order_by = abs(.resid), n = 10) %>% 
  print(width = Inf)

# Intervaly spolehlivosti a predikční intervaly ----------------------------

# Hodnoty prediktorů, pro které chceme zjistit predikované hodnoty
new_data <- expand.grid(
  pohlavi = c("muzske", "zenske"),
  vekr = 13,
  gpa = 2,
  duv_v = 2.7, 
  duv_r = 2.8,
  selfe = seq(1, 4, length.out = 1000),
  optim = 17
) %>% as_tibble()


# Intervaly spolehlivosti
ci <- broom::augment(fit_2,
                     newdata = new_data,
                     interval = "confidence")
ci <- ci %>% 
  rename(ci_lower = .lower,
         ci_upper = .upper)
  
ci

# Graficky
ci %>% 
  ggplot(aes(x = selfe,
             y = .fitted)) +
  geom_ribbon(aes(ymin = ci_lower,
                  ymax = ci_upper),
              fill = "blue",
              alpha = .3) +
  geom_line(color = "blue") +
  facet_wrap(~pohlavi, nrow = 2) +
  coord_cartesian(ylim = c(1, 4)) +
  labs(x = "Self-esteem",
       y = "Predikovaná depresivita")

# Predikční intervaly
pi <- broom::augment(fit_2,
                     newdata = new_data,
                     interval = "prediction")
pi <- pi %>% 
  rename(pi_lower = .lower,
         pi_upper = .upper)

cpi <- ci %>% 
  left_join(pi)
cpi

cpi %>% 
  ggplot(aes(selfe, .fitted)) +
  geom_ribbon(aes(ymin = pi_lower,
                  ymax = pi_upper),
              fill = "pink",
              alpha = .5) +
  geom_ribbon(aes(ymin = ci_lower,
                  ymax = ci_upper),
              fill = "blue",
              alpha = .3) +
  geom_line(color = "blue") +
  facet_wrap(~pohlavi, nrow = 2) +
  coord_cartesian(ylim = c(0.5, 4)) +
  labs(x = "Self-esteem",
       y = "Predikovaná depresivita")


# Moderace ----------------------------------------------------------------

# Kategorická proměnná jako moderátor
math <- read_csv("https://is.muni.cz/go/oqd8xr")
math

math <- math %>% 
  mutate(gender = factor(gender,
                         levels = c(0, 1),
                         labels = c("Male", "Female")))
math
summary(math)

fit_1 <- lm(math ~ gender + training, 
            data = math)
fit_2 <- lm(math ~ gender * training, 
            data = math)

summary(fit_1)
summary(fit_2)

math %>% 
  ggplot(aes(x = training, 
             y = math,
             color = gender,
             fill = gender)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm")

# Reparametrizace modelu, pokud chceme znát tzv. simple effects
math <- math %>% 
  mutate(
    m = as.integer(gender == "Male"),
    f = as.integer(gender == "Female"),
    m_train = m*training,
    f_train = f*training
  )


fit_3 <- lm(math ~ -1 + m + f + m_train + f_train, 
            data = math)
summary(fit_3)


# Spojitá proměnná jako moderátor
depress <- read_csv("https://is.muni.cz/go/p5psox")
depress

summary(depress)

# Vycentrování prediktorů
depress <-  depress %>% 
  mutate(
    stress_cnt = stress - 5,
    support_cnt = support - 5,
  )

fit_1 <- lm(depress ~ stress_cnt + support_cnt, 
            data = depress)

fit_2 <- lm(depress ~ stress_cnt * support_cnt, 
            data = depress)

summary(fit_1)
summary(fit_2)

# Diskretizace support 
depress <- depress %>% 
  mutate(
    support_cat = cut_number(support, n = 3)
  )

depress %>% 
  ggplot(aes(stress, depress,
             color = support_cat,
             fill = support_cat)) +
  geom_jitter(size = 3) +
  geom_smooth(method = "lm")

summary(depress)

# Hodnoty prediktorů, pro které chceme zjistit predikované hodnoty
new_data <- expand.grid(
  stress_cnt = seq(-4, 5, length.out = 1000),
  support_cnt = c(-2.5, 0, 2.5)
) %>% 
  as_tibble()

ci <- broom::augment(fit_2,
                     newdata = new_data,
                     interval = "confidence")

ci <- ci %>% 
  mutate(
    group = factor(support_cnt,
                   labels = c(2.5, 5.0, 7.5))
  )


ci %>% 
  ggplot(aes(x = stress_cnt + 5,
             y = .fitted,
             color = group,
             fill = group)) +
  geom_ribbon(aes(ymin = .lower,
                  ymax = .upper),
              alpha = .3) +
  geom_line() +
  labs(x = "Stress",
       y = "Predikovaná depresivita",
       color = "Sociální podpora",
       fill = "Sociální podpora")