# Packages
library(psych)
library(haven)
library(tidyverse)

df <- read_sav("data/hw_inventory.sav")
meta <- read_rds("data/hw_metadata.rds")

glimpse(df)
names(df)

# Creating new variables (columns) with mutate() ----------------------------
df <- mutate(
  df,
  sr_height_m = sr_height_cm / 100,
  bmi = sr_weight_kg / sr_height_m^2
)
select(df, sr_height_m, bmi)

# An alternative to mutate() is transmute()
transmute(
  df,
  sr_height_m = sr_height_cm / 100,
  bmi = sr_weight_kg / sr_height_m^2
)

# If we want to save the changes
df <- df %>%
  mutate(bmi = sr_weight_kg / (sr_height_cm / 100)^2)

# Converting selected variables to categorical variables (factors)
summary(df$gender)
summary(df[["gender"]])
summary(df$sr_wcat)

df <- df %>%
  mutate(
    gender = as_factor(gender),
    sr_wcat = as_factor(sr_wcat)
  )

summary(df$gender)
summary(df$sr_wcat)

# List of items: building the item names step by step
c("hi_01", "hi_02")
str_c("hi_", 1:26)
str_pad(1:26, width = 2, pad = "0")
str_c("hi_", str_pad(1:26, width = 2, pad = "0"))
str_c("wi_", str_pad(1:22, width = 2, pad = "0"))
str_c("rses_", str_pad(1:10, width = 2, pad = "0"))

# Item names of each scale, keyed by the scale prefix
items <- list(
  hi = str_c("hi_", str_pad(1:26, width = 2, pad = "0")),
  wi = str_c("wi_", str_pad(1:22, width = 2, pad = "0")),
  rses = str_c("rses_", str_pad(1:10, width = 2, pad = "0"))
)
items

# Reverse-keyed items
rev_items <- c(
  str_c("hi_", str_pad(14:26, width = 2, pad = "0")),
  str_c("wi_", str_pad(12:22, width = 2, pad = "0")),
  str_c("rses_", str_pad(c(2, 5, 6, 8, 9), width = 2, pad = "0"))
)

# Inter-item correlations
df %>%
  select(all_of(items$hi)) %>%
  cor(use = "pairwise.complete.obs") %>%
  round(digits = 2)

# Recoding the reverse-keyed items
# NOTE(review): `5 - x` presumably assumes a 1-4 response scale for every
# recoded item (hi, wi and rses alike) -- confirm against the codebook.
df %>%
  mutate(
    hi_14 = 5 - hi_14,
    hi_15 = 5 - hi_15
  )

?across

df %>%
  transmute(across(hi_14:hi_26, ~ 5 - .x))
df %>%
  transmute(across(hi_14:hi_26, ~ 5 - .x, .names = "{.col}_rec"))

# Keep a copy with the original (not yet reversed) item responses
df_old <- df
df <- df %>%
  mutate(across(all_of(rev_items), ~ 5 - .x))

# Item analysis
df %>%
  select(all_of(items$hi)) %>%
  psych::alpha()

df %>%
  select(all_of(items$hi)) %>%
  psych::omega(nfactors = 2)

# More complex recoding
# BMI categories:
#   Below 18.5       Underweight
#   18.5 - 24.99     Healthy Weight
#   25.0 - 29.99     Overweight
#   30.0 and above   Obesity
summary(df$bmi)
bmi_class <- c("Underweight", "Healthy Weight", "Overweight", "Obesity")

# Inspect cases whose BMI falls outside the 15-40 range
df %>%
  filter(!between(bmi, 15, 40)) %>%
  select(id, gender, age, sr_height_cm, sr_weight_kg, bmi) %>%
  arrange(bmi)

# Preview the categorisation; case_when() uses the first matching condition,
# so each upper bound implicitly excludes the categories above it
df %>%
  mutate(
    bmi_cat = case_when(
      bmi < 18.5 ~ bmi_class[1],
      bmi < 25 ~ bmi_class[2],
      bmi < 30 ~ bmi_class[3],
      bmi >= 30 ~ bmi_class[4],
      TRUE ~ NA_character_
    ) %>%
      factor(levels = bmi_class)
  ) %>%
  select(bmi, bmi_cat) %>%
  print(n = 30)

# Save the categorised BMI as a factor with levels ordered as in bmi_class
df <- df %>%
  mutate(
    bmi_cat = case_when(
      bmi < 18.5 ~ bmi_class[1],
      bmi < 25 ~ bmi_class[2],
      bmi < 30 ~ bmi_class[3],
      bmi >= 30 ~ bmi_class[4],
      TRUE ~ NA_character_
    ) %>%
      factor(levels = bmi_class)
  )

# Dichotomise the hi items: responses <= 2 become 0, responses >= 3 become 1
df %>%
  transmute(across(
    all_of(items$hi),
    ~ case_when(
      .x <= 2 ~ 0L,
      .x >= 3 ~ 1L,
      TRUE ~ NA_integer_
    )
  ))

# Suspicious cases
# df_old holds the responses before reverse-coding, so a respondent who gave
# the same raw answer to every hi item can be detected here
df_old %>%
  filter(if_all(hi_01:hi_26, ~ .x == 4))
df_old %>%
  filter(if_all(hi_01:hi_26, ~ .x == 1))

# NOTE(review): presumably the case flagged by the checks above -- confirm
df <- df %>%
  filter(id != 2184)

# Computing total scores
df %>%
  transmute(total_hi = hi_01 + hi_02 + hi_03)

df %>%
  transmute(total_hi = pick(all_of(items$hi)) %>% rowMeans())

df %>%
  transmute(total_hi = pick(all_of(items$hi)) %>% rowMeans(na.rm = TRUE))

# Mean of the available items, set to NA when more than one item is missing
df %>%
  transmute(
    total_hi = pick(all_of(items$hi)) %>% rowMeans(na.rm = TRUE),
    na_hi = pick(all_of(items$hi)) %>% is.na() %>% rowSums(),
    total_hi = if_else(na_hi > 1, NA_real_, total_hi)
  )

# Row-wise mean of a set of columns that tolerates a limited number of
# missing values.
#
# The columns are collected with pick(), so this helper is meant to be called
# inside a dplyr data-masking verb such as mutate() or transmute().
#
# Arguments:
#   ...     Column selection passed on to pick().
#   max_na  Largest number of missing values allowed in a row; rows with more
#           missing values get NA. Defaults to 0 (complete rows only).
#
# Returns a numeric vector with one value per row: the mean of the
# non-missing selected columns, or NA.
row_mean <- function(..., max_na = 0) {
  # Select the columns we want
  data <- pick(...)

  # Count the missing values (number of NAs) in each row
  n_missing <- rowSums(is.na(data))

  # Compute the total score as the mean of the items in each row
  out <- rowMeans(data, na.rm = TRUE)

  # Replace the total score with a missing value if n_missing > max_na
  out[n_missing > max_na] <- NA_real_

  # A row with no observed value yields NaN from rowMeans(na.rm = TRUE);
  # report it as a regular missing value even when max_na would allow it
  out[is.nan(out)] <- NA_real_

  out
}

df %>%
  transmute(total_hi = row_mean(all_of(items$hi), max_na = 0))
df %>%
  transmute(total_hi = row_mean(all_of(items$hi), max_na = 1))

df <- df %>%
  mutate(total_hi = row_mean(all_of(items$hi), max_na = 2))

# Summary statistics with summarise() ---------------------------------------
df %>%
  summarise(M_height = mean(sr_height_cm))
df %>%
  summarise(M_height = mean(sr_height_cm, na.rm = TRUE))

# How we name the newly computed statistic is up to us.
df %>%
  summarise(průměrná_výška = mean(sr_height_cm, na.rm = TRUE))

# This function is, however, more useful in combination with group_by()
by_gender <- group_by(df, gender)
by_gender

by_gender %>%
  summarise(M_height = mean(sr_height_cm, na.rm = TRUE))

# Because we often need to split the dataset in different ways, and creating
# a new, differently grouped dataset each time (such as by_gender here) is
# rather wasteful, we usually use the pipe (%>%) to chain several operations
# together.
df %>%                                        # dataset used
  group_by(gender, sr_wcat) %>%               # group by
  summarise(                                  # compute statistics
    M_bmi = mean(bmi, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(sr_wcat, gender)                    # sort by

df %>%
  group_by(gender, sr_wcat) %>%
  summarise(
    M_bmi = mean(bmi, na.rm = TRUE),
    SD_bmi = sd(bmi, na.rm = TRUE),
    n = n(),
    .groups = "drop"
  ) %>%
  arrange(sr_wcat, gender)

# Same table, restricted to rows with gender, sr_wcat and bmi all observed
df %>%
  drop_na(gender, sr_wcat, bmi) %>%
  group_by(gender, sr_wcat) %>%
  summarise(
    M_bmi = mean(bmi, na.rm = TRUE),
    SD_bmi = sd(bmi, na.rm = TRUE),
    n = n(),
    .groups = "drop"
  ) %>%
  arrange(sr_wcat, gender)

# Multiple variables
df %>%
  group_by(gender) %>%
  summarise(across(
    c(sr_height_cm, sr_weight_kg, bmi),
    ~ mean(.x, na.rm = TRUE),
    .names = "M_{.col}"
  ))

df %>%
  select(gender, sr_height_cm, sr_weight_kg, bmi)

df %>%
  select(id, gender, sr_height_cm, sr_weight_kg, bmi)

# Reshape to long format: one row per respondent and variable
df %>%
  select(id, gender, sr_height_cm, sr_weight_kg, bmi) %>%
  pivot_longer(
    cols = c(sr_height_cm, sr_weight_kg, bmi),
    names_to = "variable",
    values_to = "value"
  )

df %>%
  select(id, gender, sr_height_cm, sr_weight_kg, bmi) %>%
  pivot_longer(
    cols = c(sr_height_cm, sr_weight_kg, bmi),
    names_to = "variable",
    values_to = "value"
  ) %>%
  drop_na(gender, value)

# Descriptive statistics per gender and variable; missing values were removed
# by drop_na() above, so the summaries need no na.rm
df %>%
  select(id, gender, sr_height_cm, sr_weight_kg, bmi) %>%
  pivot_longer(
    cols = c(sr_height_cm, sr_weight_kg, bmi),
    names_to = "variable",
    values_to = "value"
  ) %>%
  drop_na(gender, value) %>%
  group_by(gender, variable) %>%
  summarise(
    M = mean(value),
    SD = sd(value),
    Min = min(value),
    q1 = quantile(value, probs = .25),
    Mdn = median(value),
    q3 = quantile(value, probs = .75),
    Max = max(value),
    skew = psych::skew(value),
    kurt = psych::kurtosi(value),
    .groups = "drop"
  )

# Frequencies of weight category by gender
df %>%
  drop_na(gender, sr_wcat) %>%
  count(gender, sr_wcat)

# Proportions of weight categories within each gender
df %>%
  drop_na(gender, sr_wcat) %>%
  count(gender, sr_wcat) %>%
  group_by(gender) %>%
  mutate(p = n / sum(n)) %>%
  ungroup()