# Data tranformation functions ------------------------------------------ r2_diff <- function(fit1, fit2) { abs(summary(fit1)$r.squared - summary(fit2)$r.squared) } # Function to overview variable names, labels and value labels # for data imported from SPSS using haven package metadata <- function(data) { var_names <- colnames(data) atr <- map(data, attributes) var_labels <- map(atr, "label") %>% map_chr(~if_else(is.null(.x), NA_character_, .x )) value_labels <- atr %>% map(~.x$labels) %>% map_chr(~str_c(.x, " = ", names(.x), collapse = "; ")) value_labels[value_labels == " = "] <- NA_character_ tibble(var_names, var_labels, value_labels) } # Function to extract labels from vector attributes and convert to factor fct_label <- function(x) { levs <- attributes(x)$labels labs <- names(levs) factor(x, levels = levs, labels = labs) } # function to generate variable names # prefix = variable prefix # range = integer vector, # suffix = any suffix if needed, # width = add leading zeros to numeric range to keep selected width item_range <- function(prefix, range, width = NULL, suffix = NULL, with.suffix = NULL) { if (!is.null(width)) { range <- str_pad(range, width = width, pad = "0") } x <- str_c(prefix, range) if (!is.null(with.suffix)) { x[with.suffix] <- str_c(x, suffix) } else { x <- str_c(x, suffix) } return(x) } # Function to recode reversed scored items # data = dataframe # vars = variable to recode # min = minimum attainable item score # max = maximum attainable item score # drop = keep only newly created variables? reverse_score <- function(x, min, max) { (min + max) - x } # Function to compute number of missing values in a row # data = dataframe, # vars = character vector of variable (columns) names, # if NULL (default), computes number of missing values across all columns # new_col = name of the new column created # drop = keep only newly created variable? not.na <- function(x) { !is.na(x) } row_missing_n <- function(...) { cur_data() %>% select(...) %>% is.na() %>% rowSums() } row_missing_p <- function(...) { cur_data() %>% select(...) %>% is.na() %>% rowMeans() } row_valid_n <- function(...) { cur_data() %>% select(...) %>% not.na() %>% rowSums() } row_valid_p <- function(...) { cur_data() %>% select(...) %>% not.na() %>% rowMeans() } # Function to compute row means # data = dataframe, # vars = character vector of variable (column) names # new_col = name of the new column created # max_na = maximum number of missing values (otherwise returns NA) # drop = keep only newly created variable? row_mean <- function(..., min.valid = 1) { data <- cur_data() %>% select(...) n_valid <- data %>% not.na() %>% rowSums() x <- data %>% rowMeans(na.rm = TRUE) x[n_valid < min.valid] <- NA_real_ return(x) } # Data exploration -------------------------------------------------------- # Returns TRUE, if a values is flagged as outlier # x = numeric vector # method = method to identify outliers # outher arguments specify the cut-offs for outlier detection # zscore = z-score difference from the mean is_outlier <- function(x, method = c("zscore", "madmedian", "boxplot"), zscore = 2, madmed = 2.24, iqrmult = 1.5) { method <- match.arg(method) if (method == "zscore") { abs(scale(x)) > zscore } else if (method == "madmedian") { abs(x - median(x, na.rm = TRUE)) / mad(x, na.rm = TRUE) > madmed } else if (method == "boxplot") { q <- quantile(x, probs = c(.25, .75), na.rm = TRUE) iqr <- (q[2] - q[1]) * iqrmult (x < q[1] - iqr) | (x > q[2] + iqr) } else { NULL } } # * Barplots ---------------------------------------------------------------- plot_bar <- function(data, var, ..., fill = NULL, facet = NULL, add_n = TRUE, sort = TRUE) { if (sort) { data[[var]] <- fct_infreq(data[[var]]) } if (is.null(fill) && is.null(facet)) { data <- data %>% count(.data[[var]]) } else if (!is.null(fill) && is.null(facet)) { data <- data %>% count(.data[[var]], .data[[fill]]) } else if (is.null(fill) && !is.null(facet)) { data <- data %>% count(.data[[var]], .data[[facet]]) %>% group_by(.data[[facet]]) } else { data <- data %>% count(.data[[var]], .data[[fill]], .data[[facet]]) %>% group_by(.data[[facet]]) } data <- data %>% mutate( p = n/sum(n) ) if (is.null(facet)) { data_labels <- data %>% group_by(.data[[var]]) %>% summarise( p = sum(p), n = sum(n)) } else { data_labels <- data %>% group_by(.data[[var]], .data[[facet]]) %>% summarise( p = sum(p), n = sum(n)) } p <- ggplot( data, aes_string( var, "p", fill = fill)) + geom_col(...) + coord_flip() + labs(y = "Rel. Freq.") + theme(legend.position = "top") if (add_n) { p <- p + geom_text( data = data_labels, aes(label = n, fill = NULL), fontface = "bold", hjust = -.25) } if (!is.null(facet)) { p <-p + facet_wrap(vars(.data[[facet]]), labeller = label_both) } p + scale_y_continuous( labels = scales::percent_format(accuracy = 1), expand = expansion(mult = c(0, 0.2)) ) } plot_bar_mult <- function(data, var, fill = NULL, facet = NULL, add_n = TRUE, sort = TRUE) { map( var, ~plot_bar(data = data, .x, fill = fill, facet = facet, add_n = add_n, sort = sort) ) } plot_stackbar <- function(data, var, fill, ..., facet = NULL) { p <- data %>% ggplot(aes_string(var, fill = fill)) + geom_bar(..., position = "fill", color = "black") + ylab("Rel. Freq.") + coord_flip() + theme(legend.position = "top") + guides(fill = guide_legend( reverse = TRUE, byrow = TRUE, nrow = 1) ) + scale_y_continuous(labels = scales::percent_format(accuracy = 1)) if(!is.null(facet)) { p <- p + facet_wrap(vars(.data[[facet]])) } p } plot_stackbar_mult <- function(data, var, fill, facet = NULL) { map2( var, fill, ~plot_stackbar(data = data, var = .x, fill = .y, facet = facet) ) } # * Histogram ------------------------------------------------------------- plot_histogram <- function(data, var, ..., facet = NULL, geom = c("histogram", "density", "freqpoly"), scales = "fixed", transform_fun = NULL) { geom <- match.arg(geom) if (!is.null(transform_fun)) { data[[var]] <- transform_fun(data[[var]]) } p <- data %>% ggplot(aes_string(var)) switch (geom, histogram = p <- p + geom_histogram(color = "white", ...), density = p <- p + geom_density(...), freqpoly = p <- p + geom_freqpoly(...) ) if (!is.null(facet)) { p <- p + facet_wrap(vars(.data[[facet]]), scales = scales) } p } plot_histogram_mult <- function(data, var, facet = NULL, geom = c("histogram", "density", "freqpoly"), scales = "fixed", transform_fun = NULL) { map( var, ~plot_histogram(data, var = .x, facet = facet, geom = geom, scales = scales, transform_fun = transform_fun) ) } # * Boxplot -------------------------------------------------------------- plot_box <- function(data, var, groups = "var", ..., fill = NULL, facet = NULL, geom = c("boxplot", "violin"), order = TRUE) { geom = match.arg(geom) if (order && groups != "var") { p <- data %>% ggplot(aes_string( var, paste0("fct_reorder(", groups, ",", var, ", na.rm = TRUE)"), fill = fill) ) } else { p <-data %>% ggplot(aes_string(var, groups)) } switch (geom, boxplot = p <- p + geom_boxplot(...), violin = p <- p + geom_boxplot(...) ) if (!is.null(fill) && groups == fill) { p <- p + theme(legend.position = "none") } else { p <- p + theme(legend.position = "top") } if (!is.null(facet)) { p <- p + facet_wrap(vars(.data[[facet]])) } p + labs(x = var, y = groups) } plot_box_mult <- function(data, var, groups = "var", fill = NULL, facet = NULL, geom = c("boxplot", "violin"), order = TRUE) { map( var, ~plot_box(data = data, var = .x, groups = groups, fill = fill, facet = facet, geom = geom, order = order) ) } # * Scatterplot ----------------------------------------------------------- plot_scatter <- function(data, xvar, yvar, color = NULL, ..., geom = c("point", "jitter", "count"), alpha = 1, sample = NULL, fit_line = TRUE, method = NULL) { if (!is.null(sample)) { data <- sample_n(data, size = sample) } if (is.null(color)) { p <- data %>% ggplot(aes_string(xvar, yvar, color = color)) } else { p <- data %>% ggplot(aes_string(xvar, yvar)) } switch (geom, point = p <- p + geom_point(..., alpha = alpha), jitter = p <- p + geom_jitter(..., alpha = alpha), count = p <- p + geom_count(..., alpha = alpha) ) if (fit_line) { p <- p + geom_smooth(method = method) } return(p) } plot_scatter_mult <- function(data, xvar, yvar, geom = c("point", "jitter", "count"), alpha = 1, color = NULL, sample = NULL, fit_line = TRUE, method = NULL){ map2( xvar, yvar, ~plot_scatter( data = data, xvar = .x, yvar = .y, color = color, alpha = alpha, geom = geom, sample = sample, fit_line = fit_line, method = method ) ) } # * mosaic plots ------------------------------------------------------------ plot_mosaic <- function(data, xvar, yvar) { require(ggmosaic) if (!is.factor(data[[yvar]])) { data[[yvar]] <- factor(data[[yvar]]) } data %>% ggplot() + ggmosaic::geom_mosaic( aes_string( x = paste0("product(", yvar, ",", xvar, ")"), fill = yvar ) ) + guides(fill = guide_legend(reverse = TRUE)) } plot_mosaic_mult <- function(data, xvar, yvar) { map2( xvar, yvar, ~plot_mosaic(data = data, xvar = .x, yvar = .y) ) } # Cross-tabs -------------------------------------------------------------- crosstabs <- function(df, row_var, col_var, ..., digits = 2) { freq <- df %>% select( {{row_var}}, {{col_var}} ) %>% table() row_p <- freq %>% prop.table(margin = 1) %>% (function(x) x*100) %>% round(digits = digits) col_p <- freq %>% prop.table(margin = 2) %>% (function(x) x*100) %>% round(digits = digits) tot_p <- freq %>% prop.table() %>% (function(x) x*100) %>% round(digits = digits) print(chisq.test(freq, ...)) list(n = freq, `row %` = row_p, `col %` = col_p, `tot %` = tot_p) %>% map(as.data.frame) %>% bind_rows(.id = "stat") %>% pivot_wider(names_from = {{col_var}}, values_from = Freq) } # Tests ------------------------------------------------------------------- cor_table <- function(data, use = "pairwise", method = "pearson", sig_levels = c(0.05, 0.01, 0.001), adjust = "none", conf = 0.95) { f <- function(x) format(round(x, 2), nsmall = 2) n <- map_int(data, ~sum(!is.na(.x))) M <- map_dbl(data, ~mean(.x, na.rm = TRUE)) SD <- map_dbl(data, ~sd(.x, na.rm = TRUE)) nms <- colnames(data) lgt <- length(nms) results <- psych::corr.test(data, use = use, method = method, adjust = adjust, alpha = 1 - conf) r <- results$ci$r if (adjust == "none") { ci <- results$ci[, c("lower", "upper")] p <- results$ci$p } else { ci <- results$ci.adj %>% set_names(c("lower", "upper")) p <- results$ci2$p.adj } input <- tibble(r, ci, p, stars = "") if (!is.null(sig_levels)) { input <- input %>% mutate( stars = case_when( p < sig_levels[[3]] ~ "***", p < sig_levels[[2]] ~ "** ", p < sig_levels[[1]] ~ "* ", TRUE ~ " " ) ) } m <- matrix("", nrow = lgt, ncol = lgt, dimnames = list(nms, nms)) m1 <- m m2 <- m m1[lower.tri(m1)] <- paste0(f(input$r), input$stars) if (length(results$n) == 1) { m1[upper.tri(m1)] <- results$n } else { m1[upper.tri(m1)] <- results$n[upper.tri(results$n)] } m2[lower.tri(m1)] <- paste0("[", f(input$lower),";", f(input$upper), "]") m3 <- matrix("", ncol = lgt, nrow = 2*lgt) colnames(m3) <- nms m3[c(TRUE, FALSE), ] <- m1 m3[c(FALSE, TRUE), ] <- m2 df1 <- data.frame(var = nms, n = n, M = f(M), SD = f(SD)) df2 <- data.frame(var = rep("", 2*lgt), n = rep("", 2*lgt), M = rep("", 2*lgt), SD = rep("", 2*lgt)) df2[c(TRUE, FALSE), ] <- df1 bind_cols(df2, as.data.frame(m3)) } ci_mean <- function(x, conf.level = 0.95, na.rm = TRUE) { if (na.rm) x <- x[!is.na(x)] n <- length(x) se <- sd(x) / sqrt(n) q <- (1 - conf.level) / 2 q <- 1 - q t <- qt(q, df = n - 1) M <- mean(x) lo = M - t*se up = M + t*se data.frame(M = M, Mci_low = lo, Mci_upp = up) } quantiles <- function(x, probs = c(0, .25, .5, .75, 1), na.rm = TRUE) { q <- quantile(x, probs = probs, na.rm = na.rm) data.frame( Min = q[1], Q2 = q[2], Mdn = q[3], Q3 = q[4], Max = q[5] ) } # Power ------------------------------------------------------------------- pwr_df <- function(data, fn) { out <- data %>% pmap(fn) %>% map_df(~.x[names(.x)]) meth <- out$method[[1]] web <- out$url[[1]] cat("Method: ", meth, " ", "URL: ", web, "\n") out %>% select(-method, -url) } plot_pwr <- function(data, x, y, grp = NULL, facet = NULL) { p <- data %>% ggplot(aes({{x}}, {{y}})) + theme(legend.position = "top") if (is.null(grp)) { p <- p + geom_line(size = 1) } else { p <- p + geom_line(size = 1, aes(color = as.factor({{grp}}), group = as.factor({{grp}}))) + labs(color = dplyr::select(data, {{grp}}) %>% names()) } if (!is.null(facet)) { p <- p + facet_wrap(vars({{facet}}), labeller = "label_both") } p } r2_to_f <- function(r2_full, r2_null = 0, f_squared = FALSE) { if (f_squared) { (r2_full - r2_null) / (1 - r2_full) } else { sqrt((r2_full - r2_null) / (1 - r2_full)) } } f_to_r2 <- function(f) { f^2 / (1+f^2) } peta2_to_f <- function(peta2, f_squared = FALSE) { if (f_squared) { peta2 / (1 - peta2) } else { sqrt(peta2 / (1 - peta2)) } } # Missing data ------------------------------------------------------------ miss_by_group <- function(.data, ...) { .data %>% add_n_miss() %>% group_by(...) %>% summarise( n = n(), ci_mean(n_miss_all), min = min(n_miss_all), q25 = quantile(n_miss_all, .25), mdn = median(n_miss_all), q75 = quantile(n_miss_all, .75), max = max(n_miss_all) ) } my_summarise <- function(data, group_var, summarise_var) { data %>% bind_shadow() %>% group_by(across({{ group_var }})) %>% summarise( "n_{summarise_var}" := n(), "M_{summarise_var}" := mean(.data[[summarise_var]], na.rm = TRUE), "SD_{summarise_var}" := sd(.data[[summarise_var]], na.rm = TRUE), ) }