From bf2754262650405921ccc79d15c7b16f6e4981f8 Mon Sep 17 00:00:00 2001 From: Felicia Date: Fri, 4 Jun 2021 15:11:28 +0800 Subject: [PATCH] edited the code in effsize file(effsize boot function) to account for paired and unpaired cases --- R/effectsize.R | 1376 ++++++++++++++++++++++++------------------------ 1 file changed, 694 insertions(+), 682 deletions(-) diff --git a/R/effectsize.R b/R/effectsize.R index 75cdcdf..d9ae41b 100755 --- a/R/effectsize.R +++ b/R/effectsize.R @@ -1,682 +1,694 @@ -#' Compute Effect Size(s) -#' -#' For each pair of observations in a \code{dabest} object, a desired effect -#' size can be computed. Currently there are five effect sizes available: -#' \itemize{ \item The \strong{mean difference}, given by \code{mean_diff()}. -#' \item The \strong{median difference}, given by \code{median_diff()}. \item -#' \strong{Cohen's \emph{d}}, given by \code{cohens_d()}. \item \strong{Hedges' -#' \emph{g}}, given by \code{hedges_g()}. \item \strong{Cliff's delta}, given by -#' \code{cliffs_delta()}. } -#' -#' -#' @param x A \code{dabest} object, generated by the \link[=dabest]{dabest()} -#' function. -#' -#' @param ci float, default 95. The level of the confidence intervals produced. -#' The default \code{ci = 95} produces 95\% CIs. -#' -#' @param reps integer, default 5000. The number of bootstrap resamples that -#' will be generated. -#' -#' @param seed integer, default 12345. This specifies the seed used to set the -#' random number generator. Setting a seed ensures that the bootstrap -#' confidence intervals for the same data will remain stable over separate -#' runs/calls of this function. See \link{set.seed} for more details. -#' -#' -#' -#' @return A \code{dabest_effsize} object with 10 elements. -#' -#' \describe{ -#' -#' \item{\code{data}}{ The dataset passed to \link[=dabest]{dabest()}, as a -#' \code{\link[tibble]{tibble}}. } -#' -#' \item{\code{x} and \code{y}}{ The columns in \code{data} used to plot the x -#' and y axes, respectively, as supplied to \link[=dabest]{dabest()}. These are -#' \href{https://adv-r.hadley.nz/quasiquotation.html}{quoted variables} for -#' \href{https://tidyeval.tidyverse.org/}{tidy evaluation} during the -#' computation of effect sizes. } -#' -#' \item{\code{idx}}{ The vector of control-test groupings as initially passed -#' to \link[=dabest]{dabest()}. } -#' -#' \item{\code{is.paired}}{ Whether or not the experiment consists of paired -#' (aka repeated) observations. Originally supplied to \link[=dabest]{dabest()}. } -#' -#' \item{\code{id.column}}{ If \code{is.paired} is \code{TRUE}, the column in -#' \code{data} that indicates the pairing of observations. As passed to -#' \link[=dabest]{dabest()}. } -#' -#' \item{\code{effect.size}}{ The effect size being computed. One of the -#' following: \code{c("mean_diff", "median_diff", "cohens_d", "hedges_g", -#' "cliffs_delta")}. } -#' -#' \item{\code{.data.name}}{ The variable name of the dataset passed to -#' \link[=dabest]{dabest()}. } -#' -#' \item{\code{summary}}{ A \link{tibble} with a row for the mean or median of -#' each group in the \code{x} column of \code{data}, as indicated in -#' \code{idx}. } -#' -#' \item{\code{result}}{ -#' -#' A \link{tibble} with the following 15 columns: -#' -#' \describe{ \item{control_group, test_group}{ The name of the control group -#' and test group respectively.} -#' -#' \item{control_size, test_size}{ The number of observations in the control -#' group and test group respectively. } -#' -#' \item{effect_size}{ The effect size used. } -#' -#' \item{paired}{ Is the difference paired (\code{TRUE}) or not -#' (\code{FALSE})? } -#' -#' \item{difference}{ The effect size of the difference between the two -#' groups. } -#' -#' \item{variable}{ The variable whose difference is being computed, ie. the -#' column supplied to \code{y}. } -#' -#' \item{ci}{ The \code{ci} passed to this function. } -#' -#' \item{bca_ci_low, bca_ci_high}{ The lower and upper limits of the Bias -#' Corrected and Accelerated bootstrap confidence interval. } -#' -#' \item{pct_ci_low, pct_ci_high}{ The lower and upper limits of the -#' percentile bootstrap confidence interval. } -#' -#' \item{bootstraps}{ The vector of bootstrap resamples generated. } } } -#' -#' } -#' -#' -#' @seealso \itemize{ -#' -#' \item \link[=dabest]{Loading data} for effect size computation. -#' -#' \item \link[=plot.dabest_effsize]{Generating estimation plots} after effect -#' size computation. -#' -#' \item The -#' \href{http://www.estimationstats.com/#/about-effect-sizes}{mathematical -#' definitions} and equations used to compute each effect size. -#' -#' \item The \code{ \link[effsize:effsize-package]{effsize} } package, which -#' is used under the hood to compute Cohen's \emph{d}, Hedges' \emph{g}, and -#' Cliff's delta. -#' -#' \item The \code{ \link[boot]{boot}() } and \code{ \link[boot]{boot.ci}() } -#' functions from the \code{boot} package, which generate the (nonparametric) -#' bootstrapped resamples used to compute the confidence intervals. -#' -#' } -#' -#' -#' @examples -#' # Loading data for unpaired (two independent groups) analysis. -#' petal_widths <- dabest(iris, Species, Petal.Width, -#' idx = c("setosa", "versicolor"), -#' paired = FALSE) -#' -#' -#' # Compute the mean difference. -#' mean_diff(petal_widths) -#' -#' # Plotting the mean differences. -#' mean_diff(petal_widths) %>% plot() -#' -#' @export -mean_diff <- function(x, ci = 95, reps = 5000 , seed = 12345) UseMethod("mean_diff", x) - -#' @export -mean_diff.dabest <- function(x, ci = 95, reps = 5000 , seed = 12345) { - effect_size(x, ci = ci, reps = reps, seed = seed, effect.size = "mean_diff") -} - - -#' @rdname mean_diff -#' @export -median_diff <- function(x, ci = 95, reps = 5000 , seed = 12345) UseMethod("median_diff", x) - -#' @export -median_diff.dabest <- function(x, ci = 95, reps = 5000 , seed = 12345) { - effect_size(x, ci = ci, reps = reps, seed = seed, effect.size = "median_diff") -} - - -#' @rdname mean_diff -#' @export -cohens_d <- function(x, ci = 95, reps = 5000 , seed = 12345) UseMethod("cohens_d", x) - -#' @export -cohens_d.dabest <- function(x, ci = 95, reps = 5000 , seed = 12345) { - effect_size(x, ci = ci, reps = reps, seed = seed, effect.size = "cohens_d") -} - - - -#' @rdname mean_diff -#' @export -hedges_g <- function(x, ci = 95, reps = 5000 , seed = 12345) UseMethod("hedges_g", x) - -#' @export -hedges_g.dabest <- function(x, ci = 95, reps = 5000 , seed = 12345) { - effect_size(x, ci = ci, reps = reps, seed = seed, effect.size = "hedges_g") -} - - - -#' @rdname mean_diff -#' @export -cliffs_delta <- function(x, ci = 95, reps = 5000 , seed = 12345) UseMethod("cliffs_delta", x) - -#' @export -cliffs_delta.dabest <- function(x, ci = 95, reps = 5000 , seed = 12345) { - effect_size(x, ci = ci, reps = reps, seed = seed, effect.size = "cliffs_delta") -} - - -#### Helper functions that actually compute the effect sizes. #### -mean_diff_ <- function(control, treatment, paired) { - if (identical(paired, FALSE)) return(mean(treatment) - mean(control)) - else return(mean(treatment - control)) -} - - - -median_diff_ <- function(control, treatment, paired) { - if (identical(paired, FALSE)) return(median(treatment) - median(control)) - else return(median(treatment - control)) -} - - - -cohens_d_ <- function(control, treatment, paired) { - return(effsize::cohen.d(treatment, control, paired=paired)$estimate) -} - - - -hedges_g_ <- function(control, treatment, paired) { - cd <- cohens_d_(treatment, control, paired=paired) - # not sure why I have to negate this.... - corr.factor <- -hedges_correction(treatment, control) - return(cd * corr.factor) -} - - - -cliffs_delta_ <- function(control, treatment, paired=NA) { - return(effsize::cliff.delta(treatment, control)$estimate) -} - - -#' Returns the exact Hedges' correction factor for Cohen's d. -#' @param x1 A vector of values. -#' -#' @param x2 Another vector of values. -#' -#' @return Hedges' correction for the g of x1 and x2. -#' -hedges_correction <- function(x1, x2) { - - n1 <- length(x1) - n2 <- length(x2) - - deg.freedom <- n1 + n2 - 2 - numer <- gamma(deg.freedom/2) - denom0 <- gamma((deg.freedom - 1) / 2) - denom <- sqrt((deg.freedom / 2)) * denom0 - - if (is.infinite(numer) | is.infinite(denom)) { - # Occurs when df is too large. - # Applies Hedges and Olkin's approximation. - df.sum <- n1 + n2 - denom <- (4 * df.sum) - 9 - out <- 1 - (3 / denom) - } else out <- numer / denom - - return(out) -} - - - -#' Compute the standardizers for Cohen's d -#' @param x1 A vector of values. -#' -#' @param x2 Another vector of values. -#' -#' @return Named list for pooled and average standardizers. -#' Use the pooled one for unpaired Cohens' d, and the average one -#' for paired Cohens'd d. -#' -cohen_d_standardizers <- function(x1, x2) { - control_n <- length(x1) - test_n <- length(x2) - - control_mean <- mean(x1) - test_mean <- mean(x2) - - control_var <- var(x1) # by default, use N-1 to compute the variance. - test_var <- var(x2) - - control_std <- sd(x1) - test_std <- sd(x2) - - # For unpaired 2-groups standardized mean difference. - pooled <- sqrt( - ((control_n - 1) * control_var + (test_n - 1) * test_var) / - (control_n + test_n - 2) - ) - - # For paired standardized mean difference. - average <- sqrt((control_var + test_var) / 2) - - # out <- c(pooled, average) - # names(out) <- c("pooled", "average") - - return(list(pooled = pooled, average = average)) - -} - - - - -effsize_boot <- function(data, effsize_func, R = 5000, paired = FALSE) { - - func <- match.fun(effsize_func) - - # Create stata for resampling. - s <- c(rep(1, length(data$control)), - rep(2, length(data$test))) - - - bootboot <- function(d, indicies, paired) { - c <- d[indicies[s == 1]] - t <- d[indicies[s == 2]] - - return(func(c, t, paired)) - } - - b <- boot( - c(data$control, data$test), - statistic = bootboot, - R = R, - strata = s, - paired = paired) - - return(b) - -} - - -# effsize_boot <- function(effsize, data, paired, indices) { -# control <- data$control[indices] -# test <- data$test[indices] -# -# cd <- effsize(control, test, paired) -# -# return(cd) -# } - - - -#' @importFrom boot boot boot.ci -#' @importFrom dplyr arrange bind_rows filter group_by summarize -#' @importFrom magrittr %>% -#' @importFrom rlang quo_name -#' @importFrom stringr str_interp -#' @importFrom tibble tibble -effect_size <- function(.data, ..., effect.size, ci, reps, seed) { - - #### Check object class #### - if (class(.data)[1] != "dabest") { - stop(paste( - "The object you are plotting is not a `dabest` class object. ", - "Please check again! ") - ) - } - - # Create handles for easy access to the items in `.data`. - raw.data <- .data$data - idx <- .data$idx - all.groups <- .data$.all.groups - paired <- .data$is.paired - data.name <- .data$.data.name - is.paired <- .data$is.paired - - plot.groups.sizes <- unlist(lapply(idx, length)) - - # The variables below should are quosures! - x_enquo <- .data$x - y_enquo <- .data$y - effect.size_enquo <- quo_name(effect.size) - - x_quoname <- quo_name(x_enquo) - y_quoname <- quo_name(y_enquo) - - # effect.size_enquo <- rlang::enquo(effect.size) - # effect.size_quoname <- rlang::quo_name(effect.size_enquo) - - id.col_enquo <- .data$id.column - - # Parse the effect.size. - if (effect.size == "mean_diff") { - es = mean_diff_ - summ_func = mean - summ_name = "mean" - } else if (effect.size == "median_diff") { - es = median_diff_ - summ_func = median - summ_name = "median" - } else if (effect.size == "cohens_d") { - es = cohens_d_ - summ_func = mean - summ_name = "mean" - } else if (effect.size == "hedges_g") { - es = hedges_g_ - summ_func = mean - summ_name = "mean" - } else if (effect.size == "cliffs_delta") { - es = cliffs_delta_ - summ_func = median - summ_name = "median" - } - - - - #### Loop through each comparison group. #### - result <- tibble() # To capture output. - - for (group in idx) { - - # Check the control group (`group[1]`) is in the x-column. - if (identical(group[1] %in% raw.data[[x_quoname]], FALSE)) { - - err1 <- str_interp("${group[1]} is not found") - err2 <- str_interp("in the ${x_quoname} column.") - - stop(paste(err1, err2)) - } - - # Patch in v0.2.2. - # Note how we have to unquote both the x_enquo, and the group name! - ctrl <- raw.data %>% filter(!!x_enquo == !!group[1]) - - ctrl <- ctrl[[y_quoname]] - - c <- na.omit(ctrl) - - # If ctrl is length 0, stop! - if (length(c) == 0) { - stop( - str_interp( - c("There are zero numeric observations in the group ${group[1]}.") - ) - ) - } - - # Get test groups (everything else in group), loop through them and compute - # the difference between group[1] and each group. - # Test groups are the 2nd element of group onwards. - test_groups <- group[2: length(group)] - - for (grp in test_groups) { - - # Check if the current group is in the x-column. - if (identical(grp %in% raw.data[[x_quoname]], FALSE)) { - stop( - str_interp( - "${grp} is not found in the ${x_quoname} column." - ) - ) - } - - # Patch in v0.2.2. - # Note how we have to unquote both x_enquo, and grp! - test <- raw.data %>% filter(!!x_enquo == !!grp) - test <- test[[y_quoname]] - t <- na.omit(test) - - # If current test group is length 0, stop! - if (length(t) == 0) { - stop( - str_interp( - c("There are zero numeric observations in the group ${grp}.") - ) - ) - } - - - - #### Compute bootstrap. #### - datalist <- list(control = ctrl, test = test) - - set.seed(seed) - boot_result <- effsize_boot(datalist, effsize_func = es, - R = reps, paired = paired) - - set.seed(NULL) - - # if (identical(paired, FALSE)) { - # diff <- func(t) - func(c) - # # For two.boot, note that the first vector is the test vector. - # boot <- simpleboot::two.boot(t, c, FUN = func, R = reps) - # - # } else { - # if (length(c) != length(t)) { - # stop("The two groups are not the same size, but paired = TRUE.") - # } - # paired_diff <- t - c - # diff <- func(paired_diff) - # boot <- simpleboot::one.boot(paired_diff, FUN = func, R = reps) - # } - - - - #### Compute confidence interval. #### - # check CI. - if (ci < 0 | ci > 100) { - err_string <- str_interp( - "`ci` must be between 0 and 100, not ${ci}" - ) - stop(err_string) - } - - bootci <- boot.ci(boot_result, conf = ci/100, type = c("perc", "bca")) - - - #### Save pairwise result. #### - row <- tibble( - # Convert the name of `func` to a string. - control_group = group[1], - test_group = grp, - control_size = length(c), - test_size = length(t), - effect_size = effect.size, - paired = paired, - variable = y_quoname, - difference = boot_result$t0, - ci = ci, - bca_ci_low = bootci$bca[4], - bca_ci_high = bootci$bca[5], - pct_ci_low = bootci$percent[4], - pct_ci_high = bootci$percent[5], - bootstraps = list(as.vector(boot_result$t)), - nboots = length(boot_result$t) - ) - result <- bind_rows(result, row) - } - } - - # Reset seed. - set.seed(NULL) - - - - #### Compute summaries. #### - summaries <- - raw.data %>% - filter(!!x_enquo %in% all.groups) %>% - group_by(!!x_enquo) %>% - summarize(func_quoname = summ_func(!!y_enquo)) - - colnames(summaries) <- c(x_quoname, summ_name) - - # Order the summaries by the idx. - summaries[[x_quoname]] <- - summaries[[x_quoname]] %>% - factor(all.groups, ordered = TRUE) - - summaries <- summaries %>% arrange(!!x_enquo) - - - - #### Assemble only the data used to create the plot. #### - data.out <- raw.data - - # New in v0.2.1 patch. - # Basically, the `ellipsis` package has been updated, - # and now forcats::as_factor() should only take the object to coerce. - data.out[[x_quoname]] <- forcats::as_factor(data.out[[x_quoname]]) - - data.out <- filter(data.out, !!x_enquo %in% all.groups) - - - - #### Collate output. #### - out = list( - data = data.out, - x = x_enquo, - y = y_enquo, - idx = idx, - id.column = id.col_enquo, - is.paired = is.paired, - effect.size = effect.size, - .data.name = data.name, - result = result, - summary = summaries - ) - - - - #### Append the custom class `dabest_effsize`. #### - class(out) <- c("dabest_effsize", "list") - - - - #### Return the output. #### - return(out) -} - - - -#' Print a `dabest_effsize` object -#' -#' @param x A \code{dabest_effsize} object, generated by one of the -#' \link[=mean_diff]{effect size computation} functions. -#' -#' @param ... S3 signature for generic plot function. -#' -#' @param signif_digits Integer, default 3. All numeric figures in the printed -#' output will be rounded off to this number of significant digits. -#' -#' -#' @return A summary of the effect sizes and respective confidence intervals. -#' -#' @examples -#' # Performing unpaired (two independent groups) analysis. -#' unpaired_mean_diff <- dabest(iris, Species, Petal.Width, -#' idx = c("setosa", "versicolor"), -#' paired = FALSE) %>% -#' mean_diff() -#' -#' # Display the results in a user-friendly format. -#' print(unpaired_mean_diff) -#' -#' @export -#' @importFrom rlang quo_name -#' @importFrom stringr str_interp -print.dabest_effsize <- function(x, ..., signif_digits = 3) { - - #### Check object class #### - if (class(x)[1] != "dabest_effsize") { - stop(paste( - "The object you are plotting is not a `dabest_effsize` class object. ", - "Please check again! ")) - } else { - dabest.effsize <- x - } - - #### Get results table and y var. #### - tbl <- dabest.effsize$result - var <- quo_name(dabest.effsize$y) - - #### Print greeting header. #### - print_greeting_header() - # dabest_ver <- utils::packageVersion("dabestr") - # header <- stringr::str_interp( - # "DABEST (Data Analysis with Bootstrap Estimation) v${dabest_ver}\n") - # cat(header) - # - # cat(rep('=', nchar(header) - 1), sep='') - # cat("\n\n") - - # cat(stringr::str_interp("Variable: ${var} \n\n")) - - #### Print dataset name, xvar, and yvar. #### - xvar = quo_name(dabest.effsize$x) - yvar = quo_name(dabest.effsize$y) - - cat(str_interp("Dataset : ${dabest.effsize$.data.name}\n")) - cat(str_interp("X Variable : ${xvar}\n")) - cat(str_interp("Y Variable : ${yvar}\n\n")) - - #### Print each row. #### - cat(apply(tbl, 1, printrow_, sigdig = signif_digits), - sep = "\n") - - #### Endnote about BCa. #### - cat(str_interp("${tbl$nboots[1]} bootstrap resamples.\n")) - cat("All confidence intervals are bias-corrected and accelerated.\n\n") - -} - - - -#' @importFrom stringr str_interp -printrow_ <- function(my.row, sigdig = 3) { - if (identical(my.row$paired, TRUE)) p <- "Paired" else p <- "Unpaired" - - # For labelling the y-axis. - effsizer <- list("mean difference", "median difference", - "Cohen's d", "Hedges' g", "Cliff's delta") - names(effsizer) <- c("mean_diff", "median_diff", - "cohens_d", "hedges_g", "cliffs_delta") - - line1 <- str_interp( - c( - "${p} ${effsizer[[my.row$effect_size]]} of ", - "${my.row$test_group} ", - "(n = ${my.row$test_size}) ", - "minus ${my.row$control_group} ", - "(n = ${my.row$control_size})\n" - ) - ) - - - line2 <- str_interp( - c("${signif(my.row$difference, sigdig)} ", - "[${signif(my.row$ci, sigdig)}CI ", - "${signif(my.row$bca_ci_low, sigdig)}; ", - "${signif(my.row$bca_ci_high, sigdig)}]\n\n") - ) - - cat(line1, line2) -} - - - +#' Compute Effect Size(s) +#' +#' For each pair of observations in a \code{dabest} object, a desired effect +#' size can be computed. Currently there are five effect sizes available: +#' \itemize{ \item The \strong{mean difference}, given by \code{mean_diff()}. +#' \item The \strong{median difference}, given by \code{median_diff()}. \item +#' \strong{Cohen's \emph{d}}, given by \code{cohens_d()}. \item \strong{Hedges' +#' \emph{g}}, given by \code{hedges_g()}. \item \strong{Cliff's delta}, given by +#' \code{cliffs_delta()}. } +#' +#' +#' @param x A \code{dabest} object, generated by the \link[=dabest]{dabest()} +#' function. +#' +#' @param ci float, default 95. The level of the confidence intervals produced. +#' The default \code{ci = 95} produces 95\% CIs. +#' +#' @param reps integer, default 5000. The number of bootstrap resamples that +#' will be generated. +#' +#' @param seed integer, default 12345. This specifies the seed used to set the +#' random number generator. Setting a seed ensures that the bootstrap +#' confidence intervals for the same data will remain stable over separate +#' runs/calls of this function. See \link{set.seed} for more details. +#' +#' +#' +#' @return A \code{dabest_effsize} object with 10 elements. +#' +#' \describe{ +#' +#' \item{\code{data}}{ The dataset passed to \link[=dabest]{dabest()}, as a +#' \code{\link[tibble]{tibble}}. } +#' +#' \item{\code{x} and \code{y}}{ The columns in \code{data} used to plot the x +#' and y axes, respectively, as supplied to \link[=dabest]{dabest()}. These are +#' \href{https://adv-r.hadley.nz/quasiquotation.html}{quoted variables} for +#' \href{https://tidyeval.tidyverse.org/}{tidy evaluation} during the +#' computation of effect sizes. } +#' +#' \item{\code{idx}}{ The vector of control-test groupings as initially passed +#' to \link[=dabest]{dabest()}. } +#' +#' \item{\code{is.paired}}{ Whether or not the experiment consists of paired +#' (aka repeated) observations. Originally supplied to \link[=dabest]{dabest()}. } +#' +#' \item{\code{id.column}}{ If \code{is.paired} is \code{TRUE}, the column in +#' \code{data} that indicates the pairing of observations. As passed to +#' \link[=dabest]{dabest()}. } +#' +#' \item{\code{effect.size}}{ The effect size being computed. One of the +#' following: \code{c("mean_diff", "median_diff", "cohens_d", "hedges_g", +#' "cliffs_delta")}. } +#' +#' \item{\code{.data.name}}{ The variable name of the dataset passed to +#' \link[=dabest]{dabest()}. } +#' +#' \item{\code{summary}}{ A \link{tibble} with a row for the mean or median of +#' each group in the \code{x} column of \code{data}, as indicated in +#' \code{idx}. } +#' +#' \item{\code{result}}{ +#' +#' A \link{tibble} with the following 15 columns: +#' +#' \describe{ \item{control_group, test_group}{ The name of the control group +#' and test group respectively.} +#' +#' \item{control_size, test_size}{ The number of observations in the control +#' group and test group respectively. } +#' +#' \item{effect_size}{ The effect size used. } +#' +#' \item{paired}{ Is the difference paired (\code{TRUE}) or not +#' (\code{FALSE})? } +#' +#' \item{difference}{ The effect size of the difference between the two +#' groups. } +#' +#' \item{variable}{ The variable whose difference is being computed, ie. the +#' column supplied to \code{y}. } +#' +#' \item{ci}{ The \code{ci} passed to this function. } +#' +#' \item{bca_ci_low, bca_ci_high}{ The lower and upper limits of the Bias +#' Corrected and Accelerated bootstrap confidence interval. } +#' +#' \item{pct_ci_low, pct_ci_high}{ The lower and upper limits of the +#' percentile bootstrap confidence interval. } +#' +#' \item{bootstraps}{ The vector of bootstrap resamples generated. } } } +#' +#' } +#' +#' +#' @seealso \itemize{ +#' +#' \item \link[=dabest]{Loading data} for effect size computation. +#' +#' \item \link[=plot.dabest_effsize]{Generating estimation plots} after effect +#' size computation. +#' +#' \item The +#' \href{http://www.estimationstats.com/#/about-effect-sizes}{mathematical +#' definitions} and equations used to compute each effect size. +#' +#' \item The \code{ \link[effsize:effsize-package]{effsize} } package, which +#' is used under the hood to compute Cohen's \emph{d}, Hedges' \emph{g}, and +#' Cliff's delta. +#' +#' \item The \code{ \link[boot]{boot}() } and \code{ \link[boot]{boot.ci}() } +#' functions from the \code{boot} package, which generate the (nonparametric) +#' bootstrapped resamples used to compute the confidence intervals. +#' +#' } +#' +#' +#' @examples +#' # Loading data for unpaired (two independent groups) analysis. +#' petal_widths <- dabest(iris, Species, Petal.Width, +#' idx = c("setosa", "versicolor"), +#' paired = FALSE) +#' +#' +#' # Compute the mean difference. +#' mean_diff(petal_widths) +#' +#' # Plotting the mean differences. +#' mean_diff(petal_widths) %>% plot() +#' +#' @export +mean_diff <- function(x, ci = 95, reps = 5000 , seed = 12345) UseMethod("mean_diff", x) + +#' @export +mean_diff.dabest <- function(x, ci = 95, reps = 5000 , seed = 12345) { + effect_size(x, ci = ci, reps = reps, seed = seed, effect.size = "mean_diff") +} + + +#' @rdname mean_diff +#' @export +median_diff <- function(x, ci = 95, reps = 5000 , seed = 12345) UseMethod("median_diff", x) + +#' @export +median_diff.dabest <- function(x, ci = 95, reps = 5000 , seed = 12345) { + effect_size(x, ci = ci, reps = reps, seed = seed, effect.size = "median_diff") +} + + +#' @rdname mean_diff +#' @export +cohens_d <- function(x, ci = 95, reps = 5000 , seed = 12345) UseMethod("cohens_d", x) + +#' @export +cohens_d.dabest <- function(x, ci = 95, reps = 5000 , seed = 12345) { + effect_size(x, ci = ci, reps = reps, seed = seed, effect.size = "cohens_d") +} + + + +#' @rdname mean_diff +#' @export +hedges_g <- function(x, ci = 95, reps = 5000 , seed = 12345) UseMethod("hedges_g", x) + +#' @export +hedges_g.dabest <- function(x, ci = 95, reps = 5000 , seed = 12345) { + effect_size(x, ci = ci, reps = reps, seed = seed, effect.size = "hedges_g") +} + + + +#' @rdname mean_diff +#' @export +cliffs_delta <- function(x, ci = 95, reps = 5000 , seed = 12345) UseMethod("cliffs_delta", x) + +#' @export +cliffs_delta.dabest <- function(x, ci = 95, reps = 5000 , seed = 12345) { + effect_size(x, ci = ci, reps = reps, seed = seed, effect.size = "cliffs_delta") +} + + +#### Helper functions that actually compute the effect sizes. #### +mean_diff_ <- function(control, treatment, paired) { + if (identical(paired, FALSE)) return(mean(treatment) - mean(control)) + else return(mean(treatment - control)) +} + + + +median_diff_ <- function(control, treatment, paired) { + if (identical(paired, FALSE)) return(median(treatment) - median(control)) + else return(median(treatment - control)) +} + + + +cohens_d_ <- function(control, treatment, paired) { + return(effsize::cohen.d(treatment, control, paired=paired)$estimate) +} + + + +hedges_g_ <- function(control, treatment, paired) { + cd <- cohens_d_(treatment, control, paired=paired) + # not sure why I have to negate this.... + corr.factor <- -hedges_correction(treatment, control) + return(cd * corr.factor) +} + + + +cliffs_delta_ <- function(control, treatment, paired=NA) { + return(effsize::cliff.delta(treatment, control)$estimate) +} + + +#' Returns the exact Hedges' correction factor for Cohen's d. +#' @param x1 A vector of values. +#' +#' @param x2 Another vector of values. +#' +#' @return Hedges' correction for the g of x1 and x2. +#' +hedges_correction <- function(x1, x2) { + + n1 <- length(x1) + n2 <- length(x2) + + deg.freedom <- n1 + n2 - 2 + numer <- gamma(deg.freedom/2) + denom0 <- gamma((deg.freedom - 1) / 2) + denom <- sqrt((deg.freedom / 2)) * denom0 + + if (is.infinite(numer) | is.infinite(denom)) { + # Occurs when df is too large. + # Applies Hedges and Olkin's approximation. + df.sum <- n1 + n2 + denom <- (4 * df.sum) - 9 + out <- 1 - (3 / denom) + } else out <- numer / denom + + return(out) +} + + + +#' Compute the standardizers for Cohen's d +#' @param x1 A vector of values. +#' +#' @param x2 Another vector of values. +#' +#' @return Named list for pooled and average standardizers. +#' Use the pooled one for unpaired Cohens' d, and the average one +#' for paired Cohens'd d. +#' +cohen_d_standardizers <- function(x1, x2) { + control_n <- length(x1) + test_n <- length(x2) + + control_mean <- mean(x1) + test_mean <- mean(x2) + + control_var <- var(x1) # by default, use N-1 to compute the variance. + test_var <- var(x2) + + control_std <- sd(x1) + test_std <- sd(x2) + + # For unpaired 2-groups standardized mean difference. + pooled <- sqrt( + ((control_n - 1) * control_var + (test_n - 1) * test_var) / + (control_n + test_n - 2) + ) + + # For paired standardized mean difference. + average <- sqrt((control_var + test_var) / 2) + + # out <- c(pooled, average) + # names(out) <- c("pooled", "average") + + return(list(pooled = pooled, average = average)) + +} + + + + +effsize_boot <- function(data, effsize_func, R = 5000, paired = FALSE) { + + func <- match.fun(effsize_func) + + # Create stata for resampling. + s <- c(rep(1, length(data$control)), + rep(2, length(data$test))) + + + # accounts for paired or unpaired. + bootboot <- function(d, indicies, paired) { + if (identical(paired, FALSE)) { + c <- d[indicies[s == 1]] + t <- d[indicies[s == 2]] + } else { + c <- d[indicies,1] + t <- d[indicies,2] + } + return(func(c, t, paired)) + } + if (identical(paired, FALSE)) { + b <- boot( + c(data$control, data$test), + statistic = bootboot, + R = R, + strata = s, + paired = paired) + } else { + b <- boot( + data.frame(data$control, data$test), + statistic = bootboot, + R = R, + paired = paired) + } + + return(b) + +} + + +# effsize_boot <- function(effsize, data, paired, indices) { +# control <- data$control[indices] +# test <- data$test[indices] +# +# cd <- effsize(control, test, paired) +# +# return(cd) +# } + + + +#' @importFrom boot boot boot.ci +#' @importFrom dplyr arrange bind_rows filter group_by summarize +#' @importFrom magrittr %>% +#' @importFrom rlang quo_name +#' @importFrom stringr str_interp +#' @importFrom tibble tibble +effect_size <- function(.data, ..., effect.size, ci, reps, seed) { + + #### Check object class #### + if (class(.data)[1] != "dabest") { + stop(paste( + "The object you are plotting is not a `dabest` class object. ", + "Please check again! ") + ) + } + + # Create handles for easy access to the items in `.data`. + raw.data <- .data$data + idx <- .data$idx + all.groups <- .data$.all.groups + paired <- .data$is.paired + data.name <- .data$.data.name + is.paired <- .data$is.paired + + plot.groups.sizes <- unlist(lapply(idx, length)) + + # The variables below should are quosures! + x_enquo <- .data$x + y_enquo <- .data$y + effect.size_enquo <- quo_name(effect.size) + + x_quoname <- quo_name(x_enquo) + y_quoname <- quo_name(y_enquo) + + # effect.size_enquo <- rlang::enquo(effect.size) + # effect.size_quoname <- rlang::quo_name(effect.size_enquo) + + id.col_enquo <- .data$id.column + + # Parse the effect.size. + if (effect.size == "mean_diff") { + es = mean_diff_ + summ_func = mean + summ_name = "mean" + } else if (effect.size == "median_diff") { + es = median_diff_ + summ_func = median + summ_name = "median" + } else if (effect.size == "cohens_d") { + es = cohens_d_ + summ_func = mean + summ_name = "mean" + } else if (effect.size == "hedges_g") { + es = hedges_g_ + summ_func = mean + summ_name = "mean" + } else if (effect.size == "cliffs_delta") { + es = cliffs_delta_ + summ_func = median + summ_name = "median" + } + + + + #### Loop through each comparison group. #### + result <- tibble() # To capture output. + + for (group in idx) { + + # Check the control group (`group[1]`) is in the x-column. + if (identical(group[1] %in% raw.data[[x_quoname]], FALSE)) { + + err1 <- str_interp("${group[1]} is not found") + err2 <- str_interp("in the ${x_quoname} column.") + + stop(paste(err1, err2)) + } + + # Patch in v0.2.2. + # Note how we have to unquote both the x_enquo, and the group name! + ctrl <- raw.data %>% filter(!!x_enquo == !!group[1]) + + ctrl <- ctrl[[y_quoname]] + + c <- na.omit(ctrl) + + # If ctrl is length 0, stop! + if (length(c) == 0) { + stop( + str_interp( + c("There are zero numeric observations in the group ${group[1]}.") + ) + ) + } + + # Get test groups (everything else in group), loop through them and compute + # the difference between group[1] and each group. + # Test groups are the 2nd element of group onwards. + test_groups <- group[2: length(group)] + + for (grp in test_groups) { + + # Check if the current group is in the x-column. + if (identical(grp %in% raw.data[[x_quoname]], FALSE)) { + stop( + str_interp( + "${grp} is not found in the ${x_quoname} column." + ) + ) + } + + # Patch in v0.2.2. + # Note how we have to unquote both x_enquo, and grp! + test <- raw.data %>% filter(!!x_enquo == !!grp) + test <- test[[y_quoname]] + t <- na.omit(test) + + # If current test group is length 0, stop! + if (length(t) == 0) { + stop( + str_interp( + c("There are zero numeric observations in the group ${grp}.") + ) + ) + } + + + + #### Compute bootstrap. #### + datalist <- list(control = ctrl, test = test) + + set.seed(seed) + boot_result <- effsize_boot(datalist, effsize_func = es, + R = reps, paired = paired) + + set.seed(NULL) + + # if (identical(paired, FALSE)) { + # diff <- func(t) - func(c) + # # For two.boot, note that the first vector is the test vector. + # boot <- simpleboot::two.boot(t, c, FUN = func, R = reps) + # + # } else { + # if (length(c) != length(t)) { + # stop("The two groups are not the same size, but paired = TRUE.") + # } + # paired_diff <- t - c + # diff <- func(paired_diff) + # boot <- simpleboot::one.boot(paired_diff, FUN = func, R = reps) + # } + + + + #### Compute confidence interval. #### + # check CI. + if (ci < 0 | ci > 100) { + err_string <- str_interp( + "`ci` must be between 0 and 100, not ${ci}" + ) + stop(err_string) + } + + bootci <- boot.ci(boot_result, conf = ci/100, type = c("perc", "bca")) + + + #### Save pairwise result. #### + row <- tibble( + # Convert the name of `func` to a string. + control_group = group[1], + test_group = grp, + control_size = length(c), + test_size = length(t), + effect_size = effect.size, + paired = paired, + variable = y_quoname, + difference = boot_result$t0, + ci = ci, + bca_ci_low = bootci$bca[4], + bca_ci_high = bootci$bca[5], + pct_ci_low = bootci$percent[4], + pct_ci_high = bootci$percent[5], + bootstraps = list(as.vector(boot_result$t)), + nboots = length(boot_result$t) + ) + result <- bind_rows(result, row) + } + } + + # Reset seed. + set.seed(NULL) + + + + #### Compute summaries. #### + summaries <- + raw.data %>% + filter(!!x_enquo %in% all.groups) %>% + group_by(!!x_enquo) %>% + summarize(func_quoname = summ_func(!!y_enquo)) + + colnames(summaries) <- c(x_quoname, summ_name) + + # Order the summaries by the idx. + summaries[[x_quoname]] <- + summaries[[x_quoname]] %>% + factor(all.groups, ordered = TRUE) + + summaries <- summaries %>% arrange(!!x_enquo) + + + + #### Assemble only the data used to create the plot. #### + data.out <- raw.data + + # New in v0.2.1 patch. + # Basically, the `ellipsis` package has been updated, + # and now forcats::as_factor() should only take the object to coerce. + data.out[[x_quoname]] <- forcats::as_factor(data.out[[x_quoname]]) + + data.out <- filter(data.out, !!x_enquo %in% all.groups) + + + + #### Collate output. #### + out = list( + data = data.out, + x = x_enquo, + y = y_enquo, + idx = idx, + id.column = id.col_enquo, + is.paired = is.paired, + effect.size = effect.size, + .data.name = data.name, + result = result, + summary = summaries + ) + + + + #### Append the custom class `dabest_effsize`. #### + class(out) <- c("dabest_effsize", "list") + + + + #### Return the output. #### + return(out) +} + + + +#' Print a `dabest_effsize` object +#' +#' @param x A \code{dabest_effsize} object, generated by one of the +#' \link[=mean_diff]{effect size computation} functions. +#' +#' @param ... S3 signature for generic plot function. +#' +#' @param signif_digits Integer, default 3. All numeric figures in the printed +#' output will be rounded off to this number of significant digits. +#' +#' +#' @return A summary of the effect sizes and respective confidence intervals. +#' +#' @examples +#' # Performing unpaired (two independent groups) analysis. +#' unpaired_mean_diff <- dabest(iris, Species, Petal.Width, +#' idx = c("setosa", "versicolor"), +#' paired = FALSE) %>% +#' mean_diff() +#' +#' # Display the results in a user-friendly format. +#' print(unpaired_mean_diff) +#' +#' @export +#' @importFrom rlang quo_name +#' @importFrom stringr str_interp +print.dabest_effsize <- function(x, ..., signif_digits = 3) { + + #### Check object class #### + if (class(x)[1] != "dabest_effsize") { + stop(paste( + "The object you are plotting is not a `dabest_effsize` class object. ", + "Please check again! ")) + } else { + dabest.effsize <- x + } + + #### Get results table and y var. #### + tbl <- dabest.effsize$result + var <- quo_name(dabest.effsize$y) + + #### Print greeting header. #### + print_greeting_header() + # dabest_ver <- utils::packageVersion("dabestr") + # header <- stringr::str_interp( + # "DABEST (Data Analysis with Bootstrap Estimation) v${dabest_ver}\n") + # cat(header) + # + # cat(rep('=', nchar(header) - 1), sep='') + # cat("\n\n") + + # cat(stringr::str_interp("Variable: ${var} \n\n")) + + #### Print dataset name, xvar, and yvar. #### + xvar = quo_name(dabest.effsize$x) + yvar = quo_name(dabest.effsize$y) + + cat(str_interp("Dataset : ${dabest.effsize$.data.name}\n")) + cat(str_interp("X Variable : ${xvar}\n")) + cat(str_interp("Y Variable : ${yvar}\n\n")) + + #### Print each row. #### + cat(apply(tbl, 1, printrow_, sigdig = signif_digits), + sep = "\n") + + #### Endnote about BCa. #### + cat(str_interp("${tbl$nboots[1]} bootstrap resamples.\n")) + cat("All confidence intervals are bias-corrected and accelerated.\n\n") + +} + + + +#' @importFrom stringr str_interp +printrow_ <- function(my.row, sigdig = 3) { + if (identical(my.row$paired, TRUE)) p <- "Paired" else p <- "Unpaired" + + # For labelling the y-axis. + effsizer <- list("mean difference", "median difference", + "Cohen's d", "Hedges' g", "Cliff's delta") + names(effsizer) <- c("mean_diff", "median_diff", + "cohens_d", "hedges_g", "cliffs_delta") + + line1 <- str_interp( + c( + "${p} ${effsizer[[my.row$effect_size]]} of ", + "${my.row$test_group} ", + "(n = ${my.row$test_size}) ", + "minus ${my.row$control_group} ", + "(n = ${my.row$control_size})\n" + ) + ) + + + line2 <- str_interp( + c("${signif(my.row$difference, sigdig)} ", + "[${signif(my.row$ci, sigdig)}CI ", + "${signif(my.row$bca_ci_low, sigdig)}; ", + "${signif(my.row$bca_ci_high, sigdig)}]\n\n") + ) + + cat(line1, line2) +} + + +