add kw

strengejacke · May 9, 2024 · 7c7d32f · 7c7d32f
1 parent 383b1a4
commit 7c7d32f
Show file tree

Hide file tree

Showing 8 changed files with 316 additions and 13 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -85,6 +85,7 @@ export(gmd)
 export(icc)
 export(inequ_trend)
 export(is_prime)
+export(kruskal_wallis_test)
 export(link_inverse)
 export(mann_whitney_test)
 export(mean_n)

diff --git a/R/chi_squared_test.R b/R/chi_squared_test.R
@@ -14,7 +14,7 @@
 #' @param ... Additional arguments passed down to [`chisq.test()`].
 #' @inheritParams mann_whitney_test
 #'
-#' @return A data frame.
+#' @return A data frame with test results.
 #'
 #' @examples
 #' data(efc)

diff --git a/R/kruskal_wallis_test.R b/R/kruskal_wallis_test.R
@@ -0,0 +1,276 @@
+#' @title Kruskal-Wallis-Test
+#' @name kruskal_wallis_test
+#' @description This function performs a Kruskal-Wallis rank sum test, see
+#' [`kruskal.test()`] and [`coin::kruskal_test()`]).
+#'
+#' @inheritParams mann_whitney_test
+#'
+#' @return A data frame with test results.
+#'
+#' @examples
+#' data(efc)
+#' # Mann-Whitney-U-Tests for elder's age by elder's sex.
+#' kruskal_wallis_test(efc, "e17age", by = "e16sex")
+#' @export
+kruskal_wallis_test <- function(data,
+                                select = NULL,
+                                by = NULL,
+                                weights = NULL) {
+  insight::check_if_installed("datawizard")
+
+  # sanity checks
+  .sanitize_htest_input(data, select, by, weights)
+
+  # get data
+  dv <- data[[select]]
+  grp <- data[[by]]
+
+  # coerce to factor
+  grp <- datawizard::to_factor(grp)
+
+  # only two groups allowed
+  if (insight::n_unique(grp) < 2) {
+    insight::format_error("At least two groups are required, i.e. data must have at least two unique levels in `by` for `kruskal_wallis_test()`.") # nolint
+  }
+  if (is.null(weights)) {
+    .calculate_kw(dv, grp)
+  } else {
+    .calculate_weighted_kw(dv, grp, data[[weights]])
+  }
+}
+
+
+# Kruskal-Wallis-Test --------------------------------------------
+
+.calculate_kw <- function(dv, grp) {
+  # prepare data
+  wcdat <- data.frame(dv, grp)
+  # perfom wilcox test
+  wt <- stats::kruskal.test(dv ~ grp, data = wcdat)
+  # number of groups
+  n_groups <- vapply(grp, function(g) sum(grp == g, na.rm = TRUE), numeric(1))
+
+  out <- data.frame(
+    data = wt$data.name,
+    Chi2 = wt$statistic,
+    df = wt$parameter,
+    p = as.numeric(wt$p.value),
+    stringsAsFactors = FALSE
+  )
+
+  attr(out, "n_groups") <- n_groups
+  attr(out, "method") <- "kruskal"
+  attr(out, "weighted") <- FALSE
+  class(out) <- c("sj_htest_kw", "data.frame")
+
+  out
+}
+
+
+# Weighted Mann-Whitney-Test for two groups ----------------------------------
+
+.calculate_weighted_kw <- function(dv, grp, weights) {
+  # check if pkg survey is available
+  insight::check_if_installed("survey")
+
+  dat <- stats::na.omit(data.frame(dv, grp, weights))
+  colnames(dat) <- c("x", "g", "w")
+
+  design <- survey::svydesign(ids = ~0, data = dat, weights = ~w)
+  result <- survey::svyranktest(formula = x ~ g, design, test = "KruskalWallis")
+
+  # for rank mean
+  group_levels <- levels(droplevels(grp))
+  # subgroups
+  dat_gr1 <- dat[dat$g == group_levels[1], ]
+  dat_gr2 <- dat[dat$g == group_levels[2], ]
+  dat_gr1$rank_x <- rank(dat_gr1$x)
+  dat_gr2$rank_x <- rank(dat_gr2$x)
+
+  # rank means
+  design_mean1 <- survey::svydesign(
+    ids = ~0,
+    data = dat_gr1,
+    weights = ~w
+  )
+  rank_mean_1 <- survey::svymean(~rank_x, design_mean1)
+
+  design_mean2 <- survey::svydesign(
+    ids = ~0,
+    data = dat_gr2,
+    weights = ~w
+  )
+  rank_mean_2 <- survey::svymean(~rank_x, design_mean2)
+
+  # group Ns
+  n_grp1 <- round(dat_gr1$x * dat_gr1$w)
+  n_grp2 <- round(dat_gr2$x * dat_gr2$w)
+
+  # statistics and effect sizes
+  z <- result$statistic
+  r <- abs(z / sqrt(sum(n_grp1, n_grp2)))
+
+  out <- data.frame(
+    group1 = group_levels[1],
+    group2 = group_levels[2],
+    estimate = result$estimate,
+    z = z,
+    r = r,
+    p = as.numeric(result$p.value)
+  )
+
+  attr(out, "rank_means") <- stats::setNames(
+    c(rank_mean_1, rank_mean_2),
+    c("Mean Group 1", "Mean Group 2")
+  )
+  attr(out, "n_groups") <- stats::setNames(
+    c(n_grp1, n_grp2),
+    c("N Group 1", "N Group 2")
+  )
+  attr(out, "group_labels") <- group_labels
+  attr(out, "method") <- method
+  attr(out, "weighted") <- TRUE
+  class(out) <- c("sj_htest_mwu", "data.frame")
+
+  out
+}
+
+
+# helper ----------------------------------------------------------------------
+
+.misspelled_string <- function(source, searchterm, default_message = NULL) {
+  if (is.null(searchterm) || length(searchterm) < 1) {
+    return(default_message)
+  }
+  # used for many matches
+  more_found <- ""
+  # init default
+  msg <- ""
+  # remove matching strings
+  same <- intersect(source, searchterm)
+  searchterm <- setdiff(searchterm, same)
+  source <- setdiff(source, same)
+  # guess the misspelled string
+  possible_strings <- unlist(lapply(searchterm, function(s) {
+    source[.fuzzy_grep(source, s)] # nolint
+  }), use.names = FALSE)
+  if (length(possible_strings)) {
+    msg <- "Did you mean "
+    if (length(possible_strings) > 1) {
+      # make sure we don't print dozens of alternatives for larger data frames
+      if (length(possible_strings) > 5) {
+        more_found <- sprintf(
+          " We even found %i more possible matches, not shown here.",
+          length(possible_strings) - 5
+        )
+        possible_strings <- possible_strings[1:5]
+      }
+      msg <- paste0(msg, "one of ", toString(paste0("\"", possible_strings, "\"")))
+    } else {
+      msg <- paste0(msg, "\"", possible_strings, "\"")
+    }
+    msg <- paste0(msg, "?", more_found)
+  } else {
+    msg <- default_message
+  }
+  # no double white space
+  insight::trim_ws(msg)
+}
+
+
+.fuzzy_grep <- function(x, pattern, precision = NULL) {
+  if (is.null(precision)) {
+    precision <- round(nchar(pattern) / 3)
+  }
+  if (precision > nchar(pattern)) {
+    return(NULL)
+  }
+  p <- sprintf("(%s){~%i}", pattern, precision)
+  grep(pattern = p, x = x, ignore.case = FALSE)
+}
+
+
+.sanitize_htest_input <- function(data, select, by, weights) {
+  # check if arguments are NULL
+  if (is.null(select)) {
+    insight::format_error("Argument `select` is missing.")
+  }
+  if (is.null(by)) {
+    insight::format_error("Arguments `by` is missing.")
+  }
+
+  # check if arguments have correct length (length of 1)
+  if (length(select) != 1 || !is.character(select)) {
+    insight::format_error("Argument `select` must be the name of a single variable.")
+  }
+  if (length(by) != 1 || !is.character(by)) {
+    insight::format_error("Argument `by` must be the name of a single variable.")
+  }
+  if (!is.null(weights) && length(weights) != 1) {
+    insight::format_error("Argument `weights` must be the name of a single variable.")
+  }
+
+  # check if "select" is in data
+  if (!select %in% colnames(data)) {
+    insight::format_error(
+      sprintf("Variable '%s' not found in data frame.", select),
+      .misspelled_string(colnames(data), select, "Maybe misspelled?")
+    )
+  }
+  # check if "by" is in data
+  if (!by %in% colnames(data)) {
+    insight::format_error(
+      sprintf("Variable '%s' not found in data frame.", by),
+      .misspelled_string(colnames(data), by, "Maybe misspelled?")
+    )
+  }
+  # check if "weights" is in data
+  if (!is.null(weights) && !weights %in% colnames(data)) {
+    insight::format_error(
+      sprintf("Weighting variable '%s' not found in data frame.", weights),
+      .misspelled_string(colnames(data), weights, "Maybe misspelled?")
+    )
+  }
+}
+
+
+# methods ---------------------------------------------------------------------
+
+#' @export
+print.sj_htest_mwu <- function(x, ...) {
+  # fetch attributes
+  group_labels <- attributes(x)$group_labels
+  rank_means <- attributes(x)$rank_means
+  n_groups <- attributes(x)$n_groups
+  weighted <- attributes(x)$weighted
+
+  if (weighted) {
+    weight_string <- " (weighted)"
+  } else {
+    weight_string <- ""
+  }
+
+  # same width
+  group_labels <- format(group_labels)
+
+  # header
+  insight::print_color(sprintf("# Mann-Whitney-Test%s\n\n", weight_string), "blue")
+
+  # group-1-info
+  insight::print_color(
+    sprintf(
+      "  Group 1: %s (n = %i, rank mean = %s)\n",
+      group_labels[1], n_groups[1], insight::format_value(rank_means[1], protect_integers = TRUE)
+    ), "cyan"
+  )
+
+  # group-2-info
+  insight::print_color(
+    sprintf(
+      "  Group 2: %s (n = %i, rank mean = %s)\n",
+      group_labels[2], n_groups[2], insight::format_value(rank_means[2], protect_integers = TRUE)
+    ), "cyan"
+  )
+
+  cat(sprintf("\n  r = %.3f, Z = %.3f, %s\n\n", x$r, x$z, insight::format_p(x$p)))
+}
diff --git a/R/mann_whitney_test.R b/R/mann_whitney_test.R
@@ -16,7 +16,7 @@
 #' should be computed. May be one of `"exact"`, `"approximate"` or `"asymptotic"`
 #' (default). See [`coin::wilcox_test()`] for details.
 #'
-#' @return A data frame.
+#' @return A data frame with test results.
 #'
 #' @details This function calls [`coin::wilcox_test()`] to extract effect sizes.
 #' Interpretation of the effect size **r**, as a rule-of-thumb:
@@ -138,14 +138,8 @@ mann_whitney_test <- function(data,
   dat <- stats::na.omit(data.frame(dv, grp, weights))
   colnames(dat) <- c("x", "g", "w")
 
-  if (insight::n_unique(dat$g) > 2) {
-    method <- "KruskalWallis"
-  } else {
-    method <- "wilcoxon"
-  }
-
   design <- survey::svydesign(ids = ~0, data = dat, weights = ~w)
-  result <- survey::svyranktest(formula = x ~ g, design, test = method)
+  result <- survey::svyranktest(formula = x ~ g, design, test = "wilcoxon")
 
   # for rank mean
   group_levels <- levels(droplevels(grp))
@@ -196,7 +190,6 @@ mann_whitney_test <- function(data,
     c("N Group 1", "N Group 2")
   )
   attr(out, "group_labels") <- group_labels
-  attr(out, "method") <- method
   attr(out, "weighted") <- TRUE
   class(out) <- c("sj_htest_mwu", "data.frame")
 

diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -30,8 +30,9 @@ reference:
 - title: "Summary Statistics and Tests"
   contents:
   - gmd
-  - mann_whitney_test
   - chi_squared_test
+  - kruskal_wallis_test
+  - mann_whitney_test
   - var_pop
 
 - title: "Tools for Regression Models"

diff --git a/man/chi_squared_test.Rd b/man/chi_squared_test.Rd
diff --git a/man/kruskal_wallis_test.Rd b/man/kruskal_wallis_test.Rd
diff --git a/man/mann_whitney_test.Rd b/man/mann_whitney_test.Rd