@@ -83,3 +83,19 @@ se <- function(x, ...) {
+#' @rdname r2
+#' @export
+means_by_group <- function(x, ...) {
+ # Defunct stub: tells users that the function now lives in datawizard.
+ .Defunct("datawizard::means_by_group()")
+ # NOTE(review): `.Defunct()` signals an error, so this forwarding call is
+ # presumably never reached - use `.Deprecated()` if forwarding is intended.
+ datawizard::means_by_group(x, ...)
+#' @rdname r2
+#' @export
+mean_n <- function(x, ...) {
+ # Defunct stub: tells users to call `datawizard::row_means()` instead.
+ .Defunct("datawizard::row_means()")
+ # NOTE(review): `.Defunct()` signals an error, so this forwarding call is
+ # presumably never reached - use `.Deprecated()` if forwarding is intended.
+ datawizard::row_means(x, ...)
diff --git a/R/anova_stats.R b/R/anova_stats.R
index ef7c1218..f7917687 100644
--- a/R/anova_stats.R
+++ b/R/anova_stats.R
@@ -29,9 +29,7 @@
#' }
#' @export
anova_stats <- function(model, digits = 3) {
- if (!requireNamespace("pwr", quietly = TRUE)) {
- stop("Package `pwr` needed for this function to work. Please install it.", call. = FALSE)
- }
+ insight::check_if_installed("pwr")
# .Deprecated("effectsize::effectsize()", package = "effectsize")
@@ -94,6 +92,7 @@ aov_stat <- function(model, type) {
aov_stat_summary <- function(model) {
+ insight::check_if_installed("broom")
# check if we have a mixed model
mm <- is_merMod(model)
ori.model <- model
diff --git a/R/chi_squared_test.R b/R/chi_squared_test.R
new file mode 100644
index 00000000..426883a6
--- /dev/null
+++ b/R/chi_squared_test.R
@@ -0,0 +1,267 @@
+#' @title Chi-Squared test
+#' @name chi_squared_test
+#' @description This function performs a \eqn{\chi^2} test for contingency
+#' tables or tests for given probabilities. The returned effect sizes are
+#' Cramer's V for tables with more than two rows and columns, Phi (\eqn{\phi})
+#' for 2x2 tables, and \ifelse{latex}{\eqn{Fei}}{פ (Fei)} for tests against
+#' given probabilities (see _Ben-Shachar et al. 2023_).
+#' @param probabilities A numeric vector of probabilities for each cell in the
+#' contingency table. The length of the vector must match the number of cells
+#' in the table, i.e. the number of unique levels of the variable specified
+#' in `select`. If `probabilities` is provided, a chi-squared test for given
+#' probabilities is conducted. Furthermore, if `probabilities` is given, `by`
+#' must be `NULL`. The probabilities must sum to 1.
+#' @param paired Logical, if `TRUE`, a McNemar test is conducted for 2x2 tables.
+#' Note that `paired` only works for 2x2 tables.
+#' @param ... Additional arguments passed down to [`chisq.test()`].
+#' @inheritParams mann_whitney_test
+#' @return A data frame with test results. The returned effect sizes are
+#' Cramer's V for tables with more than two rows and columns, Phi (\eqn{\phi})
+#' for 2x2 tables, and \ifelse{latex}{\eqn{Fei}}{פ (Fei)} for tests against
+#' given probabilities.
+#' @details The function is a wrapper around [`chisq.test()`] and
+#' [`fisher.test()`] (for small expected values) for contingency tables, and
+#' `chisq.test()` for given probabilities. `probabilities` must sum to 1;
+#' `rescale.p = TRUE` only absorbs rounding error. When `fisher.test()`
+#' is called, simulated p-values are returned (i.e. `simulate.p.value = TRUE`,
+#' see `?fisher.test`). If `paired = TRUE` and a 2x2 table is provided,
+#' a McNemar test (see [`mcnemar.test()`]) is conducted.
+#' The weighted version of the chi-squared test is based on a weighted
+#' table, using [`xtabs()`] as input for `chisq.test()`.
+#' @references Ben-Shachar, M.S., Patil, I., Thériault, R., Wiernik, B.M.,
+#' Lüdecke, D. (2023). Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data
+#' That Use the Chi‑Squared Statistic. Mathematics, 11, 1982.
+#' \doi{10.3390/math11091982}
+#' @examples
+#' data(efc)
+#' efc$weight <- abs(rnorm(nrow(efc), 1, 0.3))
+#' # Chi-squared test
+#' chi_squared_test(efc, "c161sex", by = "e16sex")
+#' # weighted Chi-squared test
+#' chi_squared_test(efc, "c161sex", by = "e16sex", weights = "weight")
+#' # Chi-squared test for given probabilities
+#' chi_squared_test(efc, "c161sex", probabilities = c(0.3, 0.7))
+#' @export
+chi_squared_test <- function(data,
+ select = NULL,
+ by = NULL,
+ probabilities = NULL,
+ weights = NULL,
+ paired = FALSE,
+ ...) {
+ # Dispatcher: without `probabilities` the contingency-table test is run,
+ # otherwise the test against the given probabilities.
+ if (is.null(probabilities)) {
+ .calculate_chisq(data, select, by, weights, paired, ...)
+ } else {
+ # sanity check - `paired = TRUE` is not available for given probabilities
+ if (paired) {
+ insight::format_error("When `probabilities` are provided, `paired = TRUE` is not available.") # nolint
+ }
+ # NOTE(review): `by` is not passed on here, so a non-NULL `by` is silently
+ # ignored although the documentation says it must be `NULL`.
+ .calculate_chisq_gof(data, select, probabilities, weights, ...)
+ }
+# Chi-squared test for contingency tables ------------------------------------
+# Tests the contingency table of `select` by `by`. Runs `chisq.test()` by
+# default, `mcnemar.test()` when `paired = TRUE`, and takes the p-value from
+# `fisher.test()` when expected counts are small. Returns a one-row data frame
+# of class "sj_htest_chi" with statistic, effect size, p-value, df and n.
+.calculate_chisq <- function(data, select, by, weights, paired = FALSE, ...) {
+ insight::check_if_installed("datawizard")
+ # sanity checks
+ .sanitize_htest_input(data, select, by, weights)
+ # get data
+ grp1 <- data[[select]]
+ grp2 <- data[[by]]
+ # if paired = TRUE, we only allow a 2x2 table
+ if (paired && (length(stats::na.omit(unique(grp1))) != 2 || length(stats::na.omit(unique(grp2))) != 2)) {
+ insight::format_error("When `paired = TRUE`, only 2x2 tables are allowed (i.e. both variables must have exactly two levels).") # nolint
+ }
+ # create data frame for table
+ x <- data.frame(
+ grp1 = datawizard::to_factor(grp1),
+ grp2 = datawizard::to_factor(grp2)
+ )
+ # add weights
+ if (!is.null(weights)) {
+ x$weights <- data[[weights]]
+ }
+ # remove missings
+ x <- stats::na.omit(x)
+ # contingency table
+ if (is.null(weights)) {
+ tab <- table(x)
+ } else {
+ # weighted cell counts (sum of weights per cell), rounded to whole numbers
+ tab <- as.table(round(stats::xtabs(x[[3]] ~ x[[1]] + x[[2]])))
+ class(tab) <- "table"
+ }
+ # expected values, to identify whether Fisher's test is needed
+ # (row totals times column totals, divided by the grand total)
+ # NOTE(review): values are rounded before the `< 5` / `< 10` comparison
+ # below, so e.g. 4.6 counts as 5 - confirm this is intended.
+ expected_values <- as.table(round(as.array(margin.table(tab, 1)) %*% t(as.array(margin.table(tab, 2))) / margin.table(tab))) # nolint
+ # paired? McNemar's test
+ if (paired) {
+ htest <- suppressWarnings(stats::mcnemar.test(tab, ...))
+ test_statistic <- htest$statistic
+ } else {
+ # chi-squared test
+ htest <- suppressWarnings(stats::chisq.test(tab, ...))
+ test_statistic <- htest$statistic
+ # need fisher?
+ # the chi-squared statistic stored above is kept; only `htest` (and thus
+ # the p-value and method string used below) is replaced
+ # NOTE(review): `...` is forwarded to `chisq.test()` and `fisher.test()`
+ # alike - verify that arguments meant for one are accepted by the other.
+ if (min(expected_values) < 5 || (min(expected_values) < 10 && htest$parameter == 1)) {
+ htest <- stats::fisher.test(tab, simulate.p.value = TRUE, ...)
+ }
+ }
+ p_value <- htest$p.value
+ # effect size
+ # Cramer's V if any dimension exceeds two levels, Phi otherwise
+ if (nrow(tab) > 2 || ncol(tab) > 2) {
+ effect_size <- stats::setNames(cramer(tab), "Cramer's V")
+ } else {
+ effect_size <- stats::setNames(phi(tab), "Phi")
+ }
+ # return result
+ out <- data.frame(
+ data = paste(select, "by", by),
+ statistic_name = "Chi-squared",
+ statistic = test_statistic,
+ effect_size_name = names(effect_size),
+ effect_size = as.numeric(effect_size),
+ p = p_value,
+ df = (nrow(tab) - 1) * (ncol(tab) - 1),
+ n_obs = sum(tab, na.rm = TRUE),
+ stringsAsFactors = FALSE
+ )
+ # class and attributes are read by `print.sj_htest_chi()`
+ class(out) <- c("sj_htest_chi", "data.frame")
+ attr(out, "weighted") <- !is.null(weights)
+ attr(out, "fisher") <- isTRUE(startsWith(htest$method, "Fisher"))
+ attr(out, "mcnemar") <- isTRUE(paired)
+ attr(out, "caption") <- "contingency tables"
+ out
+# Chi-squared test of the frequencies of `select` against the given
+# `probabilities`, with Fei as effect size. Returns a one-row data frame of
+# class "sj_htest_chi".
+# NOTE(review): unlike `.calculate_chisq()`, the input is not passed through
+# `.sanitize_htest_input()` here - confirm this is intended.
+.calculate_chisq_gof <- function(data, select, probabilities, weights, ...) {
+ insight::check_if_installed("effectsize")
+ # get data
+ x <- data.frame(grp = data[[select]])
+ # add weights
+ if (!is.null(weights)) {
+ x$weights <- data[[weights]]
+ }
+ # remove missings
+ x <- stats::na.omit(x)
+ # contingency table
+ if (is.null(weights)) {
+ tab <- table(x)
+ } else {
+ # weighted counts (sum of weights per level), rounded to whole numbers
+ tab <- as.table(round(stats::xtabs(x[[2]] ~ x[[1]])))
+ class(tab) <- "table"
+ }
+ # table dimensions
+ # number of levels that remain after unused levels are dropped
+ # NOTE(review): `tab` is built without `droplevels()`; for a factor with
+ # unused levels its length may differ from `n_rows` - verify.
+ n_rows <- nlevels(droplevels(as.factor(x$grp)))
+ # sanity check
+ if (length(probabilities) != n_rows) {
+ insight::format_error("Length of probabilities must match number of cells in table (i.e. number of levels of input factor).") # nolint
+ }
+ # sum is compared with the tolerance of `all.equal()`, so `rescale.p = TRUE`
+ # below can only correct deviations within that tolerance
+ if (!isTRUE(all.equal(sum(probabilities), 1))) {
+ insight::format_error("Probabilities must sum to 1.")
+ }
+ # chi-squared test
+ htest <- suppressWarnings(stats::chisq.test(tab, p = probabilities, rescale.p = TRUE, ...))
+ test_statistic <- htest$statistic
+ p_value <- htest$p.value
+ effect_size <- effectsize::chisq_to_fei(
+ test_statistic,
+ n = sum(tab),
+ nrow = n_rows,
+ ncol = 1,
+ p = probabilities,
+ alternative = "two.sided"
+ )$Fei
+ # return result
+ # NOTE(review): `datawizard::text_concatenate()` is used below, but only
+ # "effectsize" is checked with `check_if_installed()` in this function.
+ out <- data.frame(
+ data = paste(
+ select,
+ "against probabilities",
+ datawizard::text_concatenate(sprintf("%i%%", round(100 * probabilities)))
+ ),
+ statistic_name = "Chi-squared",
+ statistic = test_statistic,
+ effect_size_name = "Fei",
+ effect_size = as.numeric(effect_size),
+ p = p_value,
+ df = n_rows - 1,
+ n_obs = sum(tab, na.rm = TRUE),
+ stringsAsFactors = FALSE
+ )
+ # class and attributes are read by `print.sj_htest_chi()`
+ class(out) <- c("sj_htest_chi", "data.frame")
+ attr(out, "caption") <- "given probabilities"
+ attr(out, "weighted") <- !is.null(weights)
+ out
+# methods ---------------------------------------------------------------------
+#' @export
+print.sj_htest_chi <- function(x, ...) {
+  # Prints a chi-squared test result: a headline with the caption stored in
+  # the "caption" attribute, an optional note when Fisher's or McNemar's test
+  # was used, the data description with sample size, and one line with
+  # statistic, effect size, df and p-value. Returns `x` invisibly.
+  # `isTRUE()` keeps printing working when the "weighted" attribute is absent
+  if (isTRUE(attributes(x)$weighted)) {
+    weight_string <- " (weighted)"
+  } else {
+    weight_string <- ""
+  }
+  fisher <- attributes(x)$fisher
+  mcnemar <- attributes(x)$mcnemar
+  # headline
+  insight::print_color(sprintf(
+    "\n# Chi-squared test for %s%s\n",
+    attributes(x)$caption,
+    weight_string
+  ), "blue")
+  # Fisher's exact test?
+  if (isTRUE(fisher)) {
+    insight::print_color("  (using Fisher's exact test due to small expected values)\n", "blue") # nolint
+  } else if (isTRUE(mcnemar)) {
+    insight::print_color("  (using McNemar's test for paired data)\n", "blue") # nolint
+  }
+  cat("\n")
+  # data info
+  insight::print_color(
+    sprintf("  Data: %s (n = %i)\n", x$data, round(x$n_obs)),
+    "cyan"
+  )
+  # prepare and align strings
+  eff_symbol <- .format_symbols(x$effect_size_name)
+  stat_symbol <- .format_symbols(x$statistic_name)
+  cat(sprintf(
+    "\n  %s = %.4f, %s = %.4f, df = %i, %s\n\n",
+    stat_symbol, x$statistic, eff_symbol, x$effect_size, round(x$df), insight::format_p(x$p)
+  ))
+  # print methods are expected to return their argument invisibly
+  invisible(x)
diff --git a/R/cramer.R b/R/cramer.R
index 66798e72..a623da44 100644
--- a/R/cramer.R
+++ b/R/cramer.R
@@ -1,53 +1,52 @@
#' @rdname crosstable_statistics
#' @export
-cramer <- function(tab, ...) {
- UseMethod("cramer")
+cramers_v <- function(tab, ...) {
+ UseMethod("cramers_v")
+#' @rdname crosstable_statistics
+#' @export
+cramer <- cramers_v
#' @export
-cramer.table <- function(tab, ...) {
- .cramer(tab)
+cramers_v.table <- function(tab, ...) {
+ .cramers_v(tab)
#' @export
-cramer.ftable <- function(tab, ...) {
- .cramer(tab)
+cramers_v.ftable <- function(tab, ...) {
+ .cramers_v(tab)
#' @rdname crosstable_statistics
#' @export
-cramer.formula <- function(formula, data, ci.lvl = NULL, n = 1000, method = c("dist", "quantile"), ...) {
+cramers_v.formula <- function(formula, data, ci.lvl = NULL, n = 1000, method = c("dist", "quantile"), ...) {
terms <- all.vars(formula)
tab <- table(data[[terms[1]]], data[[terms[2]]])
method <- match.arg(method)
if (is.null(ci.lvl) || is.na(ci.lvl)) {
- .cramer(tab)
+ .cramers_v(tab)
} else {
- ci <- data[, terms] %>%
- sjstats::bootstrap(n) %>%
- dplyr::mutate(
- tables = lapply(.data$strap, function(x) {
- dat <- as.data.frame(x)
- table(dat[[1]], dat[[2]])
- }),
- cramers = sapply(.data$tables, function(x) .cramer(x))
- ) %>%
- dplyr::pull("cramers") %>%
- boot_ci(ci.lvl = ci.lvl, method = method)
+ straps <- sjstats::bootstrap(data[terms], n)
+ tables <- lapply(straps$strap, function(x) {
+ dat <- as.data.frame(x)
+ table(dat[[1]], dat[[2]])
+ })
+ cramers <- sapply(tables, function(x) .cramers_v(x))
+ ci <- boot_ci(cramers, ci.lvl = ci.lvl, method = method)
- cramer = .cramer(tab),
+ cramer = .cramers_v(tab),
conf.low = ci$conf.low,
conf.high = ci$conf.high
-.cramer <- function(tab) {
+.cramers_v <- function(tab) {
# convert to flat table
if (!inherits(tab, "ftable")) tab <- stats::ftable(tab)
sqrt(phi(tab)^2 / min(dim(tab) - 1))
diff --git a/R/cv_error.R b/R/cv_error.R
index 9ab6f875..c4880b12 100644
--- a/R/cv_error.R
+++ b/R/cv_error.R
@@ -34,13 +34,13 @@
#' @importFrom modelr crossv_kfold
#' @importFrom dplyr mutate summarise
#' @importFrom purrr map map2 map_dbl map_df
-#' @importFrom broom augment
#' @importFrom tidyr unnest
#' @importFrom rlang .data
#' @importFrom insight find_response
#' @importFrom performance rmse
#' @export
cv_error <- function(data, formula, k = 5) {
+ insight::check_if_installed("broom")
# compute cross validation data
cv_data <- data %>%
diff --git a/R/find_beta.R b/R/find_beta.R
index 7e45408b..1ae733cc 100644
--- a/R/find_beta.R
+++ b/R/find_beta.R
@@ -1,10 +1,10 @@
#' @title Determining distribution parameters
#' @name find_beta
-#' @description \code{find_beta()}, \code{find_normal()} and \code{find_cauchy()} find the
+#' @description `find_beta()`, `find_normal()` and `find_cauchy()` find the
#' shape, mean and standard deviation resp. the location and scale parameters
#' to describe the beta, normal or cauchy distribution, based on two
-#' percentiles. \code{find_beta2()} finds the shape parameters for a Beta
+#' percentiles. `find_beta2()` finds the shape parameters for a Beta
#' distribution, based on a probability value and its standard error
#' or confidence intervals.
@@ -14,14 +14,14 @@
#' @param p2 Probability of the second percentile.
#' @param x Numeric, a probability value between 0 and 1. Typically indicates
#' a prevalence rate of an outcome of interest; Or an integer value
-#' with the number of observed events. In this case, specify \code{n}
+#' with the number of observed events. In this case, specify `n`
#' to indicate the toral number of observations.
-#' @param se The standard error of \code{x}. Either \code{se} or \code{ci} must
+#' @param se The standard error of `x`. Either `se` or `ci` must
#' be specified.
-#' @param ci The upper limit of the confidence interval of \code{x}. Either
-#' \code{se} or \code{ci} must be specified.
+#' @param ci The upper limit of the confidence interval of `x`. Either
+#' `se` or `ci` must be specified.
#' @param n Numeric, number of total observations. Needs to be specified, if
-#' \code{x} is an integer (number of observed events), and no
+#' `x` is an integer (number of observed events), and no
#' probability. See 'Examples'.
#' @return A list of length two, with the two distribution parameters than can
@@ -29,21 +29,20 @@
#' the shape for the given input parameters.
#' @details These functions can be used to find parameter for various distributions,
-#' to define prior probabilities for Bayesian analyses. \code{x1},
-#' \code{p1}, \code{x2} and \code{p2} are parameters that describe two
-#' quantiles. Given this knowledge, the distribution parameters are
-#' returned. \cr \cr
-#' Use \code{find_beta2()}, if the known parameters are, e.g. a prevalence
-#' rate or similar probability, and its standard deviation or confidence
-#' interval. In this case. \code{x} should be a probability,
-#' for example a prevalence rate of a certain event. \code{se} then
-#' needs to be the standard error for this probability. Alternatively,
-#' \code{ci} can be specified, which should indicate the upper limit
-#' of the confidence interval od the probability (prevalence rate) \code{x}.
-#' If the number of events out of a total number of trials is known
-#' (e.g. 12 heads out of 30 coin tosses), \code{x} can also be the number
-#' of observed events, while \code{n} indicates the total amount of trials
-#' (in the above example, the function call would be: \code{find_beta2(x = 12, n = 30)}).
+#' to define prior probabilities for Bayesian analyses. `x1`, `p1`, `x2` and
+#' `p2` are parameters that describe two quantiles. Given this knowledge, the
+#' distribution parameters are returned.
+#' Use `find_beta2()`, if the known parameters are, e.g. a prevalence rate or
+#' similar probability, and its standard deviation or confidence interval. In
+#' this case, `x` should be a probability, for example a prevalence rate of a
+#' certain event. `se` then needs to be the standard error for this probability.
+#' Alternatively, `ci` can be specified, which should indicate the upper limit
+#' of the confidence interval of the probability (prevalence rate) `x`. If the
+#' number of events out of a total number of trials is known (e.g. 12 heads out
+#' of 30 coin tosses), `x` can also be the number of observed events, while `n`
+#' indicates the total amount of trials (in the above example, the function
+#' call would be: `find_beta2(x = 12, n = 30)`).
#' @references Cook JD. Determining distribution parameters from quantiles. 2010: Department of Biostatistics, Texas (\href{https://www.johndcook.com/quantiles_parameters.pdf}{PDF})
@@ -79,14 +78,12 @@
#' shapes <- find_beta2(x = 3, n = 20)
#' curve(dbeta(x, shapes[[1]], shapes[[2]]))
-#' @importFrom stats pbeta approx
-#' @importFrom purrr map_dbl
#' @export
find_beta <- function(x1, p1, x2, p2) {
logK <- seq(-5, 10, length = 200)
K <- exp(logK)
- m <- purrr::map_dbl(K, ~ betaprior(.x, x1, p1))
+ m <- unlist(lapply(K, betaprior, x = x1, p = p1))
prob2 <- stats::pbeta(x2, K * m, K * (1 - m))
ind <- ((prob2 > 0) & (prob2 < 1))
@@ -127,13 +124,13 @@ betaprior <- function(K, x, p) {
find_beta2 <- function(x, se, ci, n) {
# check if all required arguments are given
if (missing(se) && missing(ci) && missing(n)) {
- stop("Either `se` or `ci`, or `n` must be specified.", call. = F)
+ insight::format_error("Either `se` or `ci`, or `n` must be specified.")
# for number of observations, compute variance of beta distribution
if (!missing(n)) {
if (!is.integer(x) && x < 1)
- stop("If `n` is given, x` must be an integer value greater than 0.", call. = F)
+ insight::format_error("If `n` is given, x` must be an integer value greater than 0.")
# compute 2 SD from beta variance
bvar <- 2 * sqrt((x * n) / ((x + n)^2 * (x + n + 1)))
@@ -164,7 +161,6 @@ find_beta2 <- function(x, se, ci, n) {
-#' @importFrom stats qcauchy
#' @rdname find_beta
#' @export
find_cauchy <- function(x1, p1, x2, p2) {
@@ -177,7 +173,6 @@ find_cauchy <- function(x1, p1, x2, p2) {
-#' @importFrom stats qnorm
#' @rdname find_beta
#' @export
find_normal <- function(x1, p1, x2, p2) {
diff --git a/R/grpmean.R b/R/grpmean.R
deleted file mode 100644
index ab2eb2d1..00000000
--- a/R/grpmean.R
+++ /dev/null
@@ -1,335 +0,0 @@
-#' @title Summary of mean values by group
-#' @name means_by_group
-#' @description Computes mean, sd and se for each sub-group (indicated by \code{grp})
-#' of \code{dv}.
-#' @param x A (grouped) data frame.
-#' @param dv Name of the dependent variable, for which the mean value, grouped
-#' by \code{grp}, is computed.
-#' @param grp Factor with the cross-classifying variable, where \code{dv} is
-#' grouped into the categories represented by \code{grp}. Numeric vectors
-#' are coerced to factors.
-#' @param weights Name of variable in \code{x} that indicated the vector of
-#' weights that will be applied to weight all observations. Default is
-#' \code{NULL}, so no weights are used.
-#' @param digits Numeric, amount of digits after decimal point when rounding
-#' estimates and values.
-#' @param file Destination file, if the output should be saved as file.
-#' Only used when \code{out} is not \code{"txt"}.
-#' @param encoding Character vector, indicating the charset encoding used
-#' for variable and value labels. Default is \code{"UTF-8"}. Only used
-#' when \code{out} is not \code{"txt"}.
-#' @param out Character vector, indicating whether the results should be printed
-#' to console (\code{out = "txt"}) or as HTML-table in the viewer-pane
-#' (\code{out = "viewer"}) or browser (\code{out = "browser"}), of if the
-#' results should be plotted (\code{out = "plot"}, only applies to certain
-#' functions). May be abbreviated.
-#' @return For non-grouped data frames, \code{means_by_group()} returns a data frame with
-#' following columns: \code{term}, \code{mean}, \code{N}, \code{std.dev},
-#' \code{std.error} and \code{p.value}. For grouped data frames, returns
-#' a list of such data frames.
-#' @details This function performs a One-Way-Anova with \code{dv} as dependent
-#' and \code{grp} as independent variable, by calling
-#' \code{lm(count ~ as.factor(grp))}. Then \code{\link[emmeans]{contrast}}
-#' is called to get p-values for each sub-group. P-values indicate whether
-#' each group-mean is significantly different from the total mean.
-#' @examples
-#' data(efc)
-#' means_by_group(efc, c12hour, e42dep)
-#' data(iris)
-#' means_by_group(iris, Sepal.Width, Species)
-#' # also works for grouped data frames
-#' if (require("dplyr")) {
-#' efc %>%
-#' group_by(c172code) %>%
-#' means_by_group(c12hour, e42dep)
-#' }
-#' # weighting
-#' efc$weight <- abs(rnorm(n = nrow(efc), mean = 1, sd = .5))
-#' means_by_group(efc, c12hour, e42dep, weights = weight)
-#' @importFrom sjlabelled get_label drop_labels get_labels
-#' @importFrom stats lm na.omit sd weighted.mean
-#' @importFrom purrr map_chr map_df
-#' @importFrom sjmisc to_value is_empty
-#' @importFrom rlang enquo .data quo_name
-#' @export
-means_by_group <- function(x,
- dv,
- grp,
- weights = NULL,
- digits = 2,
- out = c("txt", "viewer", "browser"),
- encoding = "UTF-8",
- file = NULL) {
- out <- match.arg(out)
- if (out != "txt" && !requireNamespace("sjPlot", quietly = TRUE)) {
- message("Package `sjPlot` needs to be loaded to print HTML tables.")
- out <- "txt"
- }
- # create quosures
- grp.name <- rlang::quo_name(rlang::enquo(grp))
- dv.name <- rlang::quo_name(rlang::enquo(dv))
- # weights need extra checking, might be NULL
- if (!missing(weights)) {
- .weights <- try(rlang::quo_name(rlang::enquo(weights)), silent = TRUE)
- if (inherits(.weights, "try-error")) .weights <- NULL
- w.string <- try(eval(weights), silent = TRUE)
- if (!inherits(w.string, "try-error") && !is.null(w.string) && is.character(w.string)) .weights <- w.string
- if (sjmisc::is_empty(.weights) || .weights == "NULL") .weights <- NULL
- } else
- .weights <- NULL
- # create string with variable names
- vars <- c(grp.name, dv.name, .weights)
- # get data
- x <- suppressMessages(dplyr::select(x, !! vars))
- # set value and row labels
- varGrpLabel <- sjlabelled::get_label(x[[grp.name]], def.value = grp.name)
- varCountLabel <- sjlabelled::get_label(x[[dv.name]], def.value = dv.name)
- # first, drop unused labels
- x[[grp.name]] <- sjlabelled::drop_labels(x[[grp.name]], drop.na = TRUE)
- # now get valid value labels
- value.labels <- sjlabelled::get_labels(
- x[[grp.name]], attr.only = F, values = "n", non.labelled = TRUE
- )
- # return values
- dataframes <- list()
- # do we have a grouped data frame?
- if (inherits(x, "grouped_df")) {
- # get grouped data
- grps <- get_grouped_data(x)
- # now plot everything
- for (i in seq_len(nrow(grps))) {
- # copy back labels to grouped data frame
- tmp <- sjlabelled::copy_labels(grps$data[[i]], x)
- # get grouped means table
- dummy <- means_by_group_helper(
- x = tmp,
- dv = dv.name,
- grp = grp.name,
- weight.by = .weights,
- value.labels = value.labels,
- varCountLabel = varCountLabel,
- varGrpLabel = varGrpLabel
- )
- attr(dummy, "group") <- get_grouped_title(x, grps, i, sep = "\n")
- # save data frame for return value
- dataframes[[length(dataframes) + 1]] <- dummy
- }
- # add class-attr for print-method()
- if (out == "txt")
- class(dataframes) <- c("sj_grpmeans", "list")
- else
- class(dataframes) <- c("sjt_grpmeans", "list")
- } else {
- dataframes <- means_by_group_helper(
- x = x,
- dv = dv.name,
- grp = grp.name,
- weight.by = .weights,
- value.labels = value.labels,
- varCountLabel = varCountLabel,
- varGrpLabel = varGrpLabel
- )
- # add class-attr for print-method()
- if (out == "txt")
- class(dataframes) <- c("sj_grpmean", class(dataframes))
- else
- class(dataframes) <- c("sjt_grpmean", class(dataframes))
- }
- # save how to print output
- attr(dataframes, "print") <- out
- attr(dataframes, "encoding") <- encoding
- attr(dataframes, "file") <- file
- attr(dataframes, "digits") <- digits
- dataframes
-#' @importFrom stats pf lm weighted.mean na.omit sd
-#' @importFrom sjmisc to_value add_variables
-#' @importFrom emmeans emmeans contrast
-#' @importFrom dplyr pull select n_distinct
-#' @importFrom purrr map_chr
-#' @importFrom rlang .data
-means_by_group_helper <- function(x, dv, grp, weight.by, value.labels, varCountLabel, varGrpLabel) {
- # copy vectors from data frame
- dv <- x[[dv]]
- grp <- x[[grp]]
- if (!is.null(weight.by))
- weight.by <- x[[weight.by]]
- else
- weight.by <- 1
- # convert values to numeric
- dv <- sjmisc::to_value(dv)
- # create data frame, for emmeans
- mydf <- stats::na.omit(data.frame(
- dv = dv,
- grp = as.factor(grp),
- weight.by = weight.by
- ))
- # compute anova statistics for mean table
- fit <- stats::lm(dv ~ grp, weights = weight.by, data = mydf)
- # p-values of contrast-means
- means.p <- fit %>%
- emmeans::emmeans(specs = "grp") %>%
- emmeans::contrast(method = "eff") %>%
- summary() %>%
- dplyr::pull("p.value")
- ## TODO
- # efc %>%
- # group_by(c172code, c161sex) %>%
- # means_by_group(c12hour, e42dep)
- # check if value labels length matches group count
- if (dplyr::n_distinct(mydf$grp) != length(value.labels)) {
- # get unique factor levels and check if these are numeric.
- # if so, we match the values from value labels and the remaining
- # factor levels, so we get the correct value labels for printing
- nl <- unique(mydf$grp)
- if (sjmisc::is_num_fac(nl))
- value.labels <- value.labels[names(value.labels) %in% levels(nl)]
- else
- value.labels <- nl
- }
- # create summary
- dat <- mydf %>%
- dplyr::group_by(.data$grp) %>%
- summarise(
- mean = stats::weighted.mean(.data$dv, w = .data$weight.by, na.rm = TRUE),
- N = round(sum(.data$weight.by)),
- std.dev = weighted_sd(.data$dv, .data$weight.by),
- std.error = weighted_se(.data$dv, .data$weight.by)
- ) %>%
- mutate(p.value = means.p) %>%
- dplyr::select(-.data$grp)
- # finally, add total-row
- dat <- dplyr::bind_rows(
- dat,
- data_frame(
- mean = stats::weighted.mean(mydf$dv, w = mydf$weight.by, na.rm = TRUE),
- N = nrow(mydf),
- std.dev = weighted_sd(mydf$dv, mydf$weight.by),
- std.error = weighted_se(mydf$dv, mydf$weight.by),
- p.value = NA
- )
- )
- # add row labels
- dat <- sjmisc::add_variables(
- dat,
- term = c(unname(value.labels), "Total"),
- .after = -1
- )
- # get anova statistics for mean table
- sum.fit <- summary(fit)
- # r-squared values
- r2 <- sum.fit$r.squared
- r2.adj <- sum.fit$adj.r.squared
- # F-statistics
- fstat <- sum.fit$fstatistic
- pval <- stats::pf(fstat[1], fstat[2], fstat[3], lower.tail = F)
- # copy as attributes
- attr(dat, "r2") <- r2
- attr(dat, "adj.r2") <- r2.adj
- attr(dat, "fstat") <- fstat[1]
- attr(dat, "p.value") <- pval
- attr(dat, "dv.label") <- varCountLabel
- attr(dat, "grp.label") <- varGrpLabel
- dat
-get_grouped_title <- function(x, grps, i, sep = "\n") {
- # create title for first grouping level
- tp <- get_title_part(x, grps, 1, i)
- title <- sprintf("%s: %s", tp[1], tp[2])
- # do we have another groupng variable?
- if (length(dplyr::group_vars(x)) > 1) {
- tp <- get_title_part(x, grps, 2, i)
- title <- sprintf("%s%s%s: %s", title, sep, tp[1], tp[2])
- }
- # return title
- title
-get_title_part <- function(x, grps, level, i) {
- # prepare title for group
- var.name <- colnames(grps)[level]
- # get values from value labels
- vals <- sjlabelled::get_values(x[[var.name]])
- # if we have no value labels, get values directly
- if (is.null(vals)) {
- vals <- unique(x[[var.name]])
- lab.pos <- i
- } else {
- # find position of value labels for current group
- lab.pos <- which(vals == grps[[var.name]][i])
- }
- # get variable and value labels
- t1 <- sjlabelled::get_label(x[[var.name]], def.value = var.name)
- t2 <- sjlabelled::get_labels(x[[var.name]])[lab.pos]
- # if we have no value label, use value instead
- if (is.null(t2)) t2 <- vals[lab.pos]
- # generate title
- c(t1, t2)
-#' @rdname means_by_group
-#' @export
-grpmean <- means_by_group
diff --git a/R/helpfunctions.R b/R/helpfunctions.R
index 0785b281..ce36f369 100644
--- a/R/helpfunctions.R
+++ b/R/helpfunctions.R
@@ -34,7 +34,7 @@ get_glm_family <- function(fit) {
# create logical for family
binom_fam <- fitfam %in% c("binomial", "quasibinomial")
poisson_fam <- fitfam %in% c("poisson", "quasipoisson") ||
- sjmisc::str_contains(fitfam, "negative binomial", ignore.case = T)
+ sjmisc::str_contains(fitfam, "negative binomial", ignore.case = TRUE)
list(is_bin = binom_fam, is_pois = poisson_fam, is_logit = logit_link)
@@ -64,8 +64,45 @@ get_grouped_data <- function(x) {
.compact_character <- function(x) {
x[!sapply(x, function(i) is.null(i) || nchar(i) == 0 || is.na(i) || any(i == "NULL", na.rm = TRUE))]
+# Replaces spelled-out names of statistics (e.g. "Chi-squared", "Phi", "Fei")
+# in `x` with their Unicode symbols, if `.unicode_symbols()` returns TRUE;
+# otherwise `x` is returned unchanged. Matching is case-insensitive.
+.format_symbols <- function(x) {
+  if (.unicode_symbols()) {
+    x <- gsub("Delta", "\u0394", x, ignore.case = TRUE)
+    x <- gsub("Phi", "\u03D5", x, ignore.case = TRUE)
+    # "Theta" has to be handled before "Eta": the case-insensitive "Eta"
+    # pattern would otherwise consume the tail of "Theta"
+    x <- gsub("Theta", "\u03B8", x, ignore.case = TRUE)
+    x <- gsub("Eta", "\u03B7", x, ignore.case = TRUE)
+    x <- gsub("Epsilon", "\u03b5", x, ignore.case = TRUE)
+    # omega is U+03C9 (U+03B5 is epsilon)
+    x <- gsub("Omega", "\u03c9", x, ignore.case = TRUE)
+    x <- gsub("R2", "R\u00b2", x, ignore.case = TRUE)
+    # the longer "Chi2" / "Chi-squared" patterns must precede plain "Chi"
+    x <- gsub("Chi2", "\u03C7\u00b2", x, ignore.case = TRUE)
+    x <- gsub("Chi-squared", "\u03C7\u00b2", x, ignore.case = TRUE)
+    x <- gsub("Chi", "\u03C7", x, ignore.case = TRUE)
+    x <- gsub("Sigma", "\u03C3", x, ignore.case = TRUE)
+    x <- gsub("Rho", "\u03C1", x, ignore.case = TRUE)
+    x <- gsub("Mu", "\u03BC", x, ignore.case = TRUE)
+    x <- gsub("Fei", "\u05E4\u200E", x, ignore.case = TRUE)
+  }
+  x
+# Returns TRUE if Unicode symbols can be printed: the session must use a UTF-8
+# locale and R must be at least 4.2 on Windows, or at least 4.0 elsewhere.
+.unicode_symbols <- function() {
+  win_os <- tryCatch(
+    {
+      si <- Sys.info()
+      if (is.null(si["sysname"])) {
+        # no system information available: do not assume Windows
+        FALSE
+      } else {
+        si["sysname"] == "Windows" || startsWith(R.version$os, "mingw")
+      }
+    },
+    error = function(e) {
+      FALSE
+    }
+  )
+  # collapse NULL / NA to FALSE, so the `&&` below always gets a valid logical
+  win_os <- isTRUE(win_os)
+  l10n_info()[["UTF-8"]] && ((win_os && getRversion() >= "4.2") || (!win_os && getRversion() >= "4.0"))
diff --git a/R/kruskal_wallis_test.R b/R/kruskal_wallis_test.R
new file mode 100644
index 00000000..a66e3c15
--- /dev/null
+++ b/R/kruskal_wallis_test.R
@@ -0,0 +1,188 @@
+#' @title Kruskal-Wallis test
+#' @name kruskal_wallis_test
+#' @description This function performs a Kruskal-Wallis rank sum test, to test
+#' the null hypothesis that the population medians of all groups are equal.
+#' The alternative is that at least one group differs from the others.
+#' @inheritParams mann_whitney_test
+#' @return A data frame with test results.
+#' @details The function simply is a wrapper around [`kruskal.test()`]. The
+#' weighted version of the Kruskal-Wallis test is based on the **survey** package,
+#' using [`survey::svyranktest()`].
+#' @examples
+#' data(efc)
+#' # Kruskal-Wallis test for elder's age by education
+#' kruskal_wallis_test(efc, "e17age", by = "c172code")
+#' # when data is in wide-format, specify all relevant continuous
+#' # variables in `select` and omit `by`
+#' set.seed(123)
+#' wide_data <- data.frame(
+#' scale1 = runif(20),
+#' scale2 = runif(20),
+#' scale3 = runif(20)
+#' )
+#' kruskal_wallis_test(wide_data, select = c("scale1", "scale2", "scale3"))
+#' # same as if we had data in long format, with grouping variable
+#' long_data <- data.frame(
+#' scales = c(wide_data$scale1, wide_data$scale2, wide_data$scale3),
+#' groups = rep(c("A", "B", "C"), each = 20)
+#' )
+#' kruskal_wallis_test(long_data, select = "scales", by = "groups")
+#' @export
+kruskal_wallis_test <- function(data,
+                                select = NULL,
+                                by = NULL,
+                                weights = NULL) {
+  insight::check_if_installed("datawizard")
+  # sanity checks
+  .sanitize_htest_input(data, select, by, weights)
+  # does select indicate more than one variable?
+  if (length(select) > 1) {
+    if (!is.null(by)) {
+      insight::format_error("If `select` specifies more than one variable, `by` must be `NULL`.")
+    }
+    # the reshaping below only keeps the columns named in `select`, so the
+    # weighting variable would be lost - fail early with an informative message
+    if (!is.null(weights)) {
+      insight::format_error("Argument `weights` is not supported if `select` specifies more than one variable. Please provide the data in long format and use `by`.") # nolint
+    }
+    # we convert the data into long format, and create a grouping variable
+    data <- datawizard::data_to_long(data[select], names_to = "group", values_to = "scale")
+    # re-use the first two names in `select` as column names of the reshaped
+    # data, so that `select` and `by` can be used for extraction below
+    by <- select[2]
+    select <- select[1]
+    # after converting to long, we have the "grouping" variable first in the data
+    colnames(data) <- c(by, select)
+  }
+  # get data
+  dv <- data[[select]]
+  grp <- data[[by]]
+  # coerce to factor
+  grp <- datawizard::to_factor(grp)
+  # at least two groups are required
+  if (insight::n_unique(grp) < 2) {
+    insight::format_error("At least two groups are required, i.e. data must have at least two unique levels in `by` for `kruskal_wallis_test()`.") # nolint
+  }
+  # dispatch to the unweighted or weighted implementation
+  if (is.null(weights)) {
+    .calculate_kw(dv, grp)
+  } else {
+    .calculate_weighted_kw(dv, grp, data[[weights]])
+  }
+# Kruskal-Wallis-Test --------------------------------------------
+# Runs the unweighted test for `dv` grouped by `grp` and returns a one-row
+# data frame of class "sj_htest_kw" with test statistic, degrees of freedom
+# and p-value. Group sizes, the method name and the weighting flag are
+# attached as attributes.
+.calculate_kw <- function(dv, grp, paired = FALSE) {
+  # outcome and grouping variable side by side
+  wcdat <- data.frame(dv, grp)
+  # Friedman test for paired data, Kruskal-Wallis test otherwise
+  if (isTRUE(paired) || (!isFALSE(paired) && paired)) {
+    wt <- stats::friedman.test(table(wcdat))
+    method_name <- "friedman"
+  } else {
+    wt <- stats::kruskal.test(dv ~ grp, data = wcdat)
+    method_name <- "kruskal"
+  }
+  # count the observations in each non-missing group
+  group_values <- stats::na.omit(unique(grp))
+  count_group <- function(g) sum(grp == g, na.rm = TRUE)
+  group_sizes <- vapply(group_values, count_group, numeric(1))
+  # collect the test results in a single row
+  out <- data.frame(
+    data = wt$data.name,
+    Chi2 = wt$statistic,
+    df = wt$parameter,
+    p = as.numeric(wt$p.value),
+    stringsAsFactors = FALSE
+  )
+  # meta information used by the print method
+  attr(out, "n_groups") <- group_sizes
+  attr(out, "method") <- method_name
+  attr(out, "weighted") <- FALSE
+  class(out) <- c("sj_htest_kw", "data.frame")
+  out
+# Weighted Kruskal-Wallis-Test ------------------------------------------------
+# Runs a weighted Kruskal-Wallis test via `survey::svyranktest()` and returns
+# a one-row data frame of class "sj_htest_kw". `dv`, `grp` and `weights` are
+# vectors of equal length; rows with missing values are removed. Paired data
+# is not supported and raises an error.
+.calculate_weighted_kw <- function(dv, grp, weights, paired = FALSE) {
+  # check if pkg survey is available
+  insight::check_if_installed("survey")
+  # a weighted test for paired data is not implemented; fail with a clear
+  # message instead of a "object 'result' not found" error further below
+  if (paired) {
+    insight::format_error("Weighted tests for paired data are not supported yet.")
+  }
+  dat <- stats::na.omit(data.frame(dv, grp, weights))
+  colnames(dat) <- c("x", "g", "w")
+  # weighted number of observations per group; note that the grouping column
+  # was renamed to "g" above
+  n_groups <- vapply(stats::na.omit(unique(grp)), function(g) {
+    sum(dat$w[dat$g == g], na.rm = TRUE)
+  }, numeric(1))
+  design <- survey::svydesign(ids = ~0, data = dat, weights = ~w)
+  result <- survey::svyranktest(formula = x ~ g, design, test = "KruskalWallis")
+  out <- data.frame(
+    # a single label for the single result row; pasting the raw `dv` and `grp`
+    # vectors would create one row per observation
+    data = "dv by grp",
+    Chi2 = result$statistic,
+    df = result$parameter,
+    p = as.numeric(result$p.value),
+    stringsAsFactors = FALSE
+  )
+  attr(out, "n_groups") <- n_groups
+  attr(out, "method") <- "kruskal"
+  attr(out, "weighted") <- TRUE
+  class(out) <- c("sj_htest_kw", "data.frame")
+  out
+# methods ---------------------------------------------------------------------
+#' @export
+print.sj_htest_kw <- function(x, ...) {
+  insight::check_if_installed("datawizard")
+  # fetch attributes
+  n_groups <- attributes(x)$n_groups
+  weighted <- attributes(x)$weighted
+  method <- attributes(x)$method
+  # suffix for the header of weighted tests
+  if (weighted) {
+    weight_string <- " (weighted)"
+  } else {
+    weight_string <- ""
+  }
+  # header
+  if (identical(method, "kruskal")) {
+    insight::print_color(sprintf("# Kruskal-Wallis test%s\n\n", weight_string), "blue")
+  } else {
+    insight::print_color(sprintf("# Friedman test%s\n\n", weight_string), "blue")
+  }
+  # data info
+  insight::print_color(
+    sprintf(
+      " Data: %s (%i groups, n = %s)\n",
+      x$data, length(n_groups), datawizard::text_concatenate(n_groups)
+    ), "cyan"
+  )
+  # test statistic, shown with a Unicode symbol where supported
+  stat_symbol <- .format_symbols("Chi2")
+  cat(sprintf(
+    "\n %s = %.3f, df = %i, %s\n\n",
+    stat_symbol, x$Chi2, round(x$df), insight::format_p(x$p)
+  ))
+  # print methods return their argument invisibly
+  invisible(x)
diff --git a/R/mann_whitney_test.R b/R/mann_whitney_test.R
new file mode 100644
index 00000000..d16f1788
--- /dev/null
+++ b/R/mann_whitney_test.R
@@ -0,0 +1,378 @@
+#' @title Mann-Whitney-Test
+#' @name mann_whitney_test
+#' @description This function performs a Mann-Whitney-Test (or Wilcoxon rank
+#' sum test) for _unpaired_ samples.
+#' A Mann-Whitney-Test is a non-parametric test for the null hypothesis that two
+#' independent samples have identical continuous distributions. It can be used
+#' when the two continuous variables are not normally distributed.
+#' @param data A data frame.
+#' @param select Name of the dependent variable (as string) to be used for the
+#' test. `select` can also be a character vector, specifying the names of
+#' multiple continuous variables. In this case, `by` must be `NULL` and the
+#' variables specified in `select` are used to compute the test. This can be
+#' useful if the data is in wide-format and no grouping variable is available.
+#' @param by Name of the grouping variable to be used for the test. If `by` is
+#' not a factor, it will be coerced to a factor. For `chi_squared_test()`, if
+#' `probabilities` is provided, `by` must be `NULL`.
+#' @param weights Name of an (optional) weighting variable to be used for the test.
+#' @param distribution Indicates how the null distribution of the test statistic
+#' should be computed. May be one of `"exact"`, `"approximate"` or `"asymptotic"`
+#' (default). See [`coin::wilcox_test()`] for details.
+#' @return A data frame with test results. The function returns p and Z-values
+#' as well as effect size r and group-rank-means.
+#' @details This function is based on [`wilcox.test()`] and [`coin::wilcox_test()`]
+#' (the latter to extract effect sizes). The weighted version of the test is
+#' based on [`survey::svyranktest()`].
+#' Interpretation of the effect size **r**, as a rule-of-thumb:
+#' - small effect >= 0.1
+#' - medium effect >= 0.3
+#' - large effect >= 0.5
+#' **r** is calculated as:
+#' ```
+#' r = |Z| / sqrt(n1 + n2)
+#' ```
+#' @examples
+#' data(efc)
+#' # Mann-Whitney-U-Tests for elder's age by elder's sex.
+#' mann_whitney_test(efc, "e17age", by = "e16sex")
+#' # when data is in wide-format, specify all relevant continuous
+#' # variables in `select` and omit `by`
+#' set.seed(123)
+#' wide_data <- data.frame(scale1 = runif(20), scale2 = runif(20))
+#' mann_whitney_test(wide_data, select = c("scale1", "scale2"))
+#' # same as if we had data in long format, with grouping variable
+#' long_data <- data.frame(
+#' scales = c(wide_data$scale1, wide_data$scale2),
+#' groups = rep(c("A", "B"), each = 20)
+#' )
+#' mann_whitney_test(long_data, select = "scales", by = "groups")
+#' @export
+mann_whitney_test <- function(data,
+                              select = NULL,
+                              by = NULL,
+                              weights = NULL,
+                              distribution = "asymptotic") {
+  insight::check_if_installed("datawizard")
+  # sanity checks
+  .sanitize_htest_input(data, select, by, weights)
+  # does select indicate more than one variable?
+  if (length(select) > 1) {
+    # sanity check - may only specify two variable names
+    if (length(select) > 2) {
+      insight::format_error("You may only specify two variables for Mann-Whitney test.")
+    }
+    if (!is.null(by)) {
+      insight::format_error("If `select` specifies more than one variable, `by` must be `NULL`.")
+    }
+    # the reshaping below only keeps the columns named in `select`, so the
+    # weighting variable would be lost - fail early with an informative message
+    if (!is.null(weights)) {
+      insight::format_error("Argument `weights` is not supported if `select` specifies more than one variable. Please provide the data in long format and use `by`.") # nolint
+    }
+    # we convert the data into long format, and create a grouping variable
+    data <- datawizard::data_to_long(data[select], names_to = "group", values_to = "scale")
+    # re-use the two names in `select` as column names of the reshaped data,
+    # so that `select` and `by` can be used for extraction below
+    by <- select[2]
+    select <- select[1]
+    # after converting to long, we have the "grouping" variable first in the data
+    colnames(data) <- c(by, select)
+  }
+  # get data
+  dv <- data[[select]]
+  grp <- data[[by]]
+  # coerce to factor
+  grp <- datawizard::to_factor(grp)
+  # only two groups allowed
+  if (insight::n_unique(grp) > 2) {
+    insight::format_error("Only two groups are allowed for Mann-Whitney test. Please use `kruskal_wallis_test()` for more than two groups.") # nolint
+  }
+  # value labels: prefer a "labels" attribute, fall back to factor levels
+  group_labels <- names(attr(data[[by]], "labels", exact = TRUE))
+  if (is.null(group_labels)) {
+    group_labels <- levels(droplevels(grp))
+  }
+  # dispatch to the unweighted or weighted implementation
+  if (is.null(weights)) {
+    .calculate_mwu(dv, grp, distribution, group_labels)
+  } else {
+    .calculate_weighted_mwu(dv, grp, data[[weights]], group_labels)
+  }
+# Mann-Whitney-Test for two groups --------------------------------------------
+# Runs the unweighted Mann-Whitney test for `dv` by the two-level factor `grp`
+# and returns a one-row data frame of class "sj_htest_mwu" with the difference
+# in rank means, U, W, Z, the effect size r and the p-value. Rank means, group
+# sizes and labels are attached as attributes.
+.calculate_mwu <- function(dv, grp, distribution, group_labels) {
+  insight::check_if_installed("coin")
+  # prepare data
+  wcdat <- data.frame(dv, grp)
+  # perform wilcox test
+  wt <- coin::wilcox_test(dv ~ grp, data = wcdat, distribution = distribution)
+  # only consider levels that actually occur in the data
+  group_levels <- levels(droplevels(grp))
+  # compute statistics
+  u <- as.numeric(coin::statistic(wt, type = "linear"))
+  z <- as.numeric(coin::statistic(wt, type = "standardized"))
+  p <- coin::pvalue(wt)
+  w <- suppressWarnings(stats::wilcox.test(dv ~ grp, data = wcdat)$statistic)
+  # rank means must be based on the ranks of the pooled sample. Ranking each
+  # group separately always yields a mean rank of (n + 1) / 2
+  complete <- stats::complete.cases(wcdat)
+  pooled_ranks <- rank(dv[complete])
+  in_grp1 <- grp[complete] == group_levels[1]
+  in_grp2 <- grp[complete] == group_levels[2]
+  rank_mean_1 <- mean(pooled_ranks[in_grp1])
+  rank_mean_2 <- mean(pooled_ranks[in_grp2])
+  # compute n for each group (complete cases only)
+  n_grp1 <- sum(in_grp1)
+  n_grp2 <- sum(in_grp2)
+  # effect size r = |Z| / sqrt(n1 + n2); missing values do not count
+  r <- abs(z / sqrt(n_grp1 + n_grp2))
+  out <- data.frame(
+    group1 = group_levels[1],
+    group2 = group_levels[2],
+    estimate = rank_mean_1 - rank_mean_2,
+    u = u,
+    w = w,
+    z = z,
+    r = r,
+    p = as.numeric(p)
+  )
+  attr(out, "rank_means") <- stats::setNames(
+    c(rank_mean_1, rank_mean_2),
+    c("Mean Group 1", "Mean Group 2")
+  )
+  attr(out, "n_groups") <- stats::setNames(
+    c(n_grp1, n_grp2),
+    c("N Group 1", "N Group 2")
+  )
+  attr(out, "group_labels") <- group_labels
+  attr(out, "method") <- "wilcoxon"
+  attr(out, "weighted") <- FALSE
+  class(out) <- c("sj_htest_mwu", "data.frame")
+  out
+# Weighted Mann-Whitney-Test for two groups ----------------------------------
+# Runs the weighted Mann-Whitney test via `survey::svyranktest()` and returns
+# a one-row data frame of class "sj_htest_mwu" with estimate, Z, the effect
+# size r and the p-value. Weighted rank means, weighted group sizes and labels
+# are attached as attributes. Rows with missing values are removed.
+.calculate_weighted_mwu <- function(dv, grp, weights, group_labels) {
+  # check if pkg survey is available
+  insight::check_if_installed("survey")
+  dat <- stats::na.omit(data.frame(dv, grp, weights))
+  colnames(dat) <- c("x", "g", "w")
+  design <- survey::svydesign(ids = ~0, data = dat, weights = ~w)
+  result <- survey::svyranktest(formula = x ~ g, design, test = "wilcoxon")
+  # for rank mean
+  group_levels <- levels(droplevels(grp))
+  # ranks must be computed on the pooled sample before splitting into groups.
+  # Ranking each group separately makes the rank means uninformative
+  dat$rank_x <- rank(dat$x)
+  # subgroups
+  dat_gr1 <- dat[dat$g == group_levels[1], ]
+  dat_gr2 <- dat[dat$g == group_levels[2], ]
+  # weighted rank means
+  design_mean1 <- survey::svydesign(
+    ids = ~0,
+    data = dat_gr1,
+    weights = ~w
+  )
+  rank_mean_1 <- survey::svymean(~rank_x, design_mean1)
+  design_mean2 <- survey::svydesign(
+    ids = ~0,
+    data = dat_gr2,
+    weights = ~w
+  )
+  rank_mean_2 <- survey::svymean(~rank_x, design_mean2)
+  # group Ns (rounded sum of weights)
+  n_grp1 <- round(sum(dat_gr1$w))
+  n_grp2 <- round(sum(dat_gr2$w))
+  # statistics and effect sizes
+  z <- result$statistic
+  r <- abs(z / sqrt(sum(n_grp1, n_grp2)))
+  out <- data.frame(
+    group1 = group_levels[1],
+    group2 = group_levels[2],
+    estimate = result$estimate,
+    z = z,
+    r = r,
+    p = as.numeric(result$p.value)
+  )
+  attr(out, "rank_means") <- stats::setNames(
+    c(rank_mean_1, rank_mean_2),
+    c("Mean Group 1", "Mean Group 2")
+  )
+  attr(out, "n_groups") <- stats::setNames(
+    c(n_grp1, n_grp2),
+    c("N Group 1", "N Group 2")
+  )
+  attr(out, "group_labels") <- group_labels
+  # same attribute as set by the unweighted version
+  attr(out, "method") <- "wilcoxon"
+  attr(out, "weighted") <- TRUE
+  class(out) <- c("sj_htest_mwu", "data.frame")
+  out
+# helper ----------------------------------------------------------------------
+# Builds a "Did you mean ...?" hint: looks for entries in `source` that
+# approximately match one of the strings in `searchterm`. Returns
+# `default_message` when `searchterm` is empty or nothing similar is found.
+.misspelled_string <- function(source, searchterm, default_message = NULL) {
+  # nothing to look for
+  if (is.null(searchterm) || length(searchterm) < 1) {
+    return(default_message)
+  }
+  # exact matches are not misspelled - drop them on both sides
+  exact <- intersect(source, searchterm)
+  searchterm <- setdiff(searchterm, exact)
+  source <- setdiff(source, exact)
+  # collect every entry that approximately matches one of the search terms
+  candidates <- unlist(lapply(searchterm, function(term) {
+    source[.fuzzy_grep(source, term)] # nolint
+  }), use.names = FALSE)
+  n_found <- length(candidates)
+  # no similar string found, fall back to the default message
+  if (n_found == 0) {
+    return(insight::trim_ws(default_message))
+  }
+  # show at most five alternatives, and only mention how many were omitted
+  overflow_note <- ""
+  if (n_found > 5) {
+    overflow_note <- sprintf(
+      " We even found %i more possible matches, not shown here.",
+      n_found - 5
+    )
+    candidates <- candidates[1:5]
+  }
+  quoted <- paste0("\"", candidates, "\"")
+  hint <- if (length(candidates) > 1) {
+    paste0("Did you mean ", "one of ", toString(quoted))
+  } else {
+    paste0("Did you mean ", quoted)
+  }
+  # no double white space
+  insight::trim_ws(paste0(hint, "?", overflow_note))
+# Approximate matching of `pattern` in `x`: returns the indices of all
+# elements of `x` that match `pattern` with up to `precision` deviations.
+# By default, `precision` is a third of the pattern's length (rounded).
+# Returns NULL if `precision` exceeds the number of characters in `pattern`.
+.fuzzy_grep <- function(x, pattern, precision = NULL) {
+  n_chars <- nchar(pattern)
+  max_dist <- if (is.null(precision)) round(n_chars / 3) else precision
+  if (max_dist > n_chars) {
+    return(NULL)
+  }
+  # "(pattern){~k}" is the regex syntax for approximate matching
+  fuzzy_pattern <- sprintf("(%s){~%i}", pattern, max_dist)
+  grep(pattern = fuzzy_pattern, x = x, ignore.case = FALSE)
+# Validates the arguments shared by the test functions and raises an
+# informative error if `select`, `by` or `weights` is missing, of the wrong
+# type or length, or does not name a column in `data`.
+.sanitize_htest_input <- function(data, select, by, weights) {
+  # check if arguments are NULL
+  if (is.null(select)) {
+    insight::format_error("Argument `select` is missing.")
+  }
+  # `by` is only allowed to be NULL if `select` specifies more than one variable
+  if (is.null(by) && length(select) == 1) {
+    insight::format_error("Argument `by` is missing.")
+  }
+  # check if arguments have correct length or are of correct type
+  if (!is.character(select)) {
+    insight::format_error("Argument `select` must be a character string with the name(s) of the variable(s).")
+  }
+  if (!is.null(by) && (length(by) != 1 || !is.character(by))) {
+    insight::format_error("Argument `by` must be a character string with the name of a single variable.")
+  }
+  # same requirements as for `by`: a single variable name, given as string
+  if (!is.null(weights) && (length(weights) != 1 || !is.character(weights))) {
+    insight::format_error("Argument `weights` must be a character string with the name of a single variable.")
+  }
+  # check if "select" is in data
+  if (!all(select %in% colnames(data))) {
+    # only the first unknown variable is reported
+    not_found <- setdiff(select, colnames(data))[1]
+    insight::format_error(
+      sprintf("Variable '%s' not found in data frame.", not_found),
+      .misspelled_string(colnames(data), not_found, "Maybe misspelled?")
+    )
+  }
+  # check if "by" is in data
+  if (!is.null(by) && !by %in% colnames(data)) {
+    insight::format_error(
+      sprintf("Variable '%s' not found in data frame.", by),
+      .misspelled_string(colnames(data), by, "Maybe misspelled?")
+    )
+  }
+  # check if "weights" is in data
+  if (!is.null(weights) && !weights %in% colnames(data)) {
+    insight::format_error(
+      sprintf("Weighting variable '%s' not found in data frame.", weights),
+      .misspelled_string(colnames(data), weights, "Maybe misspelled?")
+    )
+  }
+# methods ---------------------------------------------------------------------
+#' @export
+print.sj_htest_mwu <- function(x, ...) {
+  # fetch attributes
+  group_labels <- attributes(x)$group_labels
+  rank_means <- attributes(x)$rank_means
+  n_groups <- attributes(x)$n_groups
+  weighted <- attributes(x)$weighted
+  # suffix for the header of weighted tests
+  if (weighted) {
+    weight_string <- " (weighted)"
+  } else {
+    weight_string <- ""
+  }
+  # same width
+  group_labels <- format(group_labels)
+  # header
+  insight::print_color(sprintf("# Mann-Whitney test%s\n\n", weight_string), "blue")
+  # group-1-info
+  insight::print_color(
+    sprintf(
+      " Group 1: %s (n = %i, rank mean = %s)\n",
+      group_labels[1], n_groups[1], insight::format_value(rank_means[1], protect_integers = TRUE)
+    ), "cyan"
+  )
+  # group-2-info
+  insight::print_color(
+    sprintf(
+      " Group 2: %s (n = %i, rank mean = %s)\n",
+      group_labels[2], n_groups[2], insight::format_value(rank_means[2], protect_integers = TRUE)
+    ), "cyan"
+  )
+  # effect size, test statistic and p-value
+  cat(sprintf("\n r = %.3f, Z = %.3f, %s\n\n", x$r, x$z, insight::format_p(x$p)))
+  # print methods return their argument invisibly
+  invisible(x)
diff --git a/R/mean_n.R b/R/mean_n.R
deleted file mode 100644
index 5d6a2e0d..00000000
--- a/R/mean_n.R
+++ /dev/null
@@ -1,78 +0,0 @@
-#' @title Row means with min amount of valid values
-#' @name mean_n
-#' @description This function is similar to the SPSS \code{MEAN.n} function and computes
-#' row means from a \code{data.frame} or \code{matrix} if at least \code{n}
-#' values of a row are valid (and not \code{NA}).
-#' @param dat A data frame with at least two columns, where row means are applied.
-#' @param n May either be
-#' \itemize{
-#' \item a numeric value that indicates the amount of valid values per row to calculate the row mean;
-#' \item or a value between 0 and 1, indicating a proportion of valid values per row to calculate the row mean (see 'Details').
-#' }
-#' If a row's sum of valid values is less than \code{n}, \code{NA} will be returned as row mean value.
-#' @param digits Numeric value indicating the number of decimal places to be used for rounding mean
-#' value. Negative values are allowed (see 'Details').
-#' @return A vector with row mean values of \code{df} for those rows with at least \code{n}
-#' valid values. Else, \code{NA} is returned.
-#' @details Rounding to a negative number of \code{digits} means rounding to a power of
-#' ten, so for example mean_n(df, 3, digits = -2) rounds to the
-#' nearest hundred. \cr \cr
-#' For \code{n}, must be a numeric value from \code{0} to \code{ncol(dat)}. If
-#' a \emph{row} in \code{dat} has at least \code{n} non-missing values, the
-#' row mean is returned. If \code{n} is a non-integer value from 0 to 1,
-#' \code{n} is considered to indicate the proportion of necessary non-missing
-#' values per row. E.g., if \code{n = .75}, a row must have at least \code{ncol(dat) * n}
-#' non-missing values for the row mean to be calculated. See 'Examples'.
-#' @references \href{https://r4stats.com/2014/09/03/adding-the-spss-mean-n-function-to-r/}{r4stats.com}
-#' @examples
-#' dat <- data.frame(c1 = c(1,2,NA,4),
-#' c2 = c(NA,2,NA,5),
-#' c3 = c(NA,4,NA,NA),
-#' c4 = c(2,3,7,8))
-#' # needs at least 4 non-missing values per row
-#' mean_n(dat, 4) # 1 valid return value
-#' # needs at least 3 non-missing values per row
-#' mean_n(dat, 3) # 2 valid return values
-#' # needs at least 2 non-missing values per row
-#' mean_n(dat, 2)
-#' # needs at least 1 non-missing value per row
-#' mean_n(dat, 1) # all means are shown
-#' # needs at least 50% of non-missing values per row
-#' mean_n(dat, .5) # 3 valid return values
-#' # needs at least 75% of non-missing values per row
-#' mean_n(dat, .75) # 2 valid return values
-#' @export
-mean_n <- function(dat, n, digits = 2) {
- # is 'n' indicating a proportion?
- digs <- n %% 1
- if (digs != 0) n <- round(ncol(dat) * digs)
- # coerce matrix to data frame
- if (is.matrix(dat)) dat <- as.data.frame(dat)
- # check if we have a data framme with at least two columns
- if (!is.data.frame(dat) || ncol(dat) < 2) {
- warning("`dat` must be a data frame with at least two columns.", call. = TRUE)
- return(NA)
- }
- # n may not be larger as df's amount of columns
- if (ncol(dat) < n) {
- warning("`n` must be smaller or equal to number of columns in data frame.", call. = TRUE)
- return(NA)
- }
- round(apply(dat, 1, function(x) ifelse(sum(!is.na(x)) >= n, mean(x, na.rm = TRUE), NA)), digits)
diff --git a/R/mwu.R b/R/mwu.R
deleted file mode 100644
index 6967bf7a..00000000
--- a/R/mwu.R
+++ /dev/null
@@ -1,234 +0,0 @@
-#' @title Mann-Whitney-U-Test
-#' @name mwu
-#' @description This function performs a Mann-Whitney-U-Test (or Wilcoxon rank
-#' sum test for _unpaired_ samples, see [`wilcox.test()`] and [`coin::wilcox_test()`])
-#' comparing `x` by each group indicated by `grp`. If `grp` has more than two
-#' categories, a comparison between each combination of two groups is performed.
-#' The function reports U, p and Z-values as well as effect size r and group-rank-means.
-#' @param x Bare (unquoted) variable name, or a character vector with the variable name.
-#' @param distribution Indicates how the null distribution of the test statistic
-#' should be computed. May be one of `"exact"`, `"approximate"` or `"asymptotic"`
-#' (default). See [`coin::wilcox_test()`] for details.
-#' @inheritParams weighted_sd
-#' @inheritParams means_by_group
-#' @return (Invisibly) returns a data frame with U, p and Z-values for each group-comparison
-#' as well as effect-size r; additionally, group-labels and groups' n's are
-#' also included.
-#' @note This function calls the \code{\link[coin]{wilcox_test}} with formula. If \code{grp}
-#' has more than two groups, additionally a Kruskal-Wallis-Test (see \code{\link{kruskal.test}})
-#' is performed. \cr \cr
-#' Interpretation of effect sizes, as a rule-of-thumb:
-#' \itemize{
-#' \item small effect >= 0.1
-#' \item medium effect >= 0.3
-#' \item large effect >= 0.5
-#' }
-#' @examples
-#' data(efc)
-#' # Mann-Whitney-U-Tests for elder's age by elder's sex.
-#' mwu(efc, e17age, e16sex)
-#' # using formula interface
-#' mwu(e17age ~ e16sex, efc)
-#' # Mann-Whitney-Tests for elder's age by each level elder's dependency.
-#' mwu(efc, e17age, e42dep)
-#' @importFrom stats na.omit wilcox.test kruskal.test
-#' @importFrom sjmisc recode_to is_empty
-#' @importFrom sjlabelled get_labels as_numeric
-#' @importFrom rlang quo_name enquo
-#' @export
-mwu <- function(data, ...) {
- UseMethod("mwu")
-#' @rdname mwu
-#' @export
-mwu.default <- function(data,
- x,
- grp,
- distribution = "asymptotic",
- out = c("txt", "viewer", "browser"),
- encoding = "UTF-8",
- file = NULL,
- ...) {
- out <- match.arg(out)
- if (out != "txt" && !requireNamespace("sjPlot", quietly = TRUE)) {
- message("Package `sjPlot` needs to be loaded to print HTML tables.")
- out <- "txt"
- }
- if (!requireNamespace("coin", quietly = TRUE)) {
- stop("Package `coin` needs to be installed to compute the Mann-Whitney-U test.", call. = FALSE)
- }
- # create quosures
- grp.name <- rlang::quo_name(rlang::enquo(grp))
- dv.name <- rlang::quo_name(rlang::enquo(x))
- # create string with variable names
- vars <- c(grp.name, dv.name)
- # get data
- data <- suppressMessages(dplyr::select(data, !! vars))
- grp <- data[[grp.name]]
- dv <- data[[dv.name]]
- # coerce factor and character to numeric
- if (is.factor(grp) || is.character(grp)) grp <- sjlabelled::as_numeric(grp)
- # group "counter" (index) should start with 1, not 0
- if (min(grp, na.rm = TRUE) < 1) grp <- sjmisc::recode_to(grp, lowest = 1, append = FALSE)
- # retrieve unique group values. need to iterate all values
- grp_values <- sort(unique(stats::na.omit(grp)))
- # length of value range
- cnt <- length(grp_values)
- labels <- sjlabelled::get_labels(
- grp, attr.only = FALSE, values = NULL, non.labelled = TRUE
- )
- df <- data.frame()
- for (i in seq_len(cnt)) {
- for (j in i:cnt) {
- if (i != j) {
- # retrieve cases (rows) of subgroups
- xsub <- dv[which(grp == grp_values[i] | grp == grp_values[j])]
- ysub <- grp[which(grp == grp_values[i] | grp == grp_values[j])]
- # this is for unpaired wilcox.test()
- xsub_2 <- stats::na.omit(dv[which(grp == grp_values[i])])
- ysub_2 <- stats::na.omit(dv[which(grp == grp_values[j])])
- # only use rows with non-missings
- ysub <- ysub[which(!is.na(xsub))]
- # remove missings
- xsub <- as.numeric(stats::na.omit(xsub))
- ysub.n <- stats::na.omit(ysub)
- # grouping variable is a factor
- ysub <- as.factor(ysub.n)
- wcdat <- data.frame(
- x = xsub,
- y = ysub
- )
- # perfom wilcox test
- wt <- coin::wilcox_test(x ~ y, data = wcdat, distribution = distribution)
- # compute statistics
- u <- as.numeric(coin::statistic(wt, type = "linear"))
- z <- as.numeric(coin::statistic(wt, type = "standardized"))
- p <- coin::pvalue(wt)
- r <- abs(z / sqrt(length(ysub)))
- w <- stats::wilcox.test(xsub_2, ysub_2, paired = FALSE)$statistic
- rkm.i <- mean(rank(xsub)[which(ysub.n == grp_values[i])], na.rm = TRUE)
- rkm.j <- mean(rank(xsub)[which(ysub.n == grp_values[j])], na.rm = TRUE)
- # compute n for each group
- n_grp1 <- length(xsub[which(ysub.n == grp_values[i])])
- n_grp2 <- length(xsub[which(ysub.n == grp_values[j])])
- # generate result data frame
- df <-
- rbind(
- df,
- cbind(
- grp1 = grp_values[i],
- grp1.label = labels[i],
- grp1.n = n_grp1,
- grp2 = grp_values[j],
- grp2.label = labels[j],
- grp2.n = n_grp2,
- u = u,
- w = w,
- p = p,
- z = z,
- r = r,
- rank.mean.grp1 = rkm.i,
- rank.mean.grp2 = rkm.j
- )
- )
- }
- }
- }
- # convert variables
- df[["grp1"]] <- as.numeric(as.character(df[["grp1"]]))
- df[["grp2"]] <- as.numeric(as.character(df[["grp2"]]))
- df[["grp1.n"]] <- as.numeric(as.character(df[["grp1.n"]]))
- df[["grp2.n"]] <- as.numeric(as.character(df[["grp2.n"]]))
- df[["grp1.label"]] <- as.character(df[["grp1.label"]])
- df[["grp2.label"]] <- as.character(df[["grp2.label"]])
- df[["u"]] <- as.numeric(as.character(df[["u"]]))
- df[["w"]] <- as.numeric(as.character(df[["w"]]))
- df[["p"]] <- as.numeric(as.character(df[["p"]]))
- df[["z"]] <- as.numeric(as.character(df[["z"]]))
- df[["r"]] <- as.numeric(as.character(df[["r"]]))
- df[["rank.mean.grp1"]] <- as.numeric(as.character(df[["rank.mean.grp1"]]))
- df[["rank.mean.grp2"]] <- as.numeric(as.character(df[["rank.mean.grp2"]]))
- # prepare a data frame that can be used for 'sjt.df'.
- tab.df <-
- data_frame(
- Groups = sprintf("%s
%s", df$grp1.label, df$grp2.label),
- N = sprintf("%s
%s", df$grp1.n, df$grp2.n),
- 'Mean Rank' = sprintf("%.2f
%.2f", df$rank.mean.grp1, df$rank.mean.grp2),
- 'Mann-Whitney-U' = as.character(df$u),
- 'Wilcoxon-W' = as.character(df$w),
- Z = sprintf("%.3f", df$z),
- 'Effect Size' = sprintf("%.3f", df$r),
- p = sprintf("%.3f", df$p)
- )
- # replace 0.001 with <0.001
- tab.df$p[which(tab.df$p == "0.001")] <- "<0.001"
- ret.df <- list(df = df, tab.df = tab.df, data = data.frame(dv, grp))
- # save how to print output
- attr(ret.df, "print") <- out
- attr(ret.df, "encoding") <- encoding
- attr(ret.df, "file") <- file
- if (out %in% c("viewer", "browser"))
- class(ret.df) <- c("mwu", "sjt_mwu")
- else
- class(ret.df) <- c("mwu", "sj_mwu")
- ret.df
-#' @importFrom dplyr select
-#' @rdname mwu
-#' @export
-mwu.formula <- function(formula,
- data,
- distribution = "asymptotic",
- out = c("txt", "viewer", "browser"),
- encoding = "UTF-8",
- file = NULL,
- ...) {
- vars <- all.vars(formula)
- mwu(data, x = !! vars[1], grp = !! vars[2], distribution = distribution, out = out, encoding = encoding, file = file, ...)
-#' @rdname mwu
-#' @export
-mannwhitney <- mwu
diff --git a/R/phi.R b/R/phi.R
index f173ce2e..48a07f1e 100644
--- a/R/phi.R
+++ b/R/phi.R
@@ -26,17 +26,13 @@ phi.formula <- function(formula, data, ci.lvl = NULL, n = 1000, method = c("dist
if (is.null(ci.lvl) || is.na(ci.lvl)) {
} else {
- ci <- data[, terms] %>%
- sjstats::bootstrap(n) %>%
- dplyr::mutate(
- tables = lapply(.data$strap, function(x) {
- dat <- as.data.frame(x)
- table(dat[[1]], dat[[2]])
- }),
- phis = sapply(.data$tables, function(x) .cramer(x))
- ) %>%
- dplyr::pull("phis") %>%
- boot_ci(ci.lvl = ci.lvl, method = method)
+ straps <- sjstats::bootstrap(data[terms], n)
+ tables <- lapply(straps$strap, function(x) {
+ dat <- as.data.frame(x)
+ table(dat[[1]], dat[[2]])
+ })
+ phis <- sapply(tables, function(x) .phi(x))
+ ci <- boot_ci(phis, ci.lvl = ci.lvl, method = method)
phi = .phi(tab),
diff --git a/R/prop.R b/R/prop.R
index e22e994e..167b3cdd 100644
--- a/R/prop.R
+++ b/R/prop.R
@@ -1,11 +1,11 @@
#' @title Proportions of values in a vector
#' @name prop
-#' @description \code{prop()} calculates the proportion of a value or category
-#' in a variable. \code{props()} does the same, but allows for
+#' @description `prop()` calculates the proportion of a value or category
+#' in a variable. `props()` does the same, but allows for
#' multiple logical conditions in one statement. It is similar
-#' to \code{mean()} with logical predicates, however, both
-#' \code{prop()} and \code{props()} work with grouped data frames.
+#' to `mean()` with logical predicates, however, both
+#' `prop()` and `props()` work with grouped data frames.
#' @param data A data frame. May also be a grouped data frame (see 'Examples').
#' @param ... One or more value pairs of comparisons (logical predicates). Put
@@ -14,17 +14,17 @@
#' 'Examples'.
#' @param weights Vector of weights that will be applied to weight all observations.
#' Must be a vector of same length as the input vector. Default is
-#' \code{NULL}, so no weights are used.
+#' `NULL`, so no weights are used.
#' @param na.rm Logical, whether to remove NA values from the vector when the
-#' proportion is calculated. \code{na.rm = FALSE} gives you the raw
-#' percentage of a value in a vector, \code{na.rm = TRUE} the valid
+#' proportion is calculated. `na.rm = FALSE` gives you the raw
+#' percentage of a value in a vector, `na.rm = TRUE` the valid
#' percentage.
#' @param digits Amount of digits for returned values.
-#' @details \code{prop()} only allows one logical statement per comparison,
-#' while \code{props()} allows multiple logical statements per comparison.
-#' However, \code{prop()} supports weighting of variables before calculating
-#' proportions, and comparisons may also be quoted. Hence, \code{prop()}
+#' @details `prop()` only allows one logical statement per comparison,
+#' while `props()` allows multiple logical statements per comparison.
+#' However, `prop()` supports weighting of variables before calculating
+#' proportions, and comparisons may also be quoted. Hence, `prop()`
#' also processes comparisons, which are passed as character vector
#' (see 'Examples').
@@ -96,7 +96,9 @@
#' @export
prop <- function(data, ..., weights = NULL, na.rm = TRUE, digits = 4) {
# check argument
- if (!is.data.frame(data)) stop("`data` needs to be a data frame.", call. = F)
+ if (!is.data.frame(data)) {
+ insight::format_error("`data` needs to be a data frame.")
+ }
# get dots
dots <- match.call(expand.dots = FALSE)$`...`
@@ -109,7 +111,9 @@ prop <- function(data, ..., weights = NULL, na.rm = TRUE, digits = 4) {
#' @export
props <- function(data, ..., na.rm = TRUE, digits = 4) {
# check argument
- if (!is.data.frame(data)) stop("`data` needs to be a data frame.", call. = F)
+ if (!is.data.frame(data)) {
+ insight::format_error("`data` needs to be a data frame.")
+ }
# get dots
dots <- match.call(expand.dots = FALSE)$`...`
@@ -123,7 +127,7 @@ proportions <- function(data, dots, weight.by, na.rm, digits, multi_logical) {
# remember comparisons
comparisons <- lapply(dots, function(x) {
# to character, and remove spaces and quotes
- x <- gsub(" ", "", deparse(x), fixed = T)
+ x <- gsub(" ", "", deparse(x), fixed = TRUE)
x <- gsub("\"", "", x, fixed = TRUE)
@@ -188,7 +192,7 @@ proportions <- function(data, dots, weight.by, na.rm, digits, multi_logical) {
# order rows by values of grouping variables
fr <- fr[do.call(order, reihenfolge), ]
- return(fr)
+ fr
} else {
# iterate dots (comparing conditions)
@@ -206,14 +210,14 @@ proportions <- function(data, dots, weight.by, na.rm, digits, multi_logical) {
- return(unlist(result))
+ unlist(result)
get_proportion <- function(x, data, weight.by, na.rm, digits) {
# to character, and remove spaces and quotes
- x <- gsub(" ", "", deparse(x), fixed = T)
+ x <- gsub(" ", "", deparse(x), fixed = TRUE)
x <- gsub("\"", "", x, fixed = TRUE)
# split expression at ==, < or >
@@ -252,7 +256,7 @@ get_proportion <- function(x, data, weight.by, na.rm, digits) {
if (na.rm) dummy <- na.omit(dummy)
# get proportion
- round(sum(dummy, na.rm = T) / length(dummy), digits = digits)
+ round(sum(dummy, na.rm = TRUE) / length(dummy), digits = digits)
@@ -264,5 +268,5 @@ get_multiple_proportion <- function(x, data, na.rm, digits) {
if (na.rm) dummy <- na.omit(dummy)
# get proportion
- round(sum(dummy, na.rm = T) / length(dummy), digits = digits)
+ round(sum(dummy, na.rm = TRUE) / length(dummy), digits = digits)
diff --git a/R/wtd_chisqtest.R b/R/wtd_chisqtest.R
deleted file mode 100644
index 99f9a289..00000000
--- a/R/wtd_chisqtest.R
+++ /dev/null
@@ -1,91 +0,0 @@
-#' @rdname weighted_sd
-#' @export
-weighted_chisqtest <- function(data, ...) {
- UseMethod("weighted_chisqtest")
-#' @importFrom dplyr select
-#' @importFrom stats na.omit chisq.test as.formula
-#' @rdname weighted_sd
-#' @export
-weighted_chisqtest.default <- function(data, x, y, weights, ...) {
- x.name <- deparse(substitute(x))
- y.name <- deparse(substitute(y))
- w.name <- deparse(substitute(weights))
- if (w.name == "NULL") {
- w.name <- "weights"
- data$weights <- 1
- }
- # create string with variable names
- vars <- .compact_character(c(x.name, y.name, w.name))
- # get data
- dat <- suppressMessages(dplyr::select(data, !! vars))
- dat <- stats::na.omit(dat)
- colnames(dat)[ncol(dat)] <- ".weights"
- # check if we have chisq-test for given probabilities
- dot_args <- list(...)
- if ("p" %in% names(dot_args)) {
- .weighted_chisq_for_prob(dat, x.name, prob = dot_args[["p"]])
- } else {
- crosstable_statistics(data = dat, statistics = "auto", weights = ".weights", ...)
- }
-#' @importFrom stats xtabs
-#' @rdname weighted_sd
-#' @export
-weighted_chisqtest.formula <- function(formula, data, ...) {
- vars <- all.vars(formula)
- dot_args <- list(...)
- if (length(vars) < 3 && !"p" %in% names(dot_args)) {
- vars <- c(vars, ".weights")
- data$.weights <- 1
- }
- if ("p" %in% names(dot_args)) {
- dat <- data[vars]
- colnames(dat)[ncol(dat)] <- ".weights"
- .weighted_chisq_for_prob(dat, names(dat)[1], prob = dot_args[["p"]])
- } else {
- tab <- as.table(round(stats::xtabs(data[[vars[3]]] ~ data[[vars[1]]] + data[[vars[2]]])))
- class(tab) <- "table"
- crosstable_statistics(data = tab, statistics = "auto", weights = NULL, ...)
- }
-.weighted_chisq_for_prob <- function(dat, x.name, prob) {
- if (!requireNamespace("survey", quietly = TRUE)) {
- stop("Package `survey` needed to for this function to work. Please install it.", call. = FALSE)
- }
- if (abs(sum(prob) - 1) > sqrt(.Machine$double.eps)) {
- prob <- prob / sum(prob)
- }
- dat$sj_subject_id <- 1:nrow(dat)
- dat$sj_weights <- dat$.weights
- design <- survey::svydesign(id = ~sj_subject_id, weights = ~sj_weights, data = dat)
- stable <- survey::svytable(stats::as.formula(paste0("~", x.name)), design)
- out <- stats::chisq.test(stable, p = prob)
- structure(class = "sj_xtab_stat2", list(
- estimate = out$statistic,
- p.value = out$p.value,
- stat.name = "Chi-squared",
- stat.html = "χ2",
- df = out$parameter,
- n_obs = nrow(dat),
- method = "Weighted chi-squared test for given probabilities",
- method.html = "Weighted χ2 test for given probabilities",
- method.short = "Chi-squared"
- ))
diff --git a/R/wtd_sd.R b/R/wtd_sd.R
index b7db04d8..95af674d 100644
--- a/R/wtd_sd.R
+++ b/R/wtd_sd.R
@@ -82,16 +82,6 @@
#' weighted_ttest(efc, e17age, weights = weight)
#' weighted_ttest(efc, e17age, c160age, weights = weight)
#' weighted_ttest(e17age ~ e16sex + weight, efc)
-#' # weighted Mann-Whitney-U-test ----
-#' weighted_mannwhitney(c12hour ~ c161sex + weight, efc)
-#' # weighted Chi-squared-test ----
-#' weighted_chisqtest(efc, c161sex, e16sex, weights = weight, correct = FALSE)
-#' weighted_chisqtest(c172code ~ c161sex + weight, efc)
-#' # weighted Chi-squared-test for given probabilities ----
-#' weighted_chisqtest(c172code ~ weight, efc, p = c(.33, .33, .34))
#' @export
weighted_sd <- function(x, weights = NULL) {
diff --git a/R/xtab_statistics.R b/R/xtab_statistics.R
index 4fb592e0..21824194 100644
--- a/R/xtab_statistics.R
+++ b/R/xtab_statistics.R
@@ -6,68 +6,70 @@
#' Supported measures are Cramer's V, Phi, Spearman's rho,
#' Kendall's tau and Pearson's r.
-#' @param data A data frame or a table object. If a table object, \code{x1} and
-#' \code{x2} will be ignored. For Kendall's \emph{tau}, Spearman's \emph{rho}
-#' or Pearson's product moment correlation coefficient, \code{data} needs
-#' to be a data frame. If \code{x1} and \code{x2} are not specified,
-#' the first two columns of the data frames are used as variables
-#' to compute the crosstab.
-#' @param formula A formula of the form \code{lhs ~ rhs} where \code{lhs} is a
-#' numeric variable giving the data values and \code{rhs} a factor giving the
-#' corresponding groups.
-#' @param tab A \code{\link{table}} or \code{\link[stats]{ftable}}. Tables of class
-#' \code{\link[stats]{xtabs}} and other will be coerced to \code{ftable}
-#' objects.
+#' @param data A data frame or a table object. If a table object, `x1` and
+#' `x2` will be ignored. For Kendall's _tau_, Spearman's _rho_ or Pearson's
+#' product moment correlation coefficient, `data` needs to be a data frame.
+#' If `x1` and `x2` are not specified, the first two columns of the data
+#' frames are used as variables to compute the crosstab.
+#' @param formula A formula of the form `lhs ~ rhs` where `lhs` is a
+#' numeric variable giving the data values and `rhs` a factor giving the
+#' corresponding groups.
+#' @param tab A [`table()`] or [`ftable()`]. Tables of class [`xtabs()`] and
+#' other will be coerced to `ftable` objects.
#' @param x1 Name of first variable that should be used to compute the
-#' contingency table. If \code{data} is a table object, this argument
-#' will be irgnored.
+#' contingency table. If `data` is a table object, this argument will be
+#' ignored.
#' @param x2 Name of second variable that should be used to compute the
-#' contingency table. If \code{data} is a table object, this argument
-#' will be irgnored.
+#' contingency table. If `data` is a table object, this argument will be
+#' ignored.
#' @param statistics Name of measure of association that should be computed. May
-#' be one of \code{"auto"}, \code{"cramer"}, \code{"phi"}, \code{"spearman"},
-#' \code{"kendall"}, \code{"pearson"} or \code{"fisher"}. See 'Details'.
-#' @param ci.lvl Scalar between 0 and 1. If not \code{NULL}, returns a data
-#' frame including lower and upper confidence intervals.
+#' be one of `"auto"`, `"cramer"`, `"phi"`, `"spearman"`, `"kendall"`,
+#' `"pearson"` or `"fisher"`. See 'Details'.
+#' @param ci.lvl Scalar between 0 and 1. If not `NULL`, returns a data
+#' frame including lower and upper confidence intervals.
+#' @param weights Name of variable in `data` that indicates the vector of weights
+#' that will be applied to weight all observations. Default is `NULL`, so no
+#' weights are used.
#' @param ... Other arguments, passed down to the statistic functions
-#' \code{\link[stats]{chisq.test}}, \code{\link[stats]{fisher.test}} or
-#' \code{\link[stats]{cor.test}}.
+#' [`chisq.test()`], [`fisher.test()`] or [`cor.test()`].
-#' @inheritParams means_by_group
#' @inheritParams bootstrap
#' @inheritParams boot_ci
-#' @return For \code{phi()}, the table's Phi value. For \code{cramer()}, the
-#' table's Cramer's V.
-#' \cr \cr
-#' For \code{crosstable_statistics()}, a list with following components:
-#' \describe{
-#' \item{\code{estimate}}{the value of the estimated measure of association.}
-#' \item{\code{p.value}}{the p-value for the test.}
-#' \item{\code{statistic}}{the value of the test statistic.}
-#' \item{\code{stat.name}}{the name of the test statistic.}
-#' \item{\code{stat.html}}{if applicable, the name of the test statistic, in HTML-format.}
-#' \item{\code{df}}{the degrees of freedom for the contingency table.}
-#' \item{\code{method}}{character string indicating the name of the measure of association.}
-#' \item{\code{method.html}}{if applicable, the name of the measure of association, in HTML-format.}
-#' \item{\code{method.short}}{the short form of association measure, equals the \code{statistics}-argument.}
-#' \item{\code{fisher}}{logical, if Fisher's exact test was used to calculate the p-value.}
-#' }
+#' @return For [`phi()`], the table's Phi value. For [`cramers_v()`], the
+#' table's Cramer's V.
+#' For `crosstable_statistics()`, a list with following components:
+#' - `estimate`: the value of the estimated measure of association.
+#' - `p.value`: the p-value for the test.
+#' - `statistic`: the value of the test statistic.
+#' - `stat.name`: the name of the test statistic.
+#' - `stat.html`: if applicable, the name of the test statistic, in HTML-format.
+#' - `df`: the degrees of freedom for the contingency table.
+#' - `method`: character string indicating the name of the measure of association.
+#' - `method.html`: if applicable, the name of the measure of association, in HTML-format.
+#' - `method.short`: the short form of association measure, equals the `statistics`-argument.
+#' - `fisher`: logical, if Fisher's exact test was used to calculate the p-value.
#' @details The p-value for Cramer's V and the Phi coefficient are based
-#' on \code{chisq.test()}. If any expected value of a table cell is
-#' smaller than 5, or smaller than 10 and the df is 1, then \code{fisher.test()}
-#' is used to compute the p-value, unless \code{statistics = "fisher"}; in
-#' this case, the use of \code{fisher.test()} is forced to compute the
-#' p-value. The test statistic is calculated with \code{cramer()} resp.
-#' \code{phi()}.
-#' \cr \cr
-#' Both test statistic and p-value for Spearman's rho, Kendall's tau
-#' and Pearson's r are calculated with \code{cor.test()}.
-#' \cr \cr
-#' When \code{statistics = "auto"}, only Cramer's V or Phi are calculated,
-#' based on the dimension of the table (i.e. if the table has more than
-#' two rows or columns, Cramer's V is calculated, else Phi).
+#' on `chisq.test()`. If any expected value of a table cell is smaller than 5,
+#' or smaller than 10 and the df is 1, then `fisher.test()` is used to compute
+#' the p-value, unless `statistics = "fisher"`; in this case, the use of
+#' `fisher.test()` is forced to compute the p-value. The test statistic is
+#' calculated with `cramers_v()` resp. `phi()`.
+#' Both test statistic and p-value for Spearman's rho, Kendall's tau and
+#' Pearson's r are calculated with `cor.test()`.
+#' When `statistics = "auto"`, only Cramer's V or Phi are calculated, based on
+#' the dimension of the table (i.e. if the table has more than two rows or
+#' columns, Cramer's V is calculated, else Phi).
+#' @references Ben-Shachar, M.S., Patil, I., Thériault, R., Wiernik, B.M.,
+#' Lüdecke, D. (2023). Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data
+#' That Use the Chi‑Squared Statistic. Mathematics, 11, 1982.
+#' \doi{10.3390/math11091982}
#' @examples
#' # Phi coefficient for 2x2 tables
@@ -118,9 +120,9 @@ crosstable_statistics <- function(data, x1 = NULL, x2 = NULL, statistics = c("au
weights <- deparse(substitute(weights))
# if names were quotes, remove quotes
- x1 <- gsub("\"", "", x1, fixed = T)
- x2 <- gsub("\"", "", x2, fixed = T)
- weights <- gsub("\"", "", weights, fixed = T)
+ x1 <- gsub("\"", "", x1, fixed = TRUE)
+ x2 <- gsub("\"", "", x2, fixed = TRUE)
+ weights <- gsub("\"", "", weights, fixed = TRUE)
if (sjmisc::is_empty(weights) || weights == "NULL")
weights <- NULL
@@ -140,8 +142,9 @@ crosstable_statistics <- function(data, x1 = NULL, x2 = NULL, statistics = c("au
if (!is.null(weights)) {
tab <- as.table(round(stats::xtabs(data[[3]] ~ data[[1]] + data[[2]])))
class(tab) <- "table"
- } else
+ } else {
tab <- table(data)
+ }
} else {
# 'data' is a table - copy to table object
tab <- data
@@ -152,7 +155,7 @@ crosstable_statistics <- function(data, x1 = NULL, x2 = NULL, statistics = c("au
"Need arguments `data`, `x1` and `x2` to compute %s-statistics.",
- call. = F
+ call. = FALSE
diff --git a/_pkgdown.yml b/_pkgdown.yml
index 4596c636..2fb1d7b8 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -22,17 +22,17 @@ reference:
- crosstable_statistics
- table_values
-- title: "Weighted Statistics"
+- title: "Weighted Estimates and Dispersion"
- weight
- weighted_ttest
-- title: "Other (Summary) Statistics"
+- title: "Summary Statistics and Tests"
- gmd
- - mannwhitney
- - mean_n
- - means_by_group
+ - chi_squared_test
+ - kruskal_wallis_test
+ - mann_whitney_test
- var_pop
- title: "Tools for Regression Models"
diff --git a/docs/404.html b/docs/404.html
deleted file mode 100644
index 2c858e83..00000000
--- a/docs/404.html
+++ /dev/null
As contributors and maintainers of this project, we pledge to respect all people who contribute through reporting issues, posting feature requests, updating documentation, submitting pull requests or patches, and other activities.
- anova-statistics.Rmd
This vignette demonstrates those functions of the sjstats-package that deal with Anova tables. These functions report different effect size measures, which are useful beyond significance tests (p-values), because they estimate the magnitude of effects, independent from sample size. sjstats provides the following functions:
- -Befor we start, we fit a simple model:
-library(sjstats) -# load sample data -data(efc) - -# fit linear model -fit <- aov( - c12hour ~ as.factor(e42dep) + as.factor(c172code) + c160age, - data = efc -)
All functions accept objects of class aov
or anova
, so you can also use model fits from the car package, which allows fitting Anova’s with different types of sum of squares. Other objects, like lm
, will be coerced to anova
The following functions return the effect size statistic as named numeric vector, using the model’s term names.
-The eta-squared is the proportion of the total variability in the dependent variable that is accounted for by the variation in the independent variable. It is the ratio of the sum of squares for each group level to the total sum of squares. It can be interpreted as percentage of variance accounted for by a variable.
-For variables with 1 degree of freedom (in the numerator), the square root of eta-squared is equal to the correlation coefficient r. For variables with more than 1 degree of freedom, eta-squared equals R2. This makes eta-squared easily interpretable. Furthermore, these effect sizes can easily be converted into effect size measures that can be, for instance, further processed in meta-analyses.
-Eta-squared can be computed simply with:
-eta_sq(fit, ci.lvl = .95) -#> term etasq conf.low conf.high -#> 1 as.factor(e42dep) 0.266 0.217 0.312 -#> 2 as.factor(c172code) 0.005 0.000 0.018 -#> 3 c160age 0.048 0.024 0.080
The partial eta-squared value is the ratio of the sum of squares for each group level to the sum of squares for each group level plus the residual sum of squares. It is more difficult to interpret, because its value strongly depends on the variability of the residuals. Partial eta-squared values should be reported with caution, and Levine and Hullett (2002) recommend reporting eta- or omega-squared rather than partial eta-squared.
-Use the partial
-argument to compute partial eta-squared values:
eta_sq(fit, partial = TRUE, ci.lvl = .95) -#> term partial.etasq conf.low conf.high -#> 1 as.factor(e42dep) 0.281 0.232 0.327 -#> 2 as.factor(c172code) 0.008 0.000 0.023 -#> 3 c160age 0.066 0.038 0.101
While eta-squared estimates tend to be biased in certain situations, e.g. when the sample size is small or the independent variables have many group levels, omega-squared estimates are corrected for this bias.
-Omega-squared can be simply computed with:
-omega_sq(fit, ci.lvl = .95) -#> term omegasq conf.low conf.high -#> 1 as.factor(e42dep) 0.263 0.214 0.310 -#> 2 as.factor(c172code) 0.004 -0.002 0.016 -#> 3 c160age 0.048 0.023 0.078
also has a partial
-argument to compute partial omega-squared values. Computing the partial omega-squared statistics is based on bootstrapping. In this case, use n
to define the number of samples (1000 by default.)
omega_sq(fit, partial = TRUE, ci.lvl = .95) -#> term partial.omegasq conf.low conf.high -#> 1 as.factor(e42dep) 0.278 0.229 0.325 -#> 2 as.factor(c172code) 0.005 -0.002 0.020 -#> 3 c160age 0.065 0.036 0.100
Epsilon-squared is a less common measure of effect size. It is sometimes considered as an “adjusted r-squared” value. You can compute this effect size using epsilon_sq()
epsilon_sq(fit, ci.lvl = .95) -#> term epsilonsq conf.low conf.high -#> 1 as.factor(e42dep) 0.264 0.214 0.310 -#> 2 as.factor(c172code) 0.004 -0.002 0.016 -#> 3 c160age 0.048 0.023 0.079
The anova_stats()
function takes a model input and computes a comprehensive summary, including the above effect size measures, returned as tidy data frame:
anova_stats(fit) -#> term df sumsq meansq statistic p.value etasq partial.etasq omegasq partial.omegasq epsilonsq cohens.f power -#> 1 as.factor(e42dep) 3 577756.33 192585.444 108.786 0.000 0.266 0.281 0.263 0.278 0.264 0.626 1.00 -#> 2 as.factor(c172code) 2 11722.05 5861.024 3.311 0.037 0.005 0.008 0.004 0.005 0.004 0.089 0.63 -#> 3 c160age 1 105169.60 105169.595 59.408 0.000 0.048 0.066 0.048 0.065 0.048 0.267 1.00 -#> 4 Residuals 834 1476436.34 1770.307 NA NA NA NA NA NA NA NA NA
Like the other functions, the input may also be an object of class anova
, so you can also use model fits from the car package, which allows fitting Anova’s with different types of sum of squares:
anova_stats(car::Anova(fit, type = 3)) -#> term sumsq meansq df statistic p.value etasq partial.etasq omegasq partial.omegasq epsilonsq cohens.f power -#> 1 as.factor(e42dep) 426461.571 142153.857 3 80.299 0.000 0.212 0.224 0.209 0.221 0.209 0.537 1.000 -#> 2 as.factor(c172code) 7352.049 3676.025 2 2.076 0.126 0.004 0.005 0.002 0.003 0.002 0.071 0.429 -#> 3 c160age 105169.595 105169.595 1 59.408 0.000 0.052 0.066 0.051 0.065 0.051 0.267 1.000 -#> 4 Residuals 1476436.343 1770.307 834 NA NA NA NA NA NA NA NA NA
- bayesian-statistics.Rmd
This vignette demonstrates the mediation()
-function in sjstats. Before we start, we fit some models, including a mediation-object from the mediation-package, which we use for comparison with brms.
library(sjstats) -library(mediation) -library(brms) - -# load sample data -data(jobs) -set.seed(123) - -# linear models, for mediation analysis -b1 <- lm(job_seek ~ treat + econ_hard + sex + age, data = jobs) -b2 <- lm(depress2 ~ treat + job_seek + econ_hard + sex + age, data = jobs) - -# mediation analysis, for comparison with brms -m1 <- mediate(b1, b2, sims = 1000, treat = "treat", mediator = "job_seek")
# Fit Bayesian mediation model -f1 <- bf(job_seek ~ treat + econ_hard + sex + age) -f2 <- bf(depress2 ~ treat + job_seek + econ_hard + sex + age) - -m2 <- brm(f1 + f2 + set_rescor(FALSE), data = jobs, cores = 4)
is a summary function, especially for mediation analysis, i.e. for multivariate response models with causal mediation effects.
In the model m2, treat is the treatment effect, job_seek is the mediator effect, f1 describes the mediator model and f2 describes the outcome model.
returns a data frame with information on the direct effect (median value of posterior samples from treatment of the outcome model), mediator effect (median value of posterior samples from mediator of the outcome model), indirect effect (median value of the multiplication of the posterior samples from mediator of the outcome model and the posterior samples from treatment of the mediation model) and the total effect (median value of sums of posterior samples used for the direct and indirect effect). The proportion mediated is the indirect effect divided by the total effect.
The simplest call just needs the model-object.
-mediation(m2) -#> -#> # Causal Mediation Analysis for Stan Model -#> -#> Treatment: treat -#> Mediator: job_seek -#> Response: depress2 -#> -#> Estimate HDI (90%) -#> Direct effect: -0.04 [-0.11 0.03] -#> Indirect effect: -0.02 [-0.04 0.00] -#> Total effect: -0.05 [-0.13 0.02] -#> -#> Proportion mediated: 28.14% [-79.57% 135.86%]
Typically, mediation()
finds the treatment and mediator variables automatically. If this does not work, use the treatment
and mediator
arguments to specify the related variable names. For all values, the 90% HDIs are calculated by default. Use prob
to calculate a different interval.
Here is a comparison with the mediation package. Note that the summary()
-output of the mediation package shows the indirect effect first, followed by the direct effect.
summary(m1) -#> -#> Causal Mediation Analysis -#> -#> Quasi-Bayesian Confidence Intervals -#> -#> Estimate 95% CI Lower 95% CI Upper p-value -#> ACME -0.0157 -0.0387 0.01 0.19 -#> ADE -0.0438 -0.1315 0.04 0.35 -#> Total Effect -0.0595 -0.1530 0.02 0.21 -#> Prop. Mediated 0.2137 -2.0277 2.70 0.32 -#> -#> Sample Size Used: 899 -#> -#> -#> Simulations: 1000 - -mediation(m2, prob = .95) -#> -#> # Causal Mediation Analysis for Stan Model -#> -#> Treatment: treat -#> Mediator: job_seek -#> Response: depress2 -#> -#> Estimate HDI (95%) -#> Direct effect: -0.04 [-0.12 0.04] -#> Indirect effect: -0.02 [-0.04 0.01] -#> Total effect: -0.05 [-0.15 0.03] -#> -#> Proportion mediated: 28.14% [-178.65% 234.94%]
If you want to calculate mean instead of median values from the posterior samples, use the typical
-argument. Furthermore, there is a print()
-method, which allows to print more digits.
mediation(m2, typical = "mean", prob = .95) %>% print(digits = 4) -#> -#> # Causal Mediation Analysis for Stan Model -#> -#> Treatment: treat -#> Mediator: job_seek -#> Response: depress2 -#> -#> Estimate HDI (95%) -#> Direct effect: -0.0395 [-0.1244 0.0450] -#> Indirect effect: -0.0158 [-0.0400 0.0086] -#> Total effect: -0.0553 [-0.1482 0.0302] -#> -#> Proportion mediated: 28.5975% [-178.1953% 235.3902%]
As you can see, the results are similar to what the mediation package produces for non-Bayesian models.
- -vignettes/mixedmodels-statistics.Rmd
- mixedmodels-statistics.Rmd
This vignette demonstrates those functions of the sjstats-package that deal especially with mixed effects models. sjstats provides the following functions:
and samplesize_mixed()
Before we start, we fit a simple linear mixed model:
-# load sample data
-# fit linear mixed model
-m <- lmer(Reaction ~ Days + (Days | Subject), data = sleepstudy)
-sleepstudy$mygrp <- sample(1:45, size = 180, replace = TRUE)
-m2 <- lmer(Reaction ~ Days + (1 | mygrp) + (1 | Subject), sleepstudy)
The first two functions, design_effect()
and samplesize_mixed()
, can be used to approximately calculate the sample size in the context of power calculation. Calculating the sample size for simple linear models is pretty straightforward, however, for (linear) mixed models, statistical power is affected through the change of the variance of test statistics. This is what Hsieh et al. (2003) call a design effect (or variance inflation factor, VIF). Once this design effect is calculated, the sample size calculated for a standard design can be adjusted accordingly.
computes this design effect for linear mixed models with two-level design. It requires the approximated average number of observations per grouping cluster (i.e. level-2 unit) and the assumed intraclass correlation coefficient (ICC) for the multilevel-model. Typically, the minimum assumed value for the ICC is 0.05.
# Design effect for two-level model with 30 observations per
-# cluster group (level-2 unit) and an assumed intraclass
-# correlation coefficient of 0.05.
-design_effect(n = 30)
-#> [1] 2.45
-# Design effect for two-level model with 24 observation per cluster
-# group and an assumed intraclass correlation coefficient of 0.2.
-design_effect(n = 24, icc = 0.2)
-#> [1] 5.6
combines the functions for power calculation from the pwr-package and design effect design_effect()
. It computes an approximated sample size for linear mixed models (two-level-designs), based on power-calculation for standard design and adjusted for design effect for 2-level-designs.
# Sample size for multilevel model with 30 cluster groups and a small to
-# medium effect size (Cohen's d) of 0.3. 27 subjects per cluster and
-# hence a total sample size of about 802 observations is needed.
-samplesize_mixed(eff.size = .3, k = 30)
-#> $`Subjects per Cluster`
-#> [1] 27
-#> $`Total Sample Size`
-#> [1] 802
-# Sample size for multilevel model with 20 cluster groups and a medium
-# to large effect size for linear models of 0.2. Five subjects per cluster and
-# hence a total sample size of about 107 observations is needed.
-samplesize_mixed(eff.size = .2, df.n = 5, k = 20, power = .9)
-#> $`Subjects per Cluster`
-#> [1] 5
-#> $`Total Sample Size`
-#> [1] 107
There are more ways to perform power calculations for multilevel models, however, most of these require very detailed knowledge about the sample characteristics and performing simulation studies. samplesize_mixed()
is a more pragmatic alternative to these approaches.
Most functions to fit multilevel and mixed effects models only allow to specify frequency weights, but not design (i.e. sampling or probability) weights, which should be used when analyzing complex samples and survey data.
implements an algorithm proposed by Asparouhov (2006) and Carle (2009) to rescale design weights in survey data to account for the grouping structure of multilevel models, which then can be used for multilevel modelling.
To calculate a weight-vector that can be used in multilevel models, scale_weights()
needs the data frame with survey data as x
-argument. This data frame should contain 1) a cluster ID (argument cluster.id
), which represents the strata of the survey data (the level-2-cluster variable) and 2) the probability weights (argument pweight
), which represents the design or sampling weights of the survey data (level-1-weight).
then returns the original data frame, including two new variables: svywght_a
, where the sample weights pweight
are adjusted by a factor that represents the proportion of cluster size divided by the sum of sampling weights within each cluster. The adjustment factor for svywght_b
is the sum of sample weights within each cluster divided by the sum of squared sample weights within each cluster (see Carle (2009), Appendix B, for details).
-scale_weights(nhanes_sample, SDMVSTRA, WTINT2YR)
-#> # A tibble: 2,992 x 9
-#> total age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR svywght_a svywght_b
-#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-#> 1 1 2.2 1 3 2 31 97594. 1.57 1.20
-#> 2 7 2.08 2 3 1 29 39599. 0.623 0.525
-#> 3 3 1.48 2 1 2 42 26620. 0.898 0.544
-#> 4 4 1.32 2 4 2 33 34999. 0.708 0.550
-#> 5 1 2 2 1 1 41 14746. 0.422 0.312
-#> 6 6 2.2 2 4 1 38 28232. 0.688 0.516
-#> 7 350 1.6 1 3 2 33 93162. 1.89 1.46
-#> 8 NA 1.48 2 3 1 29 82276. 1.29 1.09
-#> 9 3 2.28 2 4 1 41 24726. 0.707 0.523
-#> 10 30 0.84 1 3 2 35 39895. 0.760 0.594
-#> # ... with 2,982 more rows
Asparouhov T. 2006. General Multi-Level Modeling with Sampling Weights. Communications in Statistics—Theory and Methods (35): 439–460
-Carle AC. 2009. Fitting multilevel models in complex survey data with design weights: Recommendations. BMC Medical Research Methodology 9(49): 1-13
-Hsieh FY, Lavori PW, Cohen HJ, Feussner JR. 2003. An Overview of Variance Inflation Factors for Sample-Size Calculation. Evaluation & the Health Professions 26: 239–257. doi: 10.1177/0163278703255230
-Collection of convenient functions for common statistical computations, which are not directly provided by R’s base or stats packages.
-This package aims at providing, first, shortcuts for statistical measures, which otherwise could only be calculated with additional effort (like Cramer’s V, Phi, or effict size statistics like Eta or Omega squared), or for which currently no functions available.
-Second, another focus lies on weighted variants of common statistical measures and tests like weighted standard error, mean, t-test, correlation, and more.
-The comprised tools include:
-Please visit https://strengejacke.github.io/sjstats/ for documentation and vignettes.
-To install the latest development snapshot (see latest changes below), type following commands into the R console:
-library(devtools) -devtools::install_github("strengejacke/sjstats")
To install the latest stable release from CRAN, type following command into the R console:
In case you want / have to cite my package, please use citation('sjstats')
for citation information.
- eta_sq()
) now internally call the related functions from the effectsize package.chisq_gof()
with incorrect effect sizes for certain Anova types (that included an intercept).sjstats is being re-structured, and many functions are re-implemented in new packages that are part of a new project called easystats.
-Therefore, following functions are now deprecated:
, please use effectsize::cohens_f()
, please use effectsize::standardize_parameters()
, please use parameters::model_parameters()
, please use parameters::rescale_weights()
, please use parameters::standard_error_robust()
have been renamed to weighted_*()
was renamed to survey_median()
is an alias for mwu()
is an alias for grpmean()
.sjstats is being re-structured, and many functions are re-implemented in new packages that are part of a new project called easystats. The aim of easystats is to provide a unifying and consistent framework to tame, discipline and harness the scary R statistics and their pesky models.
-Therefore, following functions are now deprecated:
, please use parameters::p_value()
, please use parameters::standard_error()
is an alias for deff()
is an alias for smpsize_lmm()
is an alias for xtab_statistics()
to fit zero-inflated Poisson models for survey-designs.phi()
and cramer()
can now compute confidence intervals.tidy_stan()
removes prior parameters from output.tidy_stan()
now also prints the probability of direction.odds_to_rr()
, to compute epsilon-squared effect-size.sjstats is being re-structured, and many functions are re-implemented in new packages that are part of a new project called easystats. The aim of easystats is to provide a unifying and consistent framework to tame, discipline and harness the scary R statistics and their pesky models.
-Therefore, following functions are now deprecated:
, please use insight::link_inverse()
, please use insight::model_info()
, please use insight::get_data()
, please use insight::find_predictors()
, please use insight::find_random()
, please use insight::find_random()
, please use insight::get_response()
, please use insight::find_response()
, please use insight::clean_names()
, please use performance::check_overdispersion()
, please use performance::check_zeroinflation()
, please use performance::check_convergence()
, please use performance::check_singularity()
, please use performance::item_reliability()
, please use performance::item_split_half()
, please use performance::performance_accuracy()
, please use performance::cronbachs_alpha()
, please use performance::item_difficulty()
, please use performance::item_intercor()
, please use parameters::principal_components()
, please use parameters::principal_components()
, please use performance::r2()
, please use performance::icc()
, please use performance::rmse()
, please use performance::rse()
, please use performance::mse()
, please use bayestestR::hdi()
, please use bayestestR::ci()
, please use bayestestR::rope()
, please use bayestestR::effective_sample()
, please use bayestestR::equivalence_test()
, please use performance::check_collinearity()
, please use performance::check_normality()
, please use performance::check_autocorrelation()
, please use performance::check_heteroscedasticity()
, please use performance::check_outliers()
) get a method
-argument to define the method for computing confidence intervals from bootstrapping.smpsize_lmm()
could result in negative sample-size recommendations. This was fixed, and a warning is now shown indicating that the parameters for the power-calculation should be modified.r
in mwu()
if group-factor contained more than two groups.model_family()
, link_inverse()
or model_frame()
: MixMod
(package GLMMadaptive), MCMCglmm, mlogit
and gmnl
, to compute uncertainty intervals of Bayesian models. Mimics the behaviour and style of hdi()
and is thus a convenient complement to functions like posterior_interval()
now finds better defaults for models with binomial outcome (like logistic regression models).r2()
for mixed models now also should work properly for mixed models fitted with rstanarm.anova_stats()
and alike (e.g. eta_sq()
) now all preserve original term names.model_family()
now returns $is_count = TRUE
, when model is a count-model, and $is_beta = TRUE
for models with beta-family.pred_vars()
checks that return value has only unique values.pred_vars()
gets a zi
-argument to return the variables from a model’s zero-inflation-formula.dplyr::n()
, to meet forthcoming changes in dplyr 0.8.0.boot_ci()
gets a ci.lvl
-argument in pca_rotate()
now supports all rotations from psych::principal()
gets a fe.only
-argument to return only fixed effects terms from mixed models, and a disp
-argument to return the variables from a model’s dispersion-formula.icc()
for Bayesian models gets an adjusted
-argument, to calculate adjusted and conditional ICC (however, only for Gaussian models).icc()
for non-Gaussian Bayes-models, a message is printed that recommends setting argument ppd
and resp_var()
now also work for brms-models with additional response information (like trial()
in formula).resp_var()
gets a combine
-argument, to return either the name of the matrix-column or the original variable names for matrix-columns.model_frame()
now also returns the original variables for matrix-column-variables.model_frame()
now also returns the variable from the dispersion-formula of glmmTMB-models.model_family()
and link_inverse()
now supports glmmPQL, felm and lm_robust-models.anova_stats()
and alike (omega_sq()
etc.) now support gam-models from package gam.p_value()
now supports objects of class svyolr
and get_re_var()
for objects returned by icc()
for Stan-models.var_names()
did not clear terms with log-log transformation, e.g. log(log(y))
for models with splines with only one column.omega_sq()
and eta_sq()
give more informative messages when using non-supported objects.r2()
and icc()
give more informative warnings and messages.tidy_stan()
supports printing simplex parameters of monotonic effects of brms models.grpmean()
and mwu()
get a file
and encoding
argument, to save the HTML output as file.model_frame()
now correctly names the offset-columns for terms provided as offset
-argument (i.e. for models where the offset was not specified inside the formula).weights
-argument in grpmean()
when variable name was passed as character vector.r2()
for glmmTMB models with ar1
random effects structure.wtd_chisqtest()
to compute a weighted Chi-squared test.wtd_median()
to compute the weighted median of variables.wtd_cor()
to compute weighted correlation coefficients of variables.mediation()
can now cope with models from different families, e.g. if the moderator or outcome is binary, while the treatment-effect is continuous.model_frame()
, link_inverse()
, pred_vars()
, resp_var()
, resp_val()
, r2()
and model_family()
now support clm2
-objects from package ordinal.anova_stats()
gives a more informative message for non-supported models or ANOVA-options.model_family()
and link_inverse()
for models fitted with pscl::hurdle()
or pscl::zeroinfl()
for grouped data frames, when grouping variable was an unlabelled factor.model_frame()
for coxph-models with polynomial or spline-terms.mediation()
for logical variables.wtd_ttest()
to compute a weighted t-test.wtd_mwu()
to compute a weighted Mann-Whitney-U or Kruskal-Wallis test.robust()
was revised, getting more arguments to specify different types of covariance-matrix estimation, and handling these more flexibly.print()
-method for tidy_stan()
for brmsfit-objects with categorical-families.se()
now also computes standard errors for relative frequencies (proportions) of a vector.r2()
now also computes r-squared values for glmmTMB-models from genpois
gives more precise warnings for non-supported model-families.xtab_statistics()
gets a weights
-argument, to compute measures of association for contingency tables for weighted data.statistics
-argument in xtab_statistics()
gets a "fisher"
-option, to force Fisher’s Exact Test to be used.icc()
for generalized linear mixed models with Poisson or negative binomial families.icc()
gets an adjusted
-argument, to calculate the adjusted and conditional ICC for mixed models.weight.by
is now deprecated and renamed into weights
now also adjusts the n
-column for weighted data.icc()
, re_var()
and get_re_var()
now correctly compute the random-effect-variances for models with multiple random slopes per random effect term (e.g., (1 + rs1 + rs2 | grp)
, mcse()
, hdi()
and n_eff()
for stan_polr()
did not work for intercept-only models.This function creates default priors for brms-regression - models, based on the same automatic prior-scale adjustment as in - rstanarm.
-auto_prior(formula, data, gaussian, locations = NULL)- -
formula | -A formula describing the model, which just needs to contain
-the model terms, but no notation of interaction, splines etc. Usually,
-you want only those predictors in the formula, for which automatic
-priors should be generated. Add informative priors afterwards to the
-returned |
data | -The data that will be used to fit the model. |
gaussian | -Logical, if the outcome is gaussian or not. |
locations | -A numeric vector with location values for the priors. If
- |
A brmsprior
is a small, convenient function to create
- some default priors for brms-models with automatically adjusted prior
- scales, in a similar way as rstanarm does. The default scale for
- the intercept is 10, for coefficients 2.5. If the outcome is gaussian,
- both scales are multiplied with sd(y)
. Then, for categorical
- variables, nothing more is changed. For numeric variables, the scales
- are divided by the standard deviation of the related variable.
- All prior distributions are normal distributions. auto_prior()
- is intended to quickly create default priors with feasible scales. If
- more precise definitions of priors are necessary, this needs to be done
- directly with brms-functions like set_prior()
As auto_prior()
also sets priors on the intercept, the model
- formula used in brms::brm()
must be rewritten to something like
- y ~ 0 + intercept ...
, see set_prior
-library(sjmisc) -data(efc) -efc$c172code <- as.factor(efc$c172code) -efc$c161sex <- to_label(efc$c161sex) - -mf <- formula(neg_c_7 ~ c161sex + c160age + c172code) - -if (requireNamespace("brms", quietly = TRUE)) - auto_prior(mf, efc, TRUE)#> prior class coef group resp dpar nlpar bound -#> 1 normal(0, 38.96) Intercept -#> 2 normal(0, 9.74) b c161sexFemale -#> 3 normal(0, 0.73) b c160age -#> 4 normal(0, 9.74) b c172code2 -#> 5 normal(0, 9.74) b c172code3-## compare to -# library(rstanarm) -# m <- stan_glm(mf, data = efc, chains = 2, iter = 200) -# ps <- prior_summary(m) -# ps$prior_intercept$adjusted_scale -# ps$prior$adjusted_scale - -## usage -# ap <- auto_prior(mf, efc, TRUE) -# brm(mf, data = efc, priors = ap) - -# add informative priors -mf <- formula(neg_c_7 ~ c161sex + c172code) - -if (requireNamespace("brms", quietly = TRUE)) { - auto_prior(mf, efc, TRUE) + - brms::prior(normal(.1554, 40), class = "b", coef = "c160age") -}#> prior class coef group resp dpar nlpar bound -#> 1 normal(0, 38.95) Intercept -#> 2 normal(0, 9.74) b c161sexFemale -#> 3 normal(0, 9.74) b c172code2 -#> 4 normal(0, 9.74) b c172code3 -#> 5 normal(0.1554, 40) b c160age-# example with binary response -efc$neg_c_7d <- ifelse(efc$neg_c_7 < median(efc$neg_c_7, na.rm = TRUE), 0, 1) -mf <- formula(neg_c_7d ~ c161sex + c160age + c172code + e17age) - -if (requireNamespace("brms", quietly = TRUE)) - auto_prior(mf, efc, FALSE)#> prior class coef group resp dpar nlpar bound -#> 1 normal(0, 10) Intercept -#> 2 normal(0, 2.5) b c161sexFemale -#> 3 normal(0, 0.19) b c160age -#> 4 normal(0, 2.5) b c172code2 -#> 5 normal(0, 2.5) b c172code3 -#> 6 normal(0, 0.31) b e17age-
- boot_ci.Rd
Compute nonparametric bootstrap estimate, standard error, - confidence intervals and p-value for a vector of bootstrap - replicate estimates.
-boot_ci(data, ..., method = c("dist", "quantile"), ci.lvl = 0.95) - -boot_se(data, ...) - -boot_p(data, ...) - -boot_est(data, ...)- -
data | -A data frame that contains the vector with bootstrapped -estimates, or directly the vector (see 'Examples'). |
... | -Optional, unquoted names of variables with bootstrapped estimates.
-Required, if either |
method | -Character vector, indicating if confidence intervals should be
-based on bootstrap standard error, multiplied by the value of the
-quantile function of the t-distribution (default), or on sample
-quantiles of the bootstrapped values. See 'Details' in |
ci.lvl | -Numeric, the level of the confidence intervals. |
A tibble
with either bootstrap estimate,
- standard error, the lower and upper confidence intervals or the
- p-value for all bootstrapped estimates.
The methods require one or more vectors of bootstrap replicate estimates - as input.
returns the bootstrapped estimate, simply by
- computing the mean value of all bootstrap estimates.
computes the nonparametric bootstrap standard
- error by calculating the standard deviation of the input vector.
The mean value of the input vector and its standard error is used
- by boot_ci()
to calculate the lower and upper confidence
- interval, assuming a t-distribution of bootstrap estimate replicates
- (for method = "dist"
, the default, which is
- mean(x) +/- qt(.975, df = length(x) - 1) * sd(x)
); for
- method = "quantile"
, 95% sample quantiles are used to compute
- the confidence intervals (quantile(x, probs = c(.025, .975))
- Use ci.lvl
to change the level for the confidence interval.
P-values from boot_p()
are also based on t-statistics,
- assuming normal distribution.
Carpenter J, Bithell J. Bootstrap confidence intervals: when, which, what? A practical guide for medical statisticians. Statist. Med. 2000; 19:1141-1164
to generate nonparametric bootstrap samples.
-library(dplyr) -library(purrr) -data(efc) -bs <- bootstrap(efc, 100) - -# now run models for each bootstrapped sample -bs$models <- map(bs$strap, ~lm(neg_c_7 ~ e42dep + c161sex, data = .x)) - -# extract coefficient "dependency" and "gender" from each model -bs$dependency <- map_dbl(bs$models, ~coef(.x)[2]) -bs$gender <- map_dbl(bs$models, ~coef(.x)[3]) - -# get bootstrapped confidence intervals -boot_ci(bs$dependency)#> term conf.low conf.high -#> 1 x 1.320903 1.77407#> 2.5 % 97.5 % -#> 1.292945 1.796430-# alternative function calls. -boot_ci(bs$dependency)#> term conf.low conf.high -#> 1 x 1.320903 1.77407boot_ci(bs, dependency)#> term conf.low conf.high -#> 1 dependency 1.320903 1.77407boot_ci(bs, dependency, gender)#> term conf.low conf.high -#> 1 dependency 1.3209034 1.7740701 -#> 2 gender -0.1016646 0.9788897boot_ci(bs, dependency, gender, method = "q")#> term conf.low conf.high -#> 1 dependency 1.30501832 1.763890 -#> 2 gender -0.07012629 0.916922#> [1] 1.547487boot_est(bs$dependency)#> term estimate -#> 1 x 1.547487#> e42dep -#> 1.544687- -# bootstrap() and boot_ci() work fine within pipe-chains -efc %>% - bootstrap(100) %>% - mutate( - models = map(strap, ~lm(neg_c_7 ~ e42dep + c161sex, data = .x)), - dependency = map_dbl(models, ~coef(.x)[2]) - ) %>% - boot_ci(dependency)#> term conf.low conf.high -#> 1 dependency 1.263093 1.779068-# check p-value -boot_p(bs$gender)#> term p.value -#> 1 x 0.1103975#> Estimate Std. 
Error t value Pr(>|t|) -#> 0.4339069 0.2818786 1.5393398 0.1240780-if (FALSE) { -# 'spread_coef()' from the 'sjmisc'-package makes it easy to generate -# bootstrapped statistics like confidence intervals or p-values -library(dplyr) -library(sjmisc) -efc %>% - # generate bootstrap replicates - bootstrap(100) %>% - # apply lm to all bootstrapped data sets - mutate( - models = map(strap, ~lm(neg_c_7 ~ e42dep + c161sex + c172code, data = .x)) - ) %>% - # spread model coefficient for all 100 models - spread_coef(models) %>% - # compute the CI for all bootstrapped model coefficients - boot_ci(e42dep, c161sex, c172code) - -# or... -efc %>% - # generate bootstrap replicates - bootstrap(100) %>% - # apply lm to all bootstrapped data sets - mutate( - models = map(strap, ~lm(neg_c_7 ~ e42dep + c161sex + c172code, data = .x)) - ) %>% - # spread model coefficient for all 100 models - spread_coef(models, append = FALSE) %>% - # compute the CI for all bootstrapped model coefficients - boot_ci()}
Generates n
bootstrap samples of data
- returns the bootstrapped data frames as list-variable.
bootstrap(data, n, size)- -
data | -A data frame. |
n | -Number of bootstraps to be generated. |
size | -Optional, size of the bootstrap samples. May either be a number
-between 1 and |
A data frame with one column: a list-variable
- strap
, which contains resample-objects of class sj_resample
- These resample-objects are lists with three elements:
the original data frame, data
the rownumbers id
, i.e. rownumbers of data
, indicating the resampled rows with replacement
the resample.id
, indicating the index of the resample (i.e. the position of the sj_resample
-object in the list strap
By default, each bootstrap sample has the same number of observations
- as data
. To generate bootstrap samples without resampling
- same observations (i.e. sampling without replacement), use
- size
to get bootstrapped data with a specific number
- of observations. However, specifying the size
-argument is much
- less memory-efficient than the bootstrap with replacement. Hence,
- it is recommended to ignore the size
-argument, if it is
- not really needed.
This function applies nonparametric bootstrapping, i.e. the function
- draws samples with replacement.
- There is an as.data.frame
- and a print
-method to get or
- print the resampled data frames. See 'Examples'. The as.data.frame
- method automatically applies whenever coercion is done because a data
- frame is required as input. See 'Examples' in boot_ci
to calculate confidence intervals from
- bootstrap samples.
-data(efc) -bs <- bootstrap(efc, 5) - -# now run models for each bootstrapped sample -lapply(bs$strap, function(x) lm(neg_c_7 ~ e42dep + c161sex, data = x))#> [[1]] -#> -#> Call: -#> lm(formula = neg_c_7 ~ e42dep + c161sex, data = x) -#> -#> Coefficients: -#> (Intercept) e42dep c161sex -#> 6.9036 1.4385 0.4329 -#> -#> -#> [[2]] -#> -#> Call: -#> lm(formula = neg_c_7 ~ e42dep + c161sex, data = x) -#> -#> Coefficients: -#> (Intercept) e42dep c161sex -#> 7.8918 1.3693 -0.1371 -#> -#> -#> [[3]] -#> -#> Call: -#> lm(formula = neg_c_7 ~ e42dep + c161sex, data = x) -#> -#> Coefficients: -#> (Intercept) e42dep c161sex -#> 7.7762 1.3265 0.1469 -#> -#> -#> [[4]] -#> -#> Call: -#> lm(formula = neg_c_7 ~ e42dep + c161sex, data = x) -#> -#> Coefficients: -#> (Intercept) e42dep c161sex -#> 5.4334 1.7106 0.7536 -#> -#> -#> [[5]] -#> -#> Call: -#> lm(formula = neg_c_7 ~ e42dep + c161sex, data = x) -#> -#> Coefficients: -#> (Intercept) e42dep c161sex -#> 6.3228 1.5540 0.5277 -#> -#>-# generate bootstrap samples with 600 observations for each sample -bs <- bootstrap(efc, 5, 600) - -# generate bootstrap samples with 70% observations of the original sample size -bs <- bootstrap(efc, 5, .7) - -# compute standard error for a simple vector from bootstraps -# use the `as.data.frame()`-method to get the resampled -# data frame -bs <- bootstrap(efc, 100) -bs$c12hour <- unlist(lapply(bs$strap, function(x) { - mean(as.data.frame(x)$c12hour, na.rm = TRUE) -})) - -# or as tidyverse-approach -if (require("dplyr") && require("purrr")) { - bs <- efc %>% - bootstrap(100) %>% - mutate( - c12hour = map_dbl(strap, ~mean(as.data.frame(.x)$c12hour, na.rm = TRUE)) - ) - - # bootstrapped standard error - boot_se(bs, c12hour) -}#>#> -#>#>-#> -#>#>-#> -#>#>#> -#>#>-#> -#>#> term std.err -#> 1 c12hour 1.6688
detects outliers in (generalized) linear models.
checks a linear model for (non-)constant error variance.
checks for independence of errors.
checks linear models for (non-)normality of residuals.
checks predictors of linear models for multicollinearity.
checks all of the above assumptions.
check_assumptions(x, model.column = NULL, as.logical = FALSE, ...) - -outliers(x, iterations = 5) - -heteroskedastic(x, model.column = NULL) - -autocorrelation(x, model.column = NULL, ...) - -normality(x, model.column = NULL) - -multicollin(x, model.column = NULL)- -
x | -Fitted |
model.column | -Name or index of the list-variable that contains the fitted
-model objects. Only applies, if |
as.logical | -Logical, if |
... | -Other arguments, passed down to |
iterations | -Numeric, indicates the number of iterations to remove -outliers. |
A data frame with the respective statistics.
- -These functions are wrappers that compute various test statistics,
- however, each of them returns a tibble instead of a list of values.
- Furthermore, all functions can also be applied to multiple models
- stored in list-variables (see 'Examples').
- outliers()
wraps outlierTest
and iteratively
- removes outliers for iterations
times, or if the r-squared value
- (for glm: the AIC) did not improve after removing outliers. The function
- returns a tibble with r-squared and AIC statistics for the original
- and updated model, as well as the update model itself ($updated.model
- the number ($removed.count
) and indices of the removed observations
- ($removed.obs
- heteroskedastic()
wraps ncvTest
and returns
- the p-value of the test statistics as tibble. A p-value < 0.05 indicates
- a non-constant variance (heteroskedasticity).
- autocorrelation()
wraps durbinWatsonTest
- and returns the p-value of the test statistics as tibble. A p-value
- < 0.05 indicates autocorrelated residuals. In such cases, robust
- standard errors (see robust
return more accurate results
- for the estimates, or maybe a mixed model with error term for the
- cluster groups should be used.
- normality()
calls shapiro.test
- and checks the standardized residuals for normal distribution.
- The p-value of the test statistics is returned as tibble. A p-value
- < 0.05 indicates a significant deviation from normal distribution.
- Note that this formal test almost always yields significant results
- for the distribution of residuals and visual inspection (e.g. qqplots)
- are preferable (see plot_model
- type = "diag"
- multicollin()
wraps vif
and returns
- the maximum vif-value from a model as tibble. If this value is
- larger than about 4, multicollinearity exists, else not.
- In case of multicollinearity, the names of independent
- variables that contribute to multicollinearity are printed
- to the console.
- check_assumptions()
runs all of the above tests and returns
- a tibble with all test statistics included. In case the p-values
- are too confusing, use the as.logical
argument, where all
- p-values are replaced with either TRUE
(in case of violation)
- or FALSE
(in case of model conforms to assumption of linear
- regression).
These formal tests are very strict and in most cases violation of model
- assumptions are alerted, though the model is actually ok. It is
- preferable to check model assumptions based on visual inspection
- (see plot_model
with type = "diag"
-data(efc) - -fit <- lm(barthtot ~ c160age + c12hour + c161sex + c172code, data = efc) -outliers(fit)#>heteroskedastic(fit)#>#> heteroskedastic -#> 1 3.885808e-07autocorrelation(fit)#>#> autocorrelation -#> 1 0normality(fit)#>#> non.normality -#> 1 1.535796e-13check_assumptions(fit)#> -#> # Checking Model-Assumptions -#> -#> Model: barthtot ~ c160age + c12hour + c161sex + c172code -#> -#> violated statistic -#> Heteroskedasticity yes p = 0.000 -#> Non-normal residuals yes p = 0.000 -#> Autocorrelated residuals yes p = 0.000 -#> Multicollinearity no vif = 1.153-fit <- lm(barthtot ~ c160age + c12hour + c161sex + c172code + neg_c_7, - data = efc) -outliers(fit)#>#> models adjusted.r2 aic -#> 1 original 0.3458095 7487.639 -#> 2 updated 0.3530485 7468.980check_assumptions(fit, as.logical = TRUE)#> heteroskedasticity multicollinearity non.normal.resid autocorrelation -#> 1 TRUE FALSE TRUE TRUE-# apply function to multiple models in list-variable -library(purrr) -library(dplyr) -tmp <- efc %>% - bootstrap(50) %>% - mutate( - models = map(strap, ~lm(neg_c_7 ~ e42dep + c12hour + c161sex, data = .x)) - ) - -# for list-variables, argument 'model.column' is the -# quoted name of the list-variable with fitted models -tmp %>% normality("models")#> non.normality -#> 1 3.230058e-19 -#> 2 3.827347e-16 -#> 3 9.098247e-22 -#> 4 9.981165e-19 -#> 5 6.428575e-18 -#> 6 3.224082e-20 -#> 7 2.118794e-19 -#> 8 1.288646e-16 -#> 9 2.328003e-19 -#> 10 8.999720e-21 -#> 11 6.353047e-21 -#> 12 1.026013e-19 -#> 13 2.390006e-19 -#> 14 6.701265e-19 -#> 15 2.551566e-19 -#> 16 3.057700e-18 -#> 17 2.117815e-19 -#> 18 1.077834e-17 -#> 19 7.594577e-18 -#> 20 9.239676e-19 -#> 21 3.149844e-21 -#> 22 5.519879e-20 -#> 23 3.637194e-18 -#> 24 1.764174e-21 -#> 25 6.623029e-19 -#> 26 1.181069e-19 -#> 27 3.168530e-19 -#> 28 4.854560e-18 -#> 29 2.001162e-20 -#> 30 7.352369e-22 -#> 31 1.018042e-20 -#> 32 7.376915e-19 -#> 33 1.207461e-18 -#> 34 4.070908e-16 -#> 35 7.510733e-18 -#> 36 2.049412e-20 -#> 37 1.243810e-19 
-#> 38 2.813157e-19 -#> 39 3.095559e-22 -#> 40 8.518521e-21 -#> 41 5.261510e-16 -#> 42 1.156834e-18 -#> 43 5.360721e-21 -#> 44 3.306733e-18 -#> 45 8.478782e-20 -#> 46 1.000268e-20 -#> 47 3.684449e-21 -#> 48 1.301398e-17 -#> 49 3.625559e-21 -#> 50 8.619752e-18tmp %>% heteroskedastic("models")#> heteroskedastic -#> 1 1.117145e-06 -#> 2 4.136378e-16 -#> 3 2.304193e-11 -#> 4 2.303091e-13 -#> 5 2.989163e-09 -#> 6 1.617715e-06 -#> 7 2.462472e-06 -#> 8 5.650847e-11 -#> 9 1.949355e-05 -#> 10 9.487583e-12 -#> 11 8.865950e-05 -#> 12 1.001106e-11 -#> 13 9.380742e-09 -#> 14 2.692178e-11 -#> 15 2.777129e-12 -#> 16 1.156712e-08 -#> 17 1.142231e-09 -#> 18 3.003215e-05 -#> 19 3.513974e-10 -#> 20 8.914693e-07 -#> 21 5.075324e-07 -#> 22 1.443102e-04 -#> 23 3.889332e-08 -#> 24 2.962249e-05 -#> 25 1.108236e-11 -#> 26 1.494601e-11 -#> 27 9.264544e-11 -#> 28 2.394459e-08 -#> 29 3.738415e-03 -#> 30 3.710447e-07 -#> 31 2.980297e-12 -#> 32 4.978182e-15 -#> 33 1.403946e-07 -#> 34 1.833919e-14 -#> 35 2.311357e-05 -#> 36 1.506452e-13 -#> 37 3.740443e-08 -#> 38 2.266819e-07 -#> 39 9.948937e-11 -#> 40 8.844100e-08 -#> 41 3.306678e-14 -#> 42 4.618659e-10 -#> 43 4.902720e-07 -#> 44 8.314707e-12 -#> 45 8.818074e-11 -#> 46 8.447289e-10 -#> 47 5.206397e-09 -#> 48 3.063647e-15 -#> 49 1.658216e-08 -#> 50 1.059808e-12-# Durbin-Watson-Test from package 'car' takes a little bit longer due -# to simulation of p-values... -# NOT RUN { -tmp %>% check_assumptions("models", as.logical = TRUE, reps = 100) -# }-
For logistic regression models, performs a Chi-squared - goodness-of-fit-test.
-chisq_gof(x, prob = NULL, weights = NULL)- -
x | -A numeric vector or a |
prob | -Vector of probabilities (indicating the population probabilities)
-of the same length as |
weights | -Vector with weights, used to weight |
For vectors, returns the object of the computed chisq.test
- For glm
-objects, an object of class chisq_gof
- following values: p.value
, the p-value for the goodness-of-fit test;
- z.score
, the standardized z-score for the goodness-of-fit test;
- rss
, the residual sums of squares term and chisq
, the pearson
- chi-squared statistic.
For vectors, this function is a convenient function for the
- chisq.test()
, performing goodness-of-fit test. For
- glm
-objects, this function performs a goodness-of-fit test.
- A well-fitting model shows no significant difference between the
- model and the observed data, i.e. the reported p-values should be
- greater than 0.05.
Hosmer, D. W., & Lemeshow, S. (2000). Applied Logistic Regression. Hoboken, NJ, USA: John Wiley & Sons, Inc. doi: 10.1002/0471722146
- --data(efc) -efc$neg_c_7d <- ifelse(efc$neg_c_7 < median(efc$neg_c_7, na.rm = TRUE), 0, 1) -m <- glm( - neg_c_7d ~ c161sex + barthtot + c172code, - data = efc, - family = binomial(link = "logit") -) - -# goodness-of-fit test for logistic regression -chisq_gof(m)#> -#> # Chi-squared Goodness-of-Fit Test -#> -#> Chi-squared: 852.765 -#> z-score: 1.025 -#> p-value: 0.305 -#>#>-# goodness-of-fit test for vectors against probabilities -# differing from population -chisq_gof(efc$e42dep, c(0.3,0.2,0.22,0.28))#> -#> Chi-squared test for given probabilities -#> -#> data: dummy -#> X-squared = 234.76, df = 3, p-value < 2.2e-16 -#>#> -#> Chi-squared test for given probabilities -#> -#> data: dummy -#> X-squared = 0, df = 3, p-value = 1 -#>-
Compute Goodness-of-fit measures for various regression models, - including mixed and Bayesian regression models.
- -cod(x) - -r2(x, ...) - -# S3 method for lme -r2(x, n = NULL, ...) - -# S3 method for stanreg -r2(x, loo = FALSE, ...) - -# S3 method for brmsfit -r2(x, loo = FALSE, ...)- -
x | -Fitted model of class |
... | -Currently not used. |
n | -Optional, an |
loo | -Logical, if |
For r2()
, depending on the model, returns:
For linear models, the r-squared and adjusted r-squared values.
For mixed models, the marginal and conditional r-squared values.
For glm
objects, Cox & Snell's and Nagelkerke's pseudo r-squared values.
For brmsfit
or stanreg
objects, the Bayesian version of r-squared is computed, calling rstantools::bayes_R2()
If loo = TRUE
, for brmsfit
or stanreg
objects a LOO-adjusted version of r-squared is returned.
Models that are not currently supported return NULL
For cod()
, returns the D
Coefficient of Discrimination,
- also known as Tjur's R-squared value.
For linear models, the r-squared and adjusted r-squared value is returned,
- as provided by the summary
- For mixed models (from lme4 or glmmTMB) marginal and
- conditional r-squared values are calculated, based on
- Nakagawa et al. 2017. The distributional variance
- (or observation-level variance) is based on lognormal approximation,
- log(1+var(x)/mu^2)
- For lme
-models, an r-squared approximation by computing the
- correlation between the fitted and observed values, as suggested by
- Byrnes (2008), is returned as well as a simplified version of
- the Omega-squared value (1 - (residual variance / response variance),
- Xu (2003), Nakagawa, Schielzeth 2013), unless n
- is specified.
- If n
is given, for lme
-models pseudo r-squared measures based
- on the variances of random intercept (tau 00, between-group-variance)
- and random slope (tau 11, random-slope-variance), as well as the
- r-squared statistics as proposed by Snijders and Bosker 2012 and
- the Omega-squared value (1 - (residual variance full model / residual
- variance null model)) as suggested by Xu (2003) are returned.
- For generalized linear models, Cox & Snell's and Nagelkerke's
- pseudo r-squared values are returned.
- The ("unadjusted") r-squared value and its standard error for
- brmsfit
or stanreg
objects are robust measures, i.e.
- the median is used to compute r-squared, and the median absolute
- deviation as the measure of variability. If loo = TRUE
- a LOO-adjusted r-squared is calculated, which comes conceptually
- closer to an adjusted r-squared measure.
This method calculates the Coefficient of Discrimination D
- for generalized linear (mixed) models for binary data. It is
- an alternative to other Pseudo-R-squared values like Nagelkerke's
- R2 or Cox-Snell R2. The Coefficient of Discrimination D
- can be read like any other (Pseudo-)R-squared value.
For mixed models, the marginal r-squared considers only the variance
- of the fixed effects, while the conditional r-squared takes both
- the fixed and random effects into account.
- For lme
-objects, if n
is given, the Pseudo-R2 statistic
- is the proportion of explained variance in the random effect after
- adding co-variates or predictors to the model, or in short: the
- proportion of the explained variance in the random effect of the
- full (conditional) model x
compared to the null (unconditional)
- model n
- The Omega-squared statistics, if n
is given, is 1 - the proportion
- of the residual variance of the full model compared to the null model's
- residual variance, or in short: the proportion of the residual
- variation explained by the covariates.
- Alternative ways to assess the "goodness-of-fit" is to compare the ICC
- of the null model with the ICC of the full model (see icc
Bolker B et al. (2017): GLMM FAQ
Byrnes, J. 2008. Re: Coefficient of determination (R^2) when using lme() (https://stat.ethz.ch/pipermail/r-sig-mixed-models/2008q2/000713.html)
Kwok OM, Underhill AT, Berry JW, Luo W, Elliott TR, Yoon M. 2008. Analyzing Longitudinal Data with Multilevel Models: An Example with Individuals Living with Lower Extremity Intra-Articular Fractures. Rehabilitation Psychology 53(3): 370-86. doi: 10.1037/a0012765
Nakagawa S, Schielzeth H. 2013. A general and simple method for obtaining R2 from generalized linear mixed-effects models. Methods in Ecology and Evolution, 4(2):133-142. doi: 10.1111/j.2041-210x.2012.00261.x
Nakagawa S, Johnson P, Schielzeth H (2017) The coefficient of determination R2 and intra-class correlation coefficient from generalized linear mixed-effects models revisited and expanded. J. R. Soc. Interface 14. doi: 10.1098/rsif.2017.0213
Rabe-Hesketh S, Skrondal A. 2012. Multilevel and longitudinal modeling using Stata. 3rd ed. College Station, Tex: Stata Press Publication
Raudenbush SW, Bryk AS. 2002. Hierarchical linear models: applications and data analysis methods. 2nd ed. Thousand Oaks: Sage Publications
Snijders TAB, Bosker RJ. 2012. Multilevel analysis: an introduction to basic and advanced multilevel modeling. 2nd ed. Los Angeles: Sage
Xu, R. 2003. Measuring explained variation in linear mixed effects models. Statist. Med. 22:3527-3541. doi: 10.1002/sim.1572
Tjur T. 2009. Coefficients of determination in logistic regression models - a new proposal: The coefficient of discrimination. The American Statistician, 63(4): 366-372
-data(efc) - -# Tjur's R-squared value -efc$services <- ifelse(efc$tot_sc_e > 0, 1, 0) -fit <- glm(services ~ neg_c_7 + c161sex + e42dep, - data = efc, family = binomial(link = "logit")) -cod(fit)#> -#> R-Squared for (Generalized) Linear (Mixed) Model -#> -#> Tjur's D: 0.023 -#>-library(lme4)#>#> -#> R-Squared for (Generalized) Linear (Mixed) Model -#> -#> Family : gaussian (identity) -#> Formula: ~Days | Subject Reaction ~ Days NA -#> -#> Marginal R2: 0.279 -#> Conditional R2: 0.799 -#>#> -#> R-Squared for (Generalized) Linear (Mixed) Model -#> -#> R-squared: 0.256 -#> adjusted R-squared: 0.254 -#>-# Pseudo-R-squared values -fit <- glm(services ~ neg_c_7 + c161sex + e42dep, - data = efc, family = binomial(link = "logit")) -r2(fit)#> -#> R-Squared for (Generalized) Linear (Mixed) Model -#> -#> Cox & Snell's R-squared: 0.023 -#> Nagelkerke's R-squared: 0.030 -#>-
provides an alternative convergence test for
- merMod
-objects; is_singular()
- post-fitting convergence warnings. If the model fit is singular,
- warning about negative eigenvalues of the Hessian can most likely
- be ignored.
converge_ok(x, tolerance = 0.001) - -is_singular(x, tolerance = 1e-05, ...)- -
x | -A |
tolerance | -Indicates up to which value the convergence result is
-accepted. The smaller |
... | -Currently not used. |
For converge_ok()
, a logical vector, which is TRUE
- convergence is fine and FALSE
if convergence is suspicious.
- Additionally, the convergence value is returned as return value's name.
- is_singular()
returns TRUE
if the model fit is singular.
provides an alternative convergence test for
- merMod
-objects, as discussed
- here
- and suggested by Ben Bolker in
- this comment.
- If a model is "singular", this means that some dimensions of the variance-covariance
- matrix have been estimated as exactly zero. is_singular()
checks if
- a model fit is singular, and can be used in case of post-fitting convergence
- warnings, such as warnings about negative eigenvalues of the Hessian. If the fit
- is singular (i.e. is_singular()
returns TRUE
), these warnings
- can most likely be ignored.
- There is no gold-standard about how to deal with singularity and which
- random-effects specification to choose. Beside using fully Bayesian methods
- (with informative priors), proposals in a frequentist framework are:
avoid fitting overly complex models, such that the variance-covariance matrices can be estimated precisely enough (Matuschek et al. 2017)
use some form of model selection to choose a model that balances predictive accuracy and overfitting/type I error (Bates et al. 2015, Matuschek et al. 2017)
“keep it maximal”, i.e. fit the most complex model consistent with the experimental design, removing only terms required to allow a non-singular fit (Barr et al. 2013)
Bates D, Kliegl R, Vasishth S, Baayen H. Parsimonious Mixed Models. arXiv:1506.04967, June 2015.
Barr DJ, Levy R, Scheepers C, Tily HJ. Random effects structure for confirmatory hypothesis testing: Keep it maximal. Journal of Memory and Language, 68(3):255-278, April 2013.
Matuschek H, Kliegl R, Vasishth S, Baayen H, Bates D. Balancing type I error and power in linear mixed models. Journal of Memory and Language, 94:305-315, 2017.
-library(sjmisc) -library(lme4) -data(efc) -# create binary response -efc$hi_qol <- dicho(efc$quol_5) -# prepare group variable -efc$grp = as.factor(efc$e15relat) -# data frame for fitted model -mydf <- data.frame(hi_qol = as.factor(efc$hi_qol), - sex = as.factor(efc$c161sex), - c12hour = as.numeric(efc$c12hour), - neg_c_7 = as.numeric(efc$neg_c_7), - grp = efc$grp) -# fit glmer -fit <- glmer(hi_qol ~ sex + c12hour + neg_c_7 + (1|grp), - data = mydf, family = binomial("logit")) - -converge_ok(fit)#> 1.19757278379967e-05 -#> TRUE-
This function calculates various measures of association for - contingency tables and returns the statistic and p-value. - Supported measures are Cramer's V, Phi, Spearman's rho, - Kendall's tau and Pearson's r.
-cramer(tab, ...) - -# S3 method for formula -cramer( - formula, - data, - ci.lvl = NULL, - n = 1000, - method = c("dist", "quantile"), - ... -) - -phi(tab, ...) - -crosstable_statistics( - data, - x1 = NULL, - x2 = NULL, - statistics = c("auto", "cramer", "phi", "spearman", "kendall", "pearson", "fisher"), - weights = NULL, - ... -) - -xtab_statistics( - data, - x1 = NULL, - x2 = NULL, - statistics = c("auto", "cramer", "phi", "spearman", "kendall", "pearson", "fisher"), - weights = NULL, - ... -)- -
tab | -A |
... | -Other arguments, passed down to the statistic functions
- |
formula | -A formula of the form |
data | -A data frame or a table object. If a table object, |
ci.lvl | -Scalar between 0 and 1. If not |
n | -Number of bootstraps to be generated. |
method | -Character vector, indicating if confidence intervals should be
-based on bootstrap standard error, multiplied by the value of the
-quantile function of the t-distribution (default), or on sample
-quantiles of the bootstrapped values. See 'Details' in |
x1 | -Name of first variable that should be used to compute the
-contingency table. If |
x2 | -Name of second variable that should be used to compute the
-contingency table. If |
statistics | -Name of measure of association that should be computed. May
-be one of |
weights | -Name of variable in |
For phi()
, the table's Phi value. For cramer()
, the
- table's Cramer's V.
- For crosstable_statistics()
, a list with following components:
the value of the estimated measure of association.
the p-value for the test.
the value of the test statistic.
the name of the test statistic.
if applicable, the name of the test statistic, in HTML-format.
the degrees of freedom for the contingency table.
character string indicating the name of the measure of association.
if applicable, the name of the measure of association, in HTML-format.
the short form of association measure, equals the statistics
logical, if Fisher's exact test was used to calculate the p-value.
The p-value for Cramer's V and the Phi coefficient are based
- on chisq.test()
. If any expected value of a table cell is
- smaller than 5, or smaller than 10 and the df is 1, then fisher.test()
- is used to compute the p-value, unless statistics = "fisher"
; in
- this case, the use of fisher.test()
is forced to compute the
- p-value. The test statistic is calculated with cramer()
- phi()
- Both test statistic and p-value for Spearman's rho, Kendall's tau
- and Pearson's r are calculated with cor.test()
- When statistics = "auto"
, only Cramer's V or Phi are calculated,
- based on the dimension of the table (i.e. if the table has more than
- two rows or columns, Cramer's V is calculated, else Phi).
-# Phi coefficient for 2x2 tables -tab <- table(sample(1:2, 30, TRUE), sample(1:2, 30, TRUE)) -phi(tab)#> [1] 0.1336306-# Cramer's V for nominal variables with more than 2 categories -tab <- table(sample(1:2, 30, TRUE), sample(1:3, 30, TRUE)) -cramer(tab)#> [1] 0.180269#> [1] 0.05258249-# bootstrapped confidence intervals -cramer(e16sex ~ c161sex, data = efc, ci.lvl = .95, n = 100)#> cramer conf.low conf.high -#> 1 0.05258249 -0.00860759 0.111614-# 2x2 table, compute Phi automatically -crosstable_statistics(efc, e16sex, c161sex)#> -#> # Measure of Association for Contingency Tables -#> -#> Chi-squared: 2.2327 -#> Phi: 0.0526 -#> p-value: 0.1351-# more dimensions than 2x2, compute Cramer's V automatically -crosstable_statistics(efc, c172code, c161sex)#> -#> # Measure of Association for Contingency Tables -#> -#> Chi-squared: 4.1085 -#> Cramer's V: 0.0699 -#> p-value: 0.1282-# ordinal data, use Kendall's tau -crosstable_statistics(efc, e42dep, quol_5, statistics = "kendall")#> -#> # Measure of Association for Contingency Tables -#> -#> z: -9.5951 -#> Kendall's tau: -0.2496 -#> p-value: <0.001-# calcilate Spearman's rho, with continuity correction -crosstable_statistics(efc, - e42dep, - quol_5, - statistics = "spearman", - exact = FALSE, - continuity = TRUE -)#> -#> # Measure of Association for Contingency Tables -#> -#> S: 157974157.4198 -#> Spearman's rho: -0.3177 -#> p-value: <0.001-
Compute the coefficient of variation.
-cv(x, ...)- -
x | -Fitted linear model of class |
... | -More fitted model objects, to compute multiple coefficients of -variation at once. |
Numeric, the coefficient of variation.
-The advantage of the cv is that it is unitless. This allows - coefficients of variation to be compared to each other in ways - that other measures, like standard deviations or root mean - squared residuals, cannot be.
- --#> [1] 0.3948098-
computes the root mean squared error from a model fitted
- to kfold cross-validated test-training-data. cv_compare()
- does the same, for multiple formulas at once (by calling cv_error()
- for each formula).
cv_error(data, formula, k = 5) - -cv_compare(data, formulas, k = 5)- -
data | -A data frame. |
formula | -The formula to fit the linear model for the test and training data. |
k | -The number of folds for the kfold-crossvalidation. |
formulas | -A list of formulas, to fit linear models for the test and training data. |
A data frame with the root mean squared errors for the training and test data.
first generates cross-validated test-training pairs, using
- crossv_kfold
and then fits a linear model, which
- is described in formula
, to the training data. Then, predictions
- for the test data are computed, based on the trained models.
- The training error is the mean value of the rmse
- all trained models; the test error is the rmse based on all
- residuals from the test data.
-#> Warning: unnest() has a new interface. See ?unnest for details. -#> Try `df %>% unnest(c(predicted, residuals))`, with `mutate()` if needed#> model train.error test.error -#> 1 neg_c_7 ~ barthtot + c161sex 3.5065 3.519-cv_compare(efc, formulas = list( - neg_c_7 ~ barthtot + c161sex, - neg_c_7 ~ barthtot + c161sex + e42dep, - neg_c_7 ~ barthtot + c12hour -))#> Warning: unnest() has a new interface. See ?unnest for details. -#> Try `df %>% unnest(c(predicted, residuals))`, with `mutate()` if needed#> Warning: unnest() has a new interface. See ?unnest for details. -#> Try `df %>% unnest(c(predicted, residuals))`, with `mutate()` if needed#> Warning: unnest() has a new interface. See ?unnest for details. -#> Try `df %>% unnest(c(predicted, residuals))`, with `mutate()` if needed#> model train.error test.error -#> 1 neg_c_7 ~ barthtot + c161sex 3.5066 3.5223 -#> 2 neg_c_7 ~ barthtot + c161sex + e42dep 3.4865 3.5089 -#> 3 neg_c_7 ~ barthtot + c12hour 3.5028 3.5205-
Compute the design effect (also called Variance Inflation Factor) - for mixed models with two-level design.
- -deff(n, icc = 0.05)- -
n | -Average number of observations per grouping cluster (i.e. level-2 unit). |
icc | -Assumed intraclass correlation coefficient for multilevel-model. |
The design effect (Variance Inflation Factor) for the two-level model.
- -The formula for the design effect is simply (1 + (n - 1) * icc)
Bland JM. 2000. Sample size in guidelines trials. Fam Pract. (17), 17-20.
- Hsieh FY, Lavori PW, Cohen HJ, Feussner JR. 2003. An Overview of Variance Inflation Factors for Sample-Size Calculation. Evaluation and the Health Professions 26: 239-257. doi: 10.1177/0163278703255230
- Snijders TAB. 2005. Power and Sample Size in Multilevel Linear Models. In: Everitt BS, Howell DC (Hrsg.). Encyclopedia of Statistics in Behavioral Science. Chichester, UK: John Wiley and Sons, Ltd. doi: 10.1002/0470013192.bsa492
- Thompson DM, Fernald DH, Mold JW. 2012. Intraclass Correlation Coefficients Typical of Cluster-Randomized Studies: Estimates From the Robert Wood Johnson Prescription for Health Projects. The Annals of Family Medicine;10(3):235-40. doi: 10.1370/afm.1347
-# Design effect for two-level model with 30 observations per -# cluster group (level-2 unit) and an assumed intraclass -# correlation coefficient of 0.05. -deff(n = 30)#> [1] 2.45-# Design effect for two-level model with 24 observation per cluster -# group and an assumed intraclass correlation coefficient of 0.2. -deff(n = 24, icc = 0.2)#> [1] 5.6-
Compute the design effect (also called Variance Inflation Factor) - for mixed models with two-level design.
-design_effect(n, icc = 0.05)- -
n | -Average number of observations per grouping cluster (i.e. level-2 unit). |
icc | -Assumed intraclass correlation coefficient for multilevel-model. |
The design effect (Variance Inflation Factor) for the two-level model.
-The formula for the design effect is simply (1 + (n - 1) * icc)
Bland JM. 2000. Sample size in guidelines trials. Fam Pract. (17), 17-20.
- Hsieh FY, Lavori PW, Cohen HJ, Feussner JR. 2003. An Overview of Variance Inflation Factors for Sample-Size Calculation. Evaluation and the Health Professions 26: 239-257. doi: 10.1177/0163278703255230
- Snijders TAB. 2005. Power and Sample Size in Multilevel Linear Models. In: Everitt BS, Howell DC (Hrsg.). Encyclopedia of Statistics in Behavioral Science. Chichester, UK: John Wiley and Sons, Ltd. doi: 10.1002/0470013192.bsa492
- Thompson DM, Fernald DH, Mold JW. 2012. Intraclass Correlation Coefficients Typical of Cluster-Randomized Studies: Estimates From the Robert Wood Johnson Prescription for Health Projects. The Annals of Family Medicine;10(3):235-40. doi: 10.1370/afm.1347
-# Design effect for two-level model with 30 observations per -# cluster group (level-2 unit) and an assumed intraclass -# correlation coefficient of 0.05. -design_effect(n = 30)#> [1] 2.45-# Design effect for two-level model with 24 observation per cluster -# group and an assumed intraclass correlation coefficient of 0.2. -design_effect(n = 24, icc = 0.2)#> [1] 5.6-
helper-function, telling user if model is supported or not
- -.badlink(link, family)- - -
glmmTMB returns a list of model information, one for conditional and one for zero-inflated part, so here we "unlist" it
- -.collapse_cond(x)- - -
Get distributional variance for beta-family
- -.get_variance_beta(mu, phi)- - -
Get dispersion-specific variance
- -.get_variance_dispersion(x, vals, faminfo, obs.terms)- - -
- -.get_variance_fixed(vals)- - -
- dot-get_variance_random.Rd
- -.get_variance_random(terms, x, vals)- - -
- dot-get_variance_residual.Rd
- -.get_variance_residual(x, var.cor, faminfo, name)- - -
Returns the (partial) eta-squared, (partial) omega-squared,
- epsilon-squared statistic or Cohen's F for all terms in an anova.
- anova_stats()
returns a tidy summary, including all these statistics
- and power for each term.
anova_stats(model, digits = 3) - -epsilon_sq(model, partial = FALSE, ci.lvl = NULL) - -eta_sq(model, partial = FALSE, ci.lvl = NULL) - -omega_sq(model, partial = FALSE, ci.lvl = NULL)- -
model | -A fitted anova-model of class |
digits | -Amount of digits for returned values. |
partial | -Logical, if |
ci.lvl | -Scalar between 0 and 1. If not |
A data frame with the term name(s) and effect size statistics; if
- ci.lvl
is not NULL
, a data frame including lower and
- upper confidence intervals is returned. For anova_stats()
, a tidy
- data frame with all statistics is returned (excluding confidence intervals).
See details in eta_squared
Levine TR, Hullett CR (2002): Eta Squared, Partial Eta Squared, and Misreporting of Effect Size in Communication Research (pdf)
- Tippey K, Longnecker MT (2016): An Ad Hoc Method for Computing Pseudo-Effect Size for Mixed Model. (pdf)
-# load sample data -data(efc) - -# fit linear model -fit <- aov( - c12hour ~ as.factor(e42dep) + as.factor(c172code) + c160age, - data = efc -) - -eta_sq(fit)#> term etasq -#> 1 as.factor(e42dep) 0.266 -#> 2 as.factor(c172code) 0.005 -#> 3 c160age 0.048omega_sq(fit)#> term omegasq -#> 1 as.factor(e42dep) 0.263 -#> 2 as.factor(c172code) 0.004 -#> 3 c160age 0.048eta_sq(fit, partial = TRUE)#> term partial.etasq -#> 1 as.factor(e42dep) 0.281 -#> 2 as.factor(c172code) 0.008 -#> 3 c160age 0.066eta_sq(fit, partial = TRUE, ci.lvl = .8)#> term partial.etasq conf.low conf.high -#> 1 as.factor(e42dep) 0.281 0.248 0.311 -#> 2 as.factor(c172code) 0.008 0.001 0.016 -#> 3 c160age 0.066 0.047 0.089#>-#> -#> -#> -#> -#>#> term sumsq meansq df statistic p.value etasq -#> 1 as.factor(e42dep) 426461.571 142153.857 3 80.299 0.000 0.212 -#> 2 as.factor(c172code) 7352.049 3676.025 2 2.076 0.126 0.004 -#> 3 c160age 105169.595 105169.595 1 59.408 0.000 0.052 -#> 4 Residuals 1476436.343 1770.307 834 NA NA NA -#> partial.etasq omegasq partial.omegasq epsilonsq cohens.f power -#> 1 0.224 0.209 0.221 0.209 0.537 1.000 -#> 2 0.005 0.002 0.003 0.002 0.071 0.429 -#> 3 0.066 0.051 0.065 0.051 0.267 1.000 -#> 4 NA NA NA NA NA NA
, find_normal()
and find_cauchy()
find the
- shape, mean and standard deviation resp. the location and scale parameters
- to describe the beta, normal or cauchy distribution, based on two
- percentiles. find_beta2()
finds the shape parameters for a Beta
- distribution, based on a probability value and its standard error
- or confidence intervals.
find_beta(x1, p1, x2, p2) - -find_beta2(x, se, ci, n) - -find_cauchy(x1, p1, x2, p2) - -find_normal(x1, p1, x2, p2)- -
x1 | -Value for the first percentile. |
p1 | -Probability of the first percentile. |
x2 | -Value for the second percentile. |
p2 | -Probability of the second percentile. |
x | -Numeric, a probability value between 0 and 1. Typically indicates
-a prevalence rate of an outcome of interest; Or an integer value
-with the number of observed events. In this case, specify |
se | -The standard error of |
ci | -The upper limit of the confidence interval of |
n | -Numeric, number of total observations. Needs to be specified, if
- |
A list of length two, with the two distribution parameters that can - be used to define the distribution, which (best) describes - the shape for the given input parameters.
-These functions can be used to find parameter for various distributions,
- to define prior probabilities for Bayesian analyses. x1
- p1
, x2
and p2
are parameters that describe two
- quantiles. Given this knowledge, the distribution parameters are
- returned.
- Use find_beta2()
, if the known parameters are, e.g. a prevalence
- rate or similar probability, and its standard deviation or confidence
- interval. In this case, x
should be a probability,
- for example a prevalence rate of a certain event. se
- needs to be the standard error for this probability. Alternatively,
- ci
can be specified, which should indicate the upper limit
- of the confidence interval of the probability (prevalence rate) x
- If the number of events out of a total number of trials is known
- (e.g. 12 heads out of 30 coin tosses), x
can also be the number
- of observed events, while n
indicates the total amount of trials
- (in the above example, the function call would be: find_beta2(x = 12, n = 30)
Cook JD. Determining distribution parameters from quantiles. 2010: Department of Biostatistics, Texas (PDF)
- --# example from blogpost: -# https://www.johndcook.com/blog/2010/01/31/parameters-from-percentiles/ -# 10% of patients respond within 30 days of treatment -# and 80% respond within 90 days of treatment -find_normal(x1 = 30, p1 = .1, x2 = 90, p2 = .8)#> $mean -#> [1] 53.78387 -#> -#> $sd -#> [1] 30.48026 -#>find_cauchy(x1 = 30, p1 = .1, x2 = 90, p2 = .8)#> $location -#> [1] 48.54102 -#> -#> $scale -#> [1] 57.06339 -#>-parms <- find_normal(x1 = 30, p1 = .1, x2 = 90, p2 = .8) -curve( - dnorm(x, mean = parms$mean, sd = parms$sd), - from = 0, to = 200 -)-parms <- find_cauchy(x1 = 30, p1 = .1, x2 = 90, p2 = .8) -curve( - dcauchy(x, location = parms$location, scale = parms$scale), - from = 0, to = 200 -)- -find_beta2(x = .25, ci = .5)#> $shape1 -#> [1] 2.860267 -#> -#> $shape2 -#> [1] 7.93757 -#>-# find Beta distribution for 3 events out of 20 observations -find_beta2(x = 3, n = 20)#> $shape1 -#> [1] 4.157811 -#> -#> $shape2 -#> [1] 22.03272 -#>-
computes Gini's mean difference for a numeric vector
- or for all numeric vectors in a data frame.
gmd(x, ...)- -
x | -A vector or data frame. |
... | -Optional, unquoted names of variables that should be selected for
-further processing. Required, if |
For numeric vectors, Gini's mean difference. For non-numeric vectors
- or vectors of length < 2, returns NA
Gini's mean difference is defined as the mean absolute difference between
- any two distinct elements of a vector. Missing values from x
- silently removed.
David HA. Gini's mean difference rediscovered. Biometrika 1968(55): 573-575
- --#> [1] 9.297005gmd(efc, e17age, c160age, c12hour)#> # A tibble: 1 x 3 -#> e17age c160age c12hour -#> <dbl> <dbl> <dbl> -#> 1 9.30 15.2 47.9-
Computes mean, sd and se for each sub-group (indicated by grp
- of dv
grpmean( - x, - dv, - grp, - weights = NULL, - digits = 2, - out = c("txt", "viewer", "browser"), - encoding = "UTF-8", - file = NULL -) - -means_by_group( - x, - dv, - grp, - weights = NULL, - digits = 2, - out = c("txt", "viewer", "browser"), - encoding = "UTF-8", - file = NULL -)- -
x | -A (grouped) data frame. |
dv | -Name of the dependent variable, for which the mean value, grouped
-by |
grp | -Factor with the cross-classifying variable, where |
weights | -Name of variable in |
digits | -Numeric, amount of digits after decimal point when rounding -estimates and values. |
out | -Character vector, indicating whether the results should be printed
-to console ( |
encoding | -Character vector, indicating the charset encoding used
-for variable and value labels. Default is |
file | -Destination file, if the output should be saved as file.
-Only used when |
For non-grouped data frames, grpmean()
returns a data frame with
- following columns: term
, mean
, N
, std.dev
- std.error
and p.value
. For grouped data frames, returns
- a list of such data frames.
This function performs a One-Way-Anova with dv
as dependent
- and grp
as independent variable, by calling
- lm(count ~ as.factor(grp))
. Then contrast
- is called to get p-values for each sub-group. P-values indicate whether
- each group-mean is significantly different from the total mean.
-#> -#> # Grouped Means for average number of hours of care per week by elder's dependency -#> -#> term mean N std.dev std.error p.value -#> 1 independent 9.91 66 8.01 0.99 <0.001 -#> 2 slightly dependent 17.54 225 17.74 1.18 <0.001 -#> 3 moderately dependent 34.52 306 41.54 2.37 0.98 -#> 4 severely dependent 75.90 304 61.72 3.54 <0.001 -#> 5 Total 42.44 901 50.82 1.69 -#> -#> Anova: R2=0.245; adj.R2=0.242; F=96.908; p=0.000#> -#> # Grouped Means for Sepal.Width by Species -#> -#> term mean N std.dev std.error p.value -#> 1 setosa 3.43 50 0.38 0.05 <0.001 -#> 2 versicolor 2.77 50 0.31 0.04 <0.001 -#> 3 virginica 2.97 50 0.32 0.05 0.04 -#> 4 Total 3.06 150 0.44 0.04 -#> -#> Anova: R2=0.401; adj.R2=0.393; F=49.160; p=0.000-# also works for grouped data frames -library(dplyr) -efc %>% - group_by(c172code) %>% - grpmean(c12hour, e42dep)#> -#> Grouped by: -#> carer's level of education: low level of education -#> -#> # Grouped Means for average number of hours of care per week by elder's dependency -#> -#> term mean N std.dev std.error p.value -#> 1 independent 16.33 12 10.74 3.10 0.02 -#> 2 slightly dependent 15.38 42 9.55 1.47 <0.001 -#> 3 moderately dependent 42.05 61 46.53 5.96 0.70 -#> 4 severely dependent 85.52 65 56.42 7.00 <0.001 -#> 5 Total 49.81 180 52.24 3.89 -#> -#> Anova: R2=0.307; adj.R2=0.295; F=25.955; p=0.000 -#> -#> -#> Grouped by: -#> carer's level of education: intermediate level of education -#> -#> # Grouped Means for average number of hours of care per week by elder's dependency -#> -#> term mean N std.dev std.error p.value -#> 1 independent 7.96 45 3.91 0.58 <0.001 -#> 2 slightly dependent 17.12 135 16.52 1.42 <0.001 -#> 3 moderately dependent 33.55 163 41.05 3.22 0.75 -#> 4 severely dependent 79.71 163 63.13 4.94 <0.001 -#> 5 Total 41.76 506 51.42 2.29 -#> -#> Anova: R2=0.284; adj.R2=0.280; F=66.374; p=0.000 -#> -#> -#> Grouped by: -#> carer's level of education: high level of education -#> -#> # Grouped Means for average number of hours of care 
per week by elder's dependency -#> -#> term mean N std.dev std.error p.value -#> 1 independent 15.20 5 18.43 8.24 0.36 -#> 2 slightly dependent 18.08 39 12.98 2.08 0.15 -#> 3 moderately dependent 28.42 62 35.64 4.53 0.67 -#> 4 severely dependent 63.38 50 62.69 8.87 <0.001 -#> 5 Total 36.62 156 46.38 3.71 -#> -#> Anova: R2=0.167; adj.R2=0.151; F=10.155; p=0.000 -#> -#>-# weighting -efc$weight <- abs(rnorm(n = nrow(efc), mean = 1, sd = .5)) -grpmean(efc, c12hour, e42dep, weights = weight)#> -#> # Grouped Means for average number of hours of care per week by elder's dependency -#> -#> term mean N std.dev std.error p.value -#> 1 independent 9.97 70 8.62 1.06 <0.001 -#> 2 slightly dependent 17.25 225 16.84 1.12 <0.001 -#> 3 moderately dependent 36.25 304 43.38 2.48 0.76 -#> 4 severely dependent 78.32 313 63.24 3.63 <0.001 -#> 5 Total 43.98 901 52.55 1.75 -#> -#> Anova: R2=0.250; adj.R2=0.248; F=99.732; p=0.000-
computes the highest density interval for values from
- MCMC samples, while cred_int()
computes the credible interval (or
- uncertainty interval). rope()
calculates the proportion of a posterior
- distribution that lies within a region of practical equivalence.
- equi_test()
combines these two functions and performs a
- "HDI+ROPE decision rule" (Test for Practical Equivalence) (Kruschke 2018)
- to check whether parameter values should be accepted or rejected against
- the background of a formulated null hypothesis. n_eff()
- the number of effective samples (effective sample size). mcse()
- returns the Monte Carlo standard error. mediation()
is a short
- summary for multivariate-response mediation-models.
hdi(x, ...) - -# S3 method for stanreg -hdi(x, prob = 0.9, trans = NULL, type = c("fixed", - "random", "all"), ...) - -# S3 method for brmsfit -hdi(x, prob = 0.9, trans = NULL, type = c("fixed", - "random", "all"), ...) - -cred_int(x, ...) - -# S3 method for stanreg -cred_int(x, prob = 0.9, trans = NULL, - type = c("fixed", "random", "all"), ...) - -# S3 method for brmsfit -cred_int(x, prob = 0.9, trans = NULL, - type = c("fixed", "random", "all"), ...) - -equi_test(x, ...) - -# S3 method for stanreg -equi_test(x, rope, eff_size, out = c("txt", "viewer", - "browser", "plot"), ...) - -# S3 method for brmsfit -equi_test(x, rope, eff_size, out = c("txt", "viewer", - "browser", "plot"), ...) - -mcse(x, ...) - -# S3 method for brmsfit -mcse(x, type = c("fixed", "random", "all"), ...) - -# S3 method for stanreg -mcse(x, type = c("fixed", "random", "all"), ...) - -mediation(x, ...) - -# S3 method for brmsfit -mediation(x, treatment, mediator, prob = 0.9, - typical = "median", ...) - -n_eff(x, ...) - -# S3 method for stanreg -n_eff(x, type = c("fixed", "random", "all"), ...) - -# S3 method for brmsfit -n_eff(x, type = c("fixed", "random", "all"), ...) - -rope(x, rope, ...) - -# S3 method for stanreg -rope(x, rope, trans = NULL, type = c("fixed", - "random", "all"), ...) - -# S3 method for brmsfit -rope(x, rope, trans = NULL, type = c("fixed", - "random", "all"), ...)- -
x | -A |
... | -Further arguments passed down to
prob | -Vector of scalars between 0 and 1, indicating the mass within
-the credible interval that is to be estimated. See |
trans | -Name of a function or character vector naming a function, used
-to apply transformations on the returned HDI-values resp.
-(for |
type | -For mixed effects models, specify the type of effects that should
-be returned. |
rope | -Vector of length two, indicating the lower and upper limit of a -range around zero, which indicates the region of practical equivalence. -Values of the posterior distribution within this range are considered as -being "practically equivalent to zero". |
eff_size | -A scalar indicating the effect size (the size of a negligible
-effect) that is used to calculate the limits of the ROPE for the test of
-practical equivalence. If not specified, an effect size of .1 is used for
-linear models, as suggested by Kruschke 2018 (see 'Details').
-If |
out | -Character vector, indicating whether the results should be printed
-to console ( |
treatment | -Character, name of the treatment variable (or direct effect)
-in a (multivariate response) mediator-model. If missing, |
mediator | -Character, name of the mediator variable in a (multivariate
-response) mediator-model. If missing, |
typical | -The typical value that will represent the Bayesian point estimate.
-By default, the posterior median is returned. See |
For hdi()
, if x
is a vector, returns a vector of length
- two with the lower and upper limit of the HDI; if x
is a
- stanreg
, stanfit
or brmsfit
object, returns a
- tibble with lower and upper HDI-limits for each predictor. To distinguish
- multiple HDI values, column names for the HDI get a suffix when prob
- has more than one element.
- For rope()
, returns a tibble with two columns: the proportion of
- values from x
that are within and outside the boundaries of
- rope
- equi_test()
returns a tibble with a column decision
- indicates whether or not a parameter value is accepted/rejected;
- inside.rope
, which indicates the proportion of the whole posterior
- distribution that lies inside the ROPE (not just the proportion of
- values from the 95% HDI); and the lower and upper interval from the 95%-HDI.
- mcse()
and n_eff()
return a tibble with two columns: one
- with the term names and one with the related statistic resp. effective
- sample size.
- mediation()
returns a data frame with direct, indirect, mediator and
- total effect of a multivariate-response mediation-model, as well as the
- proportion mediated. The effect sizes are mean values of the posterior
- samples.
Computation for HDI is based on the code from Kruschke 2015, pp. 727f.
- For default sampling in Stan (4000 samples), the 90% intervals for HDI are
- more stable than, for instance, 95% intervals. An effective sample size
- (see nsamples
) of at least 10.000 is recommended if
- 95% intervals should be computed (see Kruschke 2015, p. 183ff).
Credible intervals (or uncertainty intervals) are simply the quantiles
- for a given probability of the posterior draws. See
- posterior_interval
for more details.
The Monte Carlo Standard Error is another useful measure of accuracy of
- the chains. It is defined as standard deviation of the chains divided by
- the square root of their effective sample size (the formula for mcse()
is from
- Kruschke 2015, p. 187). The MCSE “provides a quantitative suggestion
- of how big the estimation noise is”.
The effective sample size divides the actual sample size by the amount
- of autocorrelation. The effective sample size is a measure of “how
- much independent information there is in autocorrelated chains”, or:
- “What would be the sample size of a completely non-autocorrelated chain
- that yielded the same information?” (Kruschke 2015, p182-3).
- The ratio of effective number of samples and total number of samples
- (provided in tidy_stan()
) ranges from 0 to 1, and should be close
- to 1. The closer this ratio comes to zero means that the chains may be
- inefficient, but possibly still okay.
There are no fixed rules to set the limits for the region of practical
- equivalence. However, there are some conventions described by
- Kruschke (2018) how to specify the limits of the rope. One
- convention for linear models is to set the limits about .1 SD of the
- dependent variable around zero (i.e. 0 +/- .1 * sd(y)
), where
- .1 stands for half of a small effect size. Another, more conservative
- convention to set the ROPE limits is a range of half a standard
- deviation around zero (see Norman et al. 2003), which indicates
- a clinically relevant effect (i.e. 0 +/- .25 * sd(y)
or even
- 0 +/- .5 * sd(y)
computes the 95%-HDI for x
and checks if a
- model predictor's HDI lies completely outside, completely inside or
- partially inside the ROPE. If the HDI is completely outside the ROPE,
- the "null hypothesis" for this parameter is "rejected". If the ROPE
- completely covers the HDI, i.e. all most credible values of a parameter
- are inside the region of practical equivalence, the null hypothesis
- is accepted. Else, it's undecided whether to accept or reject the
- null hypothesis. In short, desirable results are low proportions inside
- the ROPE (the closer to zero the better) and the H0 should be rejected.
- If neither the rope
nor eff_size
argument are specified,
- the effect size (the size of a negligible effect) will be set to 0.1
- and the ROPE is 0 +/- .1 * sd(y)
for linear models. This is the
- suggested way to specify the ROPE limits according to Kruschke (2018).
- For models with binary outcome, there is no direct way to specify the
- effect size that defines the ROPE limits. Two examples from Kruschke
- suggest that a negligible change is about .05 on the logit-scale.
- In these cases, it is recommended to specify the rope
- however, if not specified, the ROPE limits are calculated as suggested
- by Kruschke: The effect size is the probability of "success" for the
- outcome, divided by pi
. For all other models,
- 0 +/- .1 * sd(intercept)
is used to determine the ROPE limits.
- If eff_size
is specified, but rope
is not, then
- the same formulas apply, except that .1
is replaced by the
- value in eff_size
. If rope
is specified, eff_size
- will be ignored. See also section ROPE in 'Details'.
- The advantage of Bayesian testing for practical equivalence over
- classical frequentist null hypothesis significance testing is that
- discrete decisions are avoided, “because such decisions encourage
- people to ignore the magnitude of the parameter value and its uncertainty”
- (Kruschke (2018)).
returns a data frame with information on the
- direct effect (mean value of posterior samples from treatment
- of the outcome model), mediator effect (mean value of posterior
- samples from mediator
of the outcome model), indirect effect
- (mean value of the multiplication of the posterior samples from
- mediator
of the outcome model and the posterior samples from
- treatment
of the mediation model) and the total effect (mean
- value of sums of posterior samples used for the direct and indirect
- effect). The proportion mediated is the indirect effect divided
- by the total effect.
- For all values, the 90% HDIs are calculated by default. Use prob
- to calculate a different interval.
- The arguments treatment
and mediator
do not necessarily
- need to be specified. If missing, mediation()
tries to find the
- treatment and mediator variable automatically. If this does not work,
- specify these variables.
Since equi_test()
computes 95% HDI, a number of 10,000 samples
- produces more stable results (see Kruschke 2015, p183ff).
Kruschke JK. Doing Bayesian Data Analysis: A Tutorial with R, JAGS, and Stan. 2nd edition. Academic Press, 2015
- Kruschke JK. Rejecting or Accepting Parameter Values in Bayesian Estimation. Advances in Methods and Practices in Psychological Science. 2018; doi: 10.1177/2515245918771304
- Norman GR, Sloan JA, Wyrwich KW. Interpretation of Changes in Health-related Quality of Life: The Remarkable Universality of Half a Standard Deviation. Medical Care. 2003;41: 582-592. doi: 10.1097/01.MLR.0000062554.74615.4C
# NOT RUN { -if (require("rstanarm")) { - fit <- stan_glm(mpg ~ wt + am, data = mtcars, chains = 1) - hdi(fit) - - # return multiple intervals - hdi(fit, prob = c(.5, .7, .9)) - - # fit logistic regression model - fit <- stan_glm( - vs ~ wt + am, - data = mtcars, - family = binomial("logit"), - chains = 1 - ) - # compute hdi, transform on "odds ratio scale" - hdi(fit, trans = exp) - - # compute rope, on scale of linear predictor. finds proportion - # of posterior distribution values between -1 and 1. - rope(fit, rope = c(-1, 1)) - - # compute rope, boundaries as "odds ratios". finds proportion of - # posterior distribution values, which - after being exponentiated - - # are between .8 and 1.25 (about -.22 and .22 on linear scale) - rope(fit, rope = c(.8, 1.25), trans = exp) - - # Test for Practical Equivalence - equi_test(fit) - equi_test(fit, out = "plot") -} -# }--
This function calculates the intraclass-correlation
- (icc) - sometimes also called variance partition coefficient
- (vpc) - for random intercepts of mixed effects models. Currently,
- merMod
, glmmTMB
- stanreg
and brmsfit
objects are supported.
icc(x, ...) - -# S3 method for merMod -icc(x, adjusted = FALSE, ...) - -# S3 method for glmmTMB -icc(x, adjusted = FALSE, ...) - -# S3 method for stanreg -icc(x, re.form = NULL, typical = "mean", - prob = 0.89, ppd = FALSE, adjusted = FALSE, ...) - -# S3 method for brmsfit -icc(x, re.form = NULL, typical = "mean", - prob = 0.89, ppd = FALSE, ...)- -
x | -Fitted mixed effects model (of class |
... | -Currently not used. |
adjusted | -Logical, if |
re.form | -Formula containing group-level effects to be considered in
-the prediction. If |
typical | -Character vector, naming the function that will be used as
-measure of central tendency for the ICC. The default is "mean". See
- |
prob | -Vector of scalars between 0 and 1, indicating the mass within
-the credible interval that is to be estimated. See |
ppd | -Logical, if |
A numeric vector with all random intercept intraclass-correlation-coefficients.
- Furthermore, if adjusted = FALSE
, between- and within-group variances
- as well as random-slope variance are returned as attributes.
- For stanreg
or brmsfit
objects, the HDI for each statistic
- is also included as attribute.
The "simple" ICC (with both ppd
and adjusted
set to FALSE
) is calculated by dividing the between-group-variance (random
- intercept variance) by the total variance (i.e. sum of between-group-variance
- and within-group (residual) variance).
- The calculation of the ICC for generalized linear mixed models with binary outcome is based on
- Wu et al. (2012). For other distributions (negative binomial, poisson, ...),
- calculation is based on Nakagawa et al. 2017, however, for
- non-Gaussian models it is recommended to compute the adjusted ICC (with
- adjusted = TRUE
, see below).
- ICC for unconditional and conditional models
- Usually, the ICC is calculated for the null model ("unconditional model").
- However, according to Raudenbush and Bryk (2002) or
- Rabe-Hesketh and Skrondal (2012) it is also feasible to compute the ICC
- for full models with covariates ("conditional models") and compare how
- much a level-2 variable explains the portion of variation in the grouping
- structure (random intercept).
- ICC for random-slope models
- Caution: For models with random slopes and random intercepts,
- the ICC would differ at each unit of the predictors. Hence, the ICC for this
- kind of model cannot be understood simply as proportion of variance
- (see Goldstein et al. 2010). For convenience reasons, as the
- icc()
function also extracts the different random effects
- variances, the ICC for random-slope-intercept-models is reported
- nonetheless, but it is usually not a meaningful summary of the
- proportion of variances.
- To get a meaningful ICC also for models with random slopes, use adjusted = TRUE
- The adjusted ICC uses the mean random effect variance, which is based
- on the random effect variances for each value of the random slope
- (see Johnson et al. 2014).
- ICC for models with multiple or nested random effects
- Caution: By default, for three-level-models, depending on the
- nested structure of the model, or for models with multiple random effects,
- icc()
only reports the proportion of variance explained for each
- grouping level. Use adjusted = TRUE
to calculate the adjusted and
- conditional ICC, which condition on all random effects.
- Adjusted and conditional ICC
- If adjusted = TRUE
, an adjusted and conditional ICC are calculated,
- which take all sources of uncertainty (of all random effects)
- into account to report an "adjusted" ICC, as well as the conditional ICC.
- The latter also takes the fixed effects variances into account (see
- Nakagawa et al. 2017). If random effects are not nested and not
- cross-classified, the adjusted (adjusted = TRUE
) and unadjusted
- (adjusted = FALSE
) ICC are identical. adjusted = TRUE returns
- a meaningful ICC for models with random slopes. Furthermore, the adjusted
- ICC is recommended for models with other distributions than Gaussian.
- ICC for specific group-levels
- The proportion of variance for specific levels related to each
- other (e.g., similarity of level-1-units within
- level-2-units or level-2-units within level-3-units) must be computed
- manually. Use get_re_var
to get the between-group-variances
- and residual variance of the model, and calculate the ICC for the various level
- correlations.
- For example, for the ICC between level 1 and 2:
- sum(get_re_var(fit)) / (sum(get_re_var(fit)) + get_re_var(fit, "sigma_2"))
- or for the ICC between level 2 and 3:
- get_re_var(fit)[2] / sum(get_re_var(fit))
- ICC for Bayesian models
- If ppd = TRUE
, icc()
calculates a variance decomposition based on
- the posterior predictive distribution. In this case, first, the draws from
- the posterior predictive distribution not conditioned on group-level
- terms (posterior_predict(..., re.form = NA)
) are calculated as well
- as draws from this distribution conditioned on all random effects
- (by default, unless specified else in re.form
) are taken. Then, second,
- the variances for each of these draws are calculated. The "ICC" is then the
- ratio between these two variances. This is the recommended way to
- analyse random-effect-variances for non-Gaussian models. It is then possible
- to compare variances across models, also by specifying different group-level
- terms via the re.form
- Sometimes, when the variance of the posterior predictive distribution is
- very large, the variance ratio in the output makes no sense, e.g. because
- it is negative. In such cases, it might help to use a more robust measure
- to calculate the central tendency of the variances. For example, use
- typical = "median"
Some notes on why the ICC is useful, based on Grace-Martin:
It can help you determine whether or not a linear mixed model is even necessary. If you find that the correlation is zero, that means the observations within clusters are no more similar than observations from different clusters. Go ahead and use a simpler analysis technique.
It can be theoretically meaningful to understand how much of the overall variation in the response is explained simply by clustering. For example, in a repeated measures psychological study you can tell to what extent mood is a trait (varies among people, but not within a person on different occasions) or state (varies little on average among people, but varies a lot across occasions).
It can also be meaningful to see how the ICC (as well as the between and within cluster variances) changes as variables are added to the model.
In short, the ICC can be interpreted as “the proportion of the variance
- explained by the grouping structure in the population” (Hox 2002: 15).
- The random effect variances indicate the between- and within-group
- variances as well as random-slope variance and random-slope-intercept
- correlation. The components are denoted as follows:
Within-group (residual) variance: sigma_2
Between-group-variance: tau.00 (variation between individual intercepts and average intercept)
Random-slope-variance: tau.11 (variation between individual slopes and average slope)
Random-Intercept-Slope-covariance: tau.01
Random-Intercept-Slope-correlation: rho.01
Aguinis H, Gottfredson RK, Culpepper SA. 2013. Best-Practice Recommendations for Estimating Cross-Level Interaction Effects Using Multilevel Modeling. Journal of Management 39(6): 1490-1528 (doi: 10.1177/0149206313478188 -)
Goldstein H, Browne W, Rasbash J. 2010. Partitioning Variation in Multilevel Models. Understanding Statistics, 1:4, 223-231 (doi: 10.1207/S15328031US0104_02 -)
Grace-Martin K. The Intraclass Correlation Coefficient in Mixed Models, web
Hox J. 2002. Multilevel analysis: techniques and applications. Mahwah, NJ: Erlbaum
Johnson PC, O'Hara RB. 2014. Extension of Nakagawa & Schielzeth's R2GLMM to random slopes models. Methods Ecol Evol, 5: 944-946. (doi: 10.1111/2041-210X.12225 -)
Nakagawa S, Johnson P, Schielzeth H (2017) The coefficient of determination R2 and intra-class correlation coefficient from generalized linear mixed-effects models revisited and expanded. J. R. Soc. Interface 14. doi: 10.1098/rsif.2017.0213
Rabe-Hesketh S, Skrondal A. 2012. Multilevel and longitudinal modeling using Stata. 3rd ed. College Station, Tex: Stata Press Publication
Raudenbush SW, Bryk AS. 2002. Hierarchical linear models: applications and data analysis methods. 2nd ed. Thousand Oaks: Sage Publications
Wu S, Crespi CM, Wong WK. 2012. Comparison of methods for estimating the intraclass correlation coefficient for binary responses in cancer prevention cluster randomized trials. Contemporary Clinical Trials 33: 869-880 (doi: 10.1016/j.cct.2012.05.004 -)
Further helpful online resources:
CrossValidated (2012) Intraclass correlation (ICC) for an interaction?
CrossValidated (2014) Interpreting the random effect in a mixed-effect model
CrossValidated (2014) how to partition the variance explained at group level and individual level
-#> -#> Intraclass Correlation Coefficient for Linear mixed model -#> -#> Family : gaussian (identity) -#> Formula: Reaction ~ 1 + (1 | Subject) -#> -#> ICC (Subject): 0.3949 -#>-# note: ICC for random-slope-intercept model usually not -# meaningful, unless you use "adjusted = TRUE" - see 'Note'. -fit1 <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy) -icc(fit1)#>#> -#> Intraclass Correlation Coefficient for Linear mixed model -#> -#> Family : gaussian (identity) -#> Formula: Reaction ~ Days + (Days | Subject) -#> -#> ICC (Subject): 0.4830 -#>icc(fit1, adjusted = TRUE)#> -#> Intraclass Correlation Coefficient for Generalized Linear Mixed Model -#> -#> Family : gaussian (identity) -#> Formula: ~Days | Subject Reaction ~ Days NA -#> -#> Adjusted ICC: 0.7217 -#> Conditional ICC: 0.5206 -#>-sleepstudy$mygrp <- sample(1:45, size = 180, replace = TRUE) -fit2 <- lmer(Reaction ~ Days + (1 | mygrp) + (1 | Subject), sleepstudy) - -icc1 <- icc(fit1)#>#> -#> Intraclass Correlation Coefficient for Linear mixed model -#> -#> Family : gaussian (identity) -#> Formula: Reaction ~ Days + (Days | Subject) -#> -#> Within-group-variance: 654.941 -#> Between-group-variance: 611.898 (Subject) -#> Random-slope-variance: 35.081 (Subject.Days) -#> Slope-Intercept-covariance: 9.614 (Subject.(Intercept)) -#> Slope-Intercept-correlation: 0.066 (Subject) -#>#> -#> Intraclass Correlation Coefficient for Linear mixed model -#> -#> Family : gaussian (identity) -#> Formula: Reaction ~ Days + (1 | mygrp) + (1 | Subject) -#> -#> Within-group-variance: 946.474 -#> Between-group-variance: 14.082 (mygrp) -#> Between-group-variance: 1381.597 (Subject) -#>-# NOT RUN { -# compute ICC for Bayesian mixed model, with an ICC for each -# sample of the posterior. The print()-method then shows -# the median ICC as well as 89% HDI for the ICC. 
-# Change interval with print-method: -# print(icc(m, posterior = TRUE), prob = .5) - -if (requireNamespace("brms", quietly = TRUE)) { - library(dplyr) - sleepstudy$mygrp <- sample(1:5, size = 180, replace = TRUE) - sleepstudy <- sleepstudy %>% - group_by(mygrp) %>% - mutate(mysubgrp = sample(1:30, size = n(), replace = TRUE)) - m <- brms::brm( - Reaction ~ Days + (1 | mygrp / mysubgrp) + (1 | Subject), - data = sleepstudy - ) - - # by default, 89% interval - icc(m) - - # show 50% interval - icc(m, prob = .5) - - # variances based on posterior predictive distribution - icc(m, ppd = TRUE) -} -# }-
- |
- - | -Survey-weighted negative binomial generalised linear model |
- - | -Survey-weighted zero-inflated Poisson model |
- Bootstrapping- - |
- |
- - | -Generate nonparametric bootstrap replications |
- - | -Standard error and confidence intervals for bootstrapped estimates |
- Effect Size Statistics for Anova- - |
- |
- - | -Effect size statistics for anova |
- Statistics for Crosstables- - |
- |
- - | -Measures of association for contingency tables |
- - | -Expected and relative table values |
- Weighted Statistics- - |
- |
- - | -Weight a variable |
- Weighted statistics for tests and variables |
- Other (Summary) Statistics- - |
- |
- - | -Gini's Mean Difference |
- - | -Mann-Whitney-U-Test |
- - | -Row means with min amount of valid values |
- - | -Summary of mean values by group |
- - | -Calculate population variance and standard deviation |
- Tools for Regression Models- - |
- |
- - | -Compute model quality |
- - | -Compute model quality |
- - | -Test and training error from model cross-validation |
- Tools for Mixed Models- - |
- |
- - | -Design effects for two-level mixed models |
- - | -Sample size for linear mixed models |
- - | -Standard error of sample mean for mixed models |
- Tools for Bayesian Models- - |
- |
- - | -Create default priors for brms-models |
- - | -Summary of Bayesian multivariate-response mediation-models |
- Find Parameters of Distribution- - |
- |
- - | -Determining distribution parameters |
- Miscellaneous- - |
- |
- - | -Compute trends in status inequalities |
- - | -Find prime numbers |
- - | -Get relative risks estimates from logistic regressions or odds ratio values |
- - | -Proportions of values in a vector |
This method computes the proportional change of absolute - (rate differences) and relative (rate ratios) inequalities - of prevalence rates for two different status groups, as proposed - by Mackenbach et al. (2015).
-inequ_trend(data, prev.low, prev.hi)- -
data | -A data frame that contains the variables with prevalence rates for both low -and high status groups (see 'Examples'). |
prev.low | -The name of the variable with the prevalence rates for -the low status groups. |
prev.hi | -The name of the variable with the prevalence rates for -the high status groups. |
A data frame with the prevalence rates as well as the values for the
- proportional change in absolute (rd
) and relative (rr)
- inequalities.
Given the time trend of prevalence rates of an outcome for two status - groups (e.g. the mortality rates for people with lower and higher - socioeconomic status over 40 years), this function computes the - proportional change of absolute and relative inequalities, expressed - in changes in rate differences and rate ratios. The function implements - the algorithm proposed by Mackenbach et al. 2015.
-Mackenbach JP, Martikainen P, Menvielle G, de Gelder R. 2015. The Arithmetic of Reducing Relative and Absolute Inequalities in Health: A Theoretical Analysis Illustrated with European Mortality Data. Journal of Epidemiology and Community Health 70(7): 730-36. doi: 10.1136/jech-2015-207018
- --# This example reproduces Fig. 1 of Mackenbach et al. 2015, p.5 - -# 40 simulated time points, with an initial rate ratio of 2 and -# a rate difference of 100 (i.e. low status group starts with a -# prevalence rate of 200, the high status group with 100) - -# annual decline of prevalence is 1% for the low, and 3% for the -# high status group - -n <- 40 -time <- seq(1, n, by = 1) -lo <- rep(200, times = n) -for (i in 2:n) lo[i] <- lo[i - 1] * .99 - -hi <- rep(100, times = n) -for (i in 2:n) hi[i] <- hi[i - 1] * .97 - -prev.data <- data.frame(lo, hi) - -# print values -inequ_trend(prev.data, lo, hi)#> $data -#> lo hi rr rd -#> 1 200.0000 100.00000 2.000000 100.0000 -#> 2 198.0000 97.00000 2.041237 101.0000 -#> 3 196.0200 94.09000 2.083324 101.9300 -#> 4 194.0598 91.26730 2.126280 102.7925 -#> 5 192.1192 88.52928 2.170120 103.5899 -#> 6 190.1980 85.87340 2.214865 104.3246 -#> 7 188.2960 83.29720 2.260533 104.9988 -#> 8 186.4131 80.79828 2.307141 105.6148 -#> 9 184.5489 78.37434 2.354711 106.1746 -#> 10 182.7034 76.02311 2.403262 106.6803 -#> 11 180.8764 73.74241 2.452814 107.1340 -#> 12 179.0677 71.53014 2.503387 107.5375 -#> 13 177.2770 69.38424 2.555004 107.8927 -#> 14 175.5042 67.30271 2.607684 108.2015 -#> 15 173.7492 65.28363 2.661451 108.4655 -#> 16 172.0117 63.32512 2.716326 108.6866 -#> 17 170.2916 61.42537 2.772333 108.8662 -#> 18 168.5886 59.58260 2.829494 109.0060 -#> 19 166.9028 57.79513 2.887834 109.1076 -#> 20 165.2337 56.06127 2.947377 109.1725 -#> 21 163.5814 54.37943 3.008148 109.2020 -#> 22 161.9456 52.74805 3.070172 109.1975 -#> 23 160.3261 51.16561 3.133474 109.1605 -#> 24 158.7229 49.63064 3.198082 109.0922 -#> 25 157.1356 48.14172 3.264022 108.9939 -#> 26 155.5643 46.69747 3.331321 108.8668 -#> 27 154.0086 45.29655 3.400008 108.7121 -#> 28 152.4685 43.93765 3.470111 108.5309 -#> 29 150.9439 42.61952 3.541660 108.3243 -#> 30 149.4344 41.34093 3.614684 108.0935 -#> 31 147.9401 40.10071 3.689214 107.8394 -#> 32 146.4607 38.89769 3.765280 
107.5630 -#> 33 144.9961 37.73076 3.842915 107.2653 -#> 34 143.5461 36.59883 3.922150 106.9473 -#> 35 142.1106 35.50087 4.003019 106.6098 -#> 36 140.6895 34.43584 4.085555 106.2537 -#> 37 139.2826 33.40277 4.169794 105.8799 -#> 38 137.8898 32.40068 4.255769 105.4891 -#> 39 136.5109 31.42866 4.343517 105.0823 -#> 40 135.1458 30.48580 4.433074 104.6600 -#> -#> attr(,"class") -#> [1] "sj_inequ_trend"-# plot trends - here we see that the relative inequalities -# are increasing over time, while the absolute inequalities -# are first increasing as well, but later are decreasing -# (while rel. inequ. are still increasing) -plot(inequ_trend(prev.data, lo, hi))-
This function checks whether a number is, or numbers in a - vector are prime numbers.
-is_prime(x)- -
x | -An integer, or a vector of integers. |
TRUE for each prime number in x, FALSE otherwise.
-is_prime(89)#> [1] TRUEis_prime(15)#> [1] FALSE#> [1] TRUE TRUE TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE-
This function is similar to the SPSS MEAN.n
function and computes
- row means from a data.frame
or matrix
if at least n
- values of a row are valid (and not NA).
mean_n(dat, n, digits = 2)- -
dat | -A data frame with at least two columns, where row means are applied. |
n | -May either be
If a row's sum of valid values is less than |
digits | -Numeric value indicating the number of decimal places to be used for rounding mean -value. Negative values are allowed (see 'Details'). |
A vector with row mean values of dat
for those rows with at least n
- valid values. Else, NA
is returned.
Rounding to a negative number of digits
means rounding to a power of
- ten, so for example mean_n(dat, 3, digits = -2) rounds to the
- nearest hundred.
- For n
, must be a numeric value from 0
to ncol(dat)
. If
- a row in dat
has at least n
non-missing values, the
- row mean is returned. If n
is a non-integer value from 0 to 1,
- n
is considered to indicate the proportion of necessary non-missing
- values per row. E.g., if n = .75
, a row must have at least ncol(dat) * n
- non-missing values for the row mean to be calculated. See 'Examples'.
-dat <- data.frame(c1 = c(1,2,NA,4), - c2 = c(NA,2,NA,5), - c3 = c(NA,4,NA,NA), - c4 = c(2,3,7,8)) - -# needs at least 4 non-missing values per row -mean_n(dat, 4) # 1 valid return value#> [1] NA 2.75 NA NA-# needs at least 3 non-missing values per row -mean_n(dat, 3) # 2 valid return values#> [1] NA 2.75 NA 5.67-# needs at least 2 non-missing values per row -mean_n(dat, 2)#> [1] 1.50 2.75 NA 5.67-# needs at least 1 non-missing value per row -mean_n(dat, 1) # all means are shown#> [1] 1.50 2.75 7.00 5.67-# needs at least 50% of non-missing values per row -mean_n(dat, .5) # 3 valid return values#> [1] 1.50 2.75 NA 5.67-# needs at least 75% of non-missing values per row -mean_n(dat, .75) # 2 valid return values#> [1] NA 2.75 NA 5.67-
Computes mean, sd and se for each sub-group (indicated by grp)
- of dv.
means_by_group( - x, - dv, - grp, - weights = NULL, - digits = 2, - out = c("txt", "viewer", "browser"), - encoding = "UTF-8", - file = NULL -) - -grpmean( - x, - dv, - grp, - weights = NULL, - digits = 2, - out = c("txt", "viewer", "browser"), - encoding = "UTF-8", - file = NULL -)- -
x | -A (grouped) data frame. |
dv | -Name of the dependent variable, for which the mean value, grouped
-by |
grp | -Factor with the cross-classifying variable, where |
weights | -Name of variable in |
digits | -Numeric, amount of digits after decimal point when rounding -estimates and values. |
out | -Character vector, indicating whether the results should be printed
-to console ( |
encoding | -Character vector, indicating the charset encoding used
-for variable and value labels. Default is |
file | -Destination file, if the output should be saved as file.
-Only used when |
For non-grouped data frames, means_by_group()
returns a data frame with
- following columns: term
, mean
, N
, std.dev
- std.error
and p.value
. For grouped data frames, returns
- a list of such data frames.
This function performs a One-Way-Anova with dv
as dependent
- and grp
as independent variable, by calling
- lm(count ~ as.factor(grp))
. Then contrast
- is called to get p-values for each sub-group. P-values indicate whether
- each group-mean is significantly different from the total mean.
-#> -#> # Grouped Means for average number of hours of care per week by elder's dependency -#> -#> Category | Mean | N | SD | SE | p -#> ---------------------------------------------------------- -#> independent | 9.91 | 66 | 8.01 | 0.99 | <0.001 -#> slightly dependent | 17.54 | 225 | 17.74 | 1.18 | <0.001 -#> moderately dependent | 34.52 | 306 | 41.54 | 2.37 | 0.98 -#> severely dependent | 75.90 | 304 | 61.72 | 3.54 | <0.001 -#> Total | 42.44 | 901 | 50.82 | 1.69 | -#> -#> Anova: R2=0.245; adj.R2=0.242; F=96.908; p=0.000#> -#> # Grouped Means for Sepal.Width by Species -#> -#> Category | Mean | N | SD | SE | p -#> ---------------------------------------------- -#> setosa | 3.43 | 50 | 0.38 | 0.05 | <0.001 -#> versicolor | 2.77 | 50 | 0.31 | 0.04 | <0.001 -#> virginica | 2.97 | 50 | 0.32 | 0.05 | 0.04 -#> Total | 3.06 | 150 | 0.44 | 0.04 | -#> -#> Anova: R2=0.401; adj.R2=0.393; F=49.160; p=0.000-# also works for grouped data frames -if (require("dplyr")) { - efc %>% - group_by(c172code) %>% - means_by_group(c12hour, e42dep) -}#> -#> Grouped by: -#> carer's level of education: low level of education -#> -#> # Grouped Means for average number of hours of care per week by elder's dependency -#> -#> Category | Mean | N | SD | SE | p -#> ---------------------------------------------------------- -#> independent | 16.33 | 12 | 10.74 | 3.10 | 0.02 -#> slightly dependent | 15.38 | 42 | 9.55 | 1.47 | <0.001 -#> moderately dependent | 42.05 | 61 | 46.53 | 5.96 | 0.70 -#> severely dependent | 85.52 | 65 | 56.42 | 7.00 | <0.001 -#> Total | 49.81 | 180 | 52.24 | 3.89 | -#> -#> Anova: R2=0.307; adj.R2=0.295; F=25.955; p=0.000 -#> -#> -#> Grouped by: -#> carer's level of education: intermediate level of education -#> -#> # Grouped Means for average number of hours of care per week by elder's dependency -#> -#> Category | Mean | N | SD | SE | p -#> ---------------------------------------------------------- -#> independent | 7.96 | 45 | 3.91 | 0.58 | <0.001 -#> slightly dependent | 
17.12 | 135 | 16.52 | 1.42 | <0.001 -#> moderately dependent | 33.55 | 163 | 41.05 | 3.22 | 0.75 -#> severely dependent | 79.71 | 163 | 63.13 | 4.94 | <0.001 -#> Total | 41.76 | 506 | 51.42 | 2.29 | -#> -#> Anova: R2=0.284; adj.R2=0.280; F=66.374; p=0.000 -#> -#> -#> Grouped by: -#> carer's level of education: high level of education -#> -#> # Grouped Means for average number of hours of care per week by elder's dependency -#> -#> Category | Mean | N | SD | SE | p -#> ---------------------------------------------------------- -#> independent | 15.20 | 5 | 18.43 | 8.24 | 0.36 -#> slightly dependent | 18.08 | 39 | 12.98 | 2.08 | 0.15 -#> moderately dependent | 28.42 | 62 | 35.64 | 4.53 | 0.67 -#> severely dependent | 63.38 | 50 | 62.69 | 8.87 | <0.001 -#> Total | 36.62 | 156 | 46.38 | 3.71 | -#> -#> Anova: R2=0.167; adj.R2=0.151; F=10.155; p=0.000 -#> -#>-# weighting -efc$weight <- abs(rnorm(n = nrow(efc), mean = 1, sd = .5)) -means_by_group(efc, c12hour, e42dep, weights = weight)#> -#> # Grouped Means for average number of hours of care per week by elder's dependency -#> -#> Category | Mean | N | SD | SE | p -#> ---------------------------------------------------------- -#> independent | 8.86 | 72 | 6.09 | 0.75 | <0.001 -#> slightly dependent | 17.28 | 225 | 15.09 | 1.01 | <0.001 -#> moderately dependent | 35.26 | 296 | 41.66 | 2.38 | 0.75 -#> severely dependent | 76.53 | 297 | 61.73 | 3.54 | <0.001 -#> Total | 42.35 | 901 | 50.62 | 1.69 | -#> -#> Anova: R2=0.256; adj.R2=0.254; F=103.078; p=0.000
mediation() is a short summary for multivariate-response
- mediation-models.
mediation(x, ...) - -# S3 method for brmsfit -mediation(x, treatment, mediator, prob = 0.9, typical = "median", ...)- -
x | -A |
... | -Not used. |
treatment | -Character, name of the treatment variable (or direct effect)
-in a (multivariate response) mediator-model. If missing, |
mediator | -Character, name of the mediator variable in a (multivariate
-response) mediator-model. If missing, |
prob | -Vector of scalars between 0 and 1, indicating the mass within -the credible interval that is to be estimated. |
typical | -The typical value that will represent the Bayesian point estimate.
-By default, the posterior median is returned. See |
A data frame with direct, indirect, mediator and - total effect of a multivariate-response mediation-model, as well as the - proportion mediated. The effect sizes are mean values of the posterior - samples.
mediation() returns a data frame with information on the
- direct effect (mean value of posterior samples from treatment
- of the outcome model), mediator effect (mean value of posterior
- samples from mediator
of the outcome model), indirect effect
- (mean value of the multiplication of the posterior samples from
- mediator
of the outcome model and the posterior samples from
- treatment
of the mediation model) and the total effect (mean
- value of sums of posterior samples used for the direct and indirect
- effect). The proportion mediated is the indirect effect divided
- by the total effect.
- For all values, the 90% HDIs are calculated by default. Use prob
- to calculate a different interval.
- The arguments treatment
and mediator
do not necessarily
- need to be specified. If missing, mediation()
tries to find the
- treatment and mediator variable automatically. If this does not work,
- specify these variables.
This function performs a Mann-Whitney-U-Test (or Wilcoxon rank sum test,
- see wilcox.test
and wilcox_test)
- for x
, for each group indicated by grp
. If grp
- has more than two categories, a comparison between each combination of
- two groups is performed.
- The function reports U, p and Z-values as well as effect size r
- and group-rank-means.
mwu( - data, - x, - grp, - distribution = "asymptotic", - out = c("txt", "viewer", "browser"), - encoding = "UTF-8", - file = NULL -) - -mannwhitney( - data, - x, - grp, - distribution = "asymptotic", - out = c("txt", "viewer", "browser"), - encoding = "UTF-8", - file = NULL -)- -
data | -A data frame. |
x | -Bare (unquoted) variable name, or a character vector with the variable name. |
grp | -Bare (unquoted) name of the cross-classifying variable, where
- |
distribution | -Indicates how the null distribution of the test statistic should be computed.
-May be one of |
out | -Character vector, indicating whether the results should be printed
-to console ( |
encoding | -Character vector, indicating the charset encoding used
-for variable and value labels. Default is |
file | -Destination file, if the output should be saved as file.
-Only used when |
(Invisibly) returns a data frame with U, p and Z-values for each group-comparison - as well as effect-size r; additionally, group-labels and groups' n's are - also included.
-This function calls the wilcox_test
with formula. If grp
has more than two groups, additionally a Kruskal-Wallis-Test (see kruskal.test)
- is performed.
- Interpretation of effect sizes, as a rule-of-thumb:
small effect >= 0.1
medium effect >= 0.3
large effect >= 0.5
-#> -#> # Mann-Whitney-U-Test -#> -#> Groups 1 = independent (n = 65) | 2 = slightly dependent (n = 224): -#> U = 7635.000, W = 5490.000, p = 0.003, Z = -3.020 -#> effect-size r = 0.178 -#> rank-mean(1) = 117.46 -#> rank-mean(2) = 152.99 -#> -#> Groups 1 = independent (n = 65) | 3 = moderately dependent (n = 304): -#> U = 8692.000, W = 6547.000, p < 0.001, Z = -4.273 -#> effect-size r = 0.222 -#> rank-mean(1) = 133.72 -#> rank-mean(3) = 195.96 -#> -#> Groups 1 = independent (n = 65) | 4 = severely dependent (n = 297): -#> U = 7905.500, W = 5760.500, p < 0.001, Z = -5.096 -#> effect-size r = 0.268 -#> rank-mean(1) = 121.62 -#> rank-mean(4) = 194.60 -#> -#> Groups 2 = slightly dependent (n = 224) | 3 = moderately dependent (n = 304): -#> U = 54664.500, W = 29464.500, p = 0.008, Z = -2.647 -#> effect-size r = 0.115 -#> rank-mean(2) = 244.04 -#> rank-mean(3) = 279.58 -#> -#> Groups 2 = slightly dependent (n = 224) | 4 = severely dependent (n = 297): -#> U = 51007.500, W = 25807.500, p < 0.001, Z = -4.386 -#> effect-size r = 0.192 -#> rank-mean(2) = 227.71 -#> rank-mean(4) = 286.11 -#> -#> Groups 3 = moderately dependent (n = 304) | 4 = severely dependent (n = 297): -#> U = 87819.500, W = 41459.500, p = 0.083, Z = -1.732 -#> effect-size r = 0.071 -#> rank-mean(3) = 288.88 -#> rank-mean(4) = 313.41 -#> -#> # Kruskal-Wallis-Test -#> -#> chi-squared = 38.476 -#> df = 3 -#> p < 0.001-
- nhanes_sample.Rd
Selected variables from the National Health and Nutrition Examination
- Survey that are used in the example from Lumley (2010), Appendix E.
- See svyglm.nb
for examples.
Lumley T (2010). Complex Surveys: a guide to analysis using R. Wiley
- -R/odds_to_rr.R
- odds_to_rr.Rd
odds_to_rr() converts odds ratios from a logistic regression
- model (including mixed models) into relative risks; or_to_rr()
- converts a single odds ratio estimate into a relative risk estimate.
odds_to_rr(fit) - -or_to_rr(or, p0)- -
fit | -A fitted binomial generalized linear (mixed) model with logit-link function -(logistic (multilevel) regression model). |
or | -Numeric, an odds ratio estimate. |
p0 | -Numeric, the risk of having a positive outcome in the control or -unexposed group (reference group), i.e. the number of outcome or "successes" -in the control divided by the total number of observations in the control -group. |
A data frame with relative risks and lower/upper confidence interval for
- the relative risks estimates; for or_to_rr()
, the risk ratio
- estimate.
This function extracts the odds ratios (exponentiated model coefficients)
- from logistic regressions (fitted with glm
or glmer
- and their related confidence intervals, and transforms these values
- into relative risks (and their related confidence intervals).
- The formula for transformation is based on Zhang and Yu (1998),
- Wang (2013) and Grant (2014):
- RR <- OR / (1 - P0 + (P0 * OR))
, where OR
is the odds
- ratio and P0
indicates the proportion of the incidence in
- the outcome variable for the control group (reference group).
Grant RL. 2014. Converting an odds ratio to a range of plausible relative risks for better communication of research findings. BMJ 348:f7450. doi: 10.1136/bmj.f7450
- Wang Z. 2013. Converting Odds Ratio to Relative Risk in Cohort Studies with Partial Data Information. J Stat Soft 2013;55. doi: 10.18637/jss.v055.i05
- Zhang J, Yu KF. 1998. What's the Relative Risk? A Method of Correcting the Odds Ratio in Cohort Studies of Common Outcomes. JAMA; 280(19): 1690-1. doi: 10.1001/jama.280.19.1690
-#># create binary response -sleepstudy$Reaction.dicho <- dicho(sleepstudy$Reaction, dich.by = "median") -# fit model -fit <- glmer(Reaction.dicho ~ Days + (Days | Subject), - data = sleepstudy, family = binomial("logit")) -# convert to relative risks -odds_to_rr(fit)#> Parameter Odds Ratio Risk Ratio CI_low CI_high -#> 1 (Intercept) 0.02201714 0.04308565 0.004411356 0.3597023 -#> 2 Days 2.43719045 1.41812942 1.212151446 1.5885357- -data(efc) -# create binary response -y <- ifelse(efc$neg_c_7 < median(na.omit(efc$neg_c_7)), 0, 1) -# create data frame for fitted model -mydf <- data.frame( - y = as.factor(y), - sex = to_factor(efc$c161sex), - dep = to_factor(efc$e42dep), - barthel = efc$barthtot, - education = to_factor(efc$c172code) -) -# fit model -fit <- glm(y ~., data = mydf, family = binomial(link = "logit")) -# convert to relative risks -odds_to_rr(fit)#>#> Parameter Odds Ratio Risk Ratio CI_low CI_high -#> 1 (Intercept) 2.014220 1.2942650 0.8005943 1.6101037 -#> 2 sex2 1.913887 1.3686646 1.1632760 1.5625768 -#> 3 dep2 1.624596 1.4468228 0.8511852 2.3180826 -#> 4 dep3 3.080617 2.1859172 1.4029261 3.1216957 -#> 5 dep4 2.484804 1.9230853 1.0500095 3.0354848 -#> 6 barthel 0.970638 0.9865251 0.9822535 0.9906641 -#> 7 education2 1.254234 1.1027736 0.9233955 1.2698368 -#> 8 education3 1.327901 1.1280723 0.9057139 1.3294549-# replicate OR/RR for coefficient "sex" from above regression -# p0 ~ .44, or ~ 1.914 -prop.table(table(mydf$y, mydf$sex))#> -#> 1 2 -#> 0 0.1324355 0.3153760 -#> 1 0.1054994 0.4466891or_to_rr(1.914, 0.1055 / (.1324 + .1055))#> [1] 1.361962-
This function returns the p-values for fitted model objects.
- -p_value(fit, ...) - -# S3 method for lmerMod -p_value(fit, p.kr = FALSE, ...)- -
fit | -A model object. |
... | -Currently not used. |
p.kr | -Logical, if |
A data.frame
with the model coefficients' names (term
- p-values (p.value
) and standard errors (std.error
For linear mixed models (lmerMod
-objects), the computation of
- p-values (if p.kr = TRUE
) is based on conditional F-tests
- with Kenward-Roger approximation for the df, using the
- pbkrtest-package. If pbkrtest is not available or
- p.kr = FALSE
, or if x
is a glmerMod
- computation of p-values is based on normal-distribution assumption,
- treating the t-statistics as Wald z-statistics.
- If p-values already have been computed (e.g. for merModLmerTest
- from the lmerTest-package), these will be returned.
- The print()
-method has a summary
-argument, that - in
- case p.kr = TRUE
- also prints information on the approximated
- degrees of freedom (see 'Examples'). A shortcut is the
- summary()
-method, which simply calls print(..., summary = TRUE)
-#> term p.value std.error -#> 1 (Intercept) 0.000 0.566 -#> 2 e42dep 0.000 0.133 -#> 3 c172code 0.207 0.198#> -#>#>-#> -#>#>-#> -#>fit <- gls(follicles ~ sin(2*pi*Time) + cos(2*pi*Time), Ovary, - correlation = corAR1(form = ~ 1 | Mare)) -p_value(fit)#> term p.value std.error -#> 1 (Intercept) 0.000 0.665 -#> 2 sin(2 * pi * Time) 0.000 0.645 -#> 3 cos(2 * pi * Time) 0.198 0.698-# lme4-fit -library(lme4) -sleepstudy$mygrp <- sample(1:45, size = 180, replace = TRUE) -fit <- lmer(Reaction ~ Days + (1 | mygrp) + (1 | Subject), sleepstudy)#>pv <- p_value(fit, p.kr = TRUE)#>-# normal output -pv#> term p.value std.error -#> 1 (Intercept) 0 9.766 -#> 2 Days 0 0.815#> term p.value std.error df statistic -#> 1 (Intercept) 0 9.766 22.785 25.742 -#> 2 Days 0 0.815 160.682 12.838#> term p.value std.error df statistic -#> 1 (Intercept) 0 9.766 22.785 25.742 -#> 2 Days 0 0.815 160.682 12.838-
- -pca(x) - -pca_rotate(x, nf = NULL, rotation = c("varimax", "quartimax", "promax", - "oblimin", "simplimax", "cluster", "none"))- -
x | -A data frame or a |
nf | -Number of components to extract. If |
rotation | -Rotation of the factor loadings. May be one of
- |
A tidy data frame with either all loadings of principal components
- (for pca()
) or a rotated loadings matrix (for pca_rotate()
The print()
-method for pca_rotate()
has a
- cutoff
-argument, which is a scalar between 0 and 1, indicating
- which (absolute) values from the loadings should be blank in the
- output. By default, all loadings below .1 (or -.1) are not shown.
-data(efc) -# recveive first item of COPE-index scale -start <- which(colnames(efc) == "c82cop1") -# recveive last item of COPE-index scale -end <- which(colnames(efc) == "c90cop9") - -# extract principal components -pca(efc[, start:end])#> PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 -#> Standard deviation 1.7976 1.1311 0.9665 0.8609 0.8156 0.7951 0.7426 0.7251 -#> Eigenvalue 3.2314 1.2793 0.9342 0.7412 0.6651 0.6322 0.5515 0.5258 -#> Proportion variance 0.3590 0.1421 0.1038 0.0824 0.0739 0.0702 0.0613 0.0584 -#> Cumulative variance 0.3590 0.5012 0.6050 0.6873 0.7612 0.8315 0.8928 0.9512 -#> PC9 -#> Standard deviation 0.6627 -#> Eigenvalue 0.4392 -#> Proportion variance 0.0488 -#> Cumulative variance 1.0000-# extract principal components, varimax-rotation. -# number of components based on Kaiser-criteria -pca_rotate(efc[, start:end])#> variable PC1 PC2 -#> 1 c82cop1 0.2911 0.5964 -#> 2 c83cop2 -0.5976 -0.4235 -#> 3 c84cop3 -0.6885 -0.1564 -#> 4 c85cop4 -0.726 -0.119 -#> 5 c86cop5 -0.6426 -#> 6 c87cop6 -0.6934 0.1213 -#> 7 c88cop7 -0.6768 -0.3796 -#> 8 c89cop8 0.6364 -#> 9 c90cop9 0.7542 -#> -#> PC1 PC2 -#> Proportion variance 0.312 0.190 -#> Cumulative variance 0.312 0.501 -#> Proportion explained 0.622 0.378 -#> Cumulative explained 0.622 1.000-
This function calculates the predictive accuracy of linear - or logistic regression models.
- -pred_accuracy(data, fit, method = c("cv", "boot"), k = 5, n = 1000)- -
data | -A data frame. |
fit | -Fitted model object of class |
method | -Character string, indicating whether crossvalidation
-( |
k | -The number of folds for the kfold-crossvalidation. |
n | -Number of bootstraps to be generated. |
A list with two values: The accuracy
of the model predictions, i.e.
- the proportion of accurately predicted values from the model and
- its standard error, std.error
For linear models, the accuracy is the correlation coefficient
- between the actual and the predicted value of the outcome. For
- logistic regression models, the accuracy corresponds to the
- AUC-value, calculated with the auc
- The accuracy is the mean value of multiple correlation resp.
- AUC-values, which are either computed with crossvalidation
- or nonparametric bootstrapping (see argument method
- The standard error is the standard deviation of the computed
- correlation resp. AUC-values.
-data(efc) -fit <- lm(neg_c_7 ~ barthtot + c161sex, data = efc) - -# accuracy for linear model, with crossvalidation -pred_accuracy(efc, fit)#> -#> # Accuracy of Model Predictions -#> -#> Accuracy: 41.12% -#> SE: 6.56%-points -#> Method: Correlation between observed and predicted-# accuracy for linear model, with bootstrapping -pred_accuracy(efc, fit, method = "boot", n = 100)#> -#> # Accuracy of Model Predictions -#> -#> Accuracy: 41.30% -#> SE: 2.78%-points -#> Method: Correlation between observed and predicted-# accuracy for logistic regression, with crossvalidation -efc$services <- sjmisc::dicho(efc$tot_sc_e, dich.by = 0, as.num = TRUE) -fit <- glm(services ~ neg_c_7 + c161sex + e42dep, - data = efc, family = binomial(link = "logit")) -pred_accuracy(efc, fit)#> -#> # Accuracy of Model Predictions -#> -#> Accuracy: 58.38% -#> SE: 3.92%-points -#> Method: Area under Curve-
, R/model_family.R
, R/model_frame.R
, and 6 more
- pred_vars.Rd
Several functions to retrieve information from model objects, - like variable names, link-inverse function, model frame, - model family etc., in a tidy and consistent way.
- -link_inverse(x, multi.resp = FALSE, mv = FALSE) - -model_family(x, multi.resp = FALSE, mv = FALSE) - -model_frame(x, fe.only = TRUE) - -pred_vars(x, ...) - -# S3 method for default -pred_vars(x, fe.only = FALSE, ...) - -# S3 method for glmmTMB -pred_vars(x, fe.only = FALSE, zi = FALSE, - disp = FALSE, ...) - -# S3 method for MixMod -pred_vars(x, fe.only = FALSE, zi = FALSE, ...) - -re_grp_var(x) - -grp_var(x) - -resp_val(x) - -resp_var(x, combine = TRUE) - -var_names(x)- -
x | -A fitted model; for |
mv, multi.resp | -Logical, if |
fe.only | -Logical, if |
... | -Currently not used. |
zi | -Logical, if |
disp | -Logical, if |
combine | -Logical, if |
For pred_vars()
and resp_var()
, the name(s) of the
- response or predictor variables from x
as character vector.
- resp_val()
returns the values from x
's response vector.
- re_grp_var()
returns the group factor of random effects in
- mixed models, or NULL
if x
has no such random effects term
- (grp_var()
is an alias for re_grp_var()
- link_inverse()
returns, if known, the inverse link function from
- x
; else NULL
for those models where the inverse link function
- can't be identified.
- model_frame()
is similar to model.frame()
- but should also work for model objects that don't have a S3-generic for
- model.frame()
- var_names()
returns the "cleaned" variable
- names, i.e. things like s()
for splines or log()
- removed.
- model_family()
returns a list with information about the
- model family (see 'Details').
returns a list with information about the
- model family for many different model objects. Following information
- is returned, where all values starting with is_
are logicals.
: family is binomial (but not negative binomial)
: family is either poisson or negative binomial
: family is negative binomial
: model is a count model (i.e. family is either poisson or negative binomial)
: family is beta
: model has logit link
: family is gaussian
: family is ordinal or cumulative link
: family is categorical link
: model has zero-inflation component
: model is a multivariate response model (currently only works for brmsfit objects)
: model response contains additional information about the trials
: the link-function
: the family-object
slightly differs from model.frame()
, especially
- for spline terms and matrix-variables created with cbind()
(for example
- in binomial models, where the response is a combination of successes and
- trials) . Where model.frame()
returns a matrix for splines,
- model_frame()
returns the data of the original variable and uses
- the same column name as in the data
-argument from the model-function.
- This makes it easier, for instance, to get data that should be used as new
- data in predict()
. For matrix-variables created with cbind()
- model_frame()
returns the original variable as matrix and
- additionally each column as own variable. See 'Examples'.
-#> [1] "e42dep" "c161sex"resp_var(fit)#> [1] "neg_c_7"resp_val(fit)#> [1] 12 20 11 10 12 19 15 11 15 10 28 18 13 18 16 13 11 11 13 17 11 9 8 14 11 -#> [26] 23 11 15 11 25 9 15 20 9 10 19 8 17 16 17 14 14 16 19 17 15 16 19 17 10 -#> [51] 14 14 9 12 25 17 22 13 15 7 19 11 15 13 11 14 17 7 15 11 19 10 10 20 10 -#> [76] 12 15 7 13 12 16 10 15 15 15 25 11 10 11 14 10 10 13 10 11 18 14 12 10 9 -#> [101] 13 14 10 10 13 13 12 12 18 7 13 14 11 16 15 15 9 17 17 22 16 14 9 13 9 -#> [126] 17 17 9 13 14 12 18 7 10 12 20 12 14 12 10 12 11 14 11 13 10 12 12 10 9 -#> [151] 15 12 11 14 16 18 11 11 14 14 12 10 9 12 8 10 11 10 11 13 7 10 11 12 15 -#> [176] 10 16 13 20 7 12 17 14 10 12 9 7 16 13 14 8 8 20 7 15 7 9 14 11 12 -#> [201] 11 12 18 8 13 16 8 13 14 11 8 12 24 11 11 13 9 13 20 12 16 15 20 10 12 -#> [226] 12 12 11 10 9 10 8 10 12 10 9 11 7 9 11 11 12 11 14 12 12 19 12 15 11 -#> [251] 17 8 13 11 10 8 10 19 10 18 8 11 9 10 13 11 9 9 8 9 8 8 11 9 10 -#> [276] 12 9 17 20 12 7 9 7 8 8 14 7 10 8 16 9 16 13 8 20 16 9 9 8 15 -#> [301] 16 19 8 12 17 12 14 11 9 11 9 8 10 8 10 15 13 8 10 10 12 14 12 7 8 -#> [326] 16 9 16 7 8 13 9 7 9 9 8 17 7 8 9 7 10 10 11 18 9 10 13 8 12 -#> [351] 9 7 10 8 7 7 12 12 9 8 10 18 16 11 15 10 9 9 12 18 12 13 17 9 8 -#> [376] 7 16 12 14 15 10 9 17 17 21 17 17 15 9 12 12 22 11 14 11 9 8 12 13 13 -#> [401] 9 10 12 9 11 13 11 17 10 18 10 16 10 10 14 11 11 10 11 8 15 12 10 13 13 -#> [426] 13 12 9 13 10 15 18 11 14 11 12 12 14 15 8 10 9 7 8 18 7 7 7 11 8 -#> [451] 11 11 16 13 14 14 7 9 7 17 7 10 9 9 7 12 14 7 10 20 7 8 9 11 10 -#> [476] 14 7 8 8 10 8 12 10 14 11 8 11 17 10 22 8 9 19 11 18 16 18 15 19 10 -#> [501] 13 15 7 8 22 8 20 19 10 7 25 9 11 7 11 9 8 12 9 20 7 12 9 9 8 -#> [526] 10 8 17 12 9 9 8 7 8 9 17 17 8 9 9 10 9 7 8 27 25 14 28 16 11 -#> [551] 15 7 9 7 7 8 13 19 15 14 20 20 14 10 11 15 7 14 11 13 16 13 10 17 10 -#> [576] 12 11 7 8 15 13 11 7 18 17 12 18 17 13 10 19 7 8 10 18 17 19 8 12 10 -#> [601] 14 10 13 9 8 8 9 15 11 7 8 11 21 8 11 10 10 11 10 11 9 
13 17 9 8 -#> [626] 8 9 13 14 14 9 12 8 11 10 11 11 10 10 10 12 13 7 8 12 8 8 13 10 12 -#> [651] 16 8 10 13 10 9 10 12 11 9 10 9 13 10 9 10 8 7 8 7 7 9 8 11 9 -#> [676] 10 12 11 7 16 12 10 8 12 23 10 10 18 13 12 18 9 12 13 9 7 10 7 8 17 -#> [701] 11 14 11 23 14 8 7 15 8 12 9 15 17 13 13 10 20 10 11 25 10 12 10 12 10 -#> [726] 8 14 8 18 8 15 11 12 10 7 10 13 14 7 7 14 11 11 11 9 7 15 9 9 18 -#> [751] 8 15 7 8 13 8 8 9 7 7 9 8 8 13 10 11 13 11 8 12 8 9 16 11 19 -#> [776] 12 12 9 10 10 9 13 7 11 13 10 10 13 9 14 15 15 9 10 8 8 9 9 9 9 -#> [801] 9 13 9 12 14 12 8 10 7 22 18 16 13 15 24 11 14 12 11 10 7 10 10 12 10 -#> [826] 7 9 16 14 12 9 10 8 9 7 8 10 9 8 10 10 7 11 8 10 11 14 7 8 10 -#> [851] 10 11 11 8 8 9 11 7 7 8 9 9 7 13 15 11 24 8 9 7 10 15 18 22 18 -#> [876] 9 11 14 7 9 17 23 12 13 15 8 8 14 10 10 -#> attr(,"label") -#> [1] "Negative impact with 7 items"-link_inverse(fit)(2.3)#> [1] 2.3-# example from ?stats::glm -counts <- c(18, 17, 15, 20, 10, 20, 25, 13, 12) -outcome <- gl(3, 1, 9) -treatment <- gl(3, 3) -m <- glm(counts ~ outcome + treatment, family = poisson()) - -link_inverse(m)(.3)#> [1] 1.349859#> [1] 1.349859-outcome <- as.numeric(outcome) -m <- glm(counts ~ log(outcome) + as.factor(treatment), family = poisson()) -var_names(m)#> [1] "counts" "outcome" "treatment"-# model.frame and model_frame behave slightly different -library(splines) -m <- lm(neg_c_7 ~ e42dep + ns(c160age, knots = 2), data = efc) -head(model.frame(m))#> neg_c_7 e42dep ns(c160age, knots = 2).1 ns(c160age, knots = 2).2 -#> 1 12 3 0.49465270 0.08689310 -#> 2 20 3 0.49766116 0.04922034 -#> 3 11 3 0.45855117 0.53896628 -#> 4 10 4 0.47509770 0.33176607 -#> 5 12 4 0.50819077 -0.08263434 -#> 6 19 4 0.49465270 0.08689310#> neg_c_7 e42dep c160age -#> 1 12 3 56 -#> 2 20 3 54 -#> 3 11 3 80 -#> 4 10 4 69 -#> 5 12 4 47 -#> 6 19 4 56-library(lme4) -data(cbpp) -cbpp$trials <- cbpp$size - cbpp$incidence -m <- glm(cbind(incidence, trials) ~ period, data = cbpp, family = binomial) 
-head(model.frame(m))#> cbind(incidence, trials).incidence cbind(incidence, trials).trials period -#> 1 2 12 1 -#> 2 3 9 2 -#> 3 4 5 3 -#> 4 0 5 4 -#> 5 3 19 1 -#> 6 1 17 2#> cbind(incidence, trials).incidence cbind(incidence, trials).trials period -#> 1 2 12 1 -#> 2 3 9 2 -#> 3 4 5 3 -#> 4 0 5 4 -#> 5 3 19 1 -#> 6 1 17 2 -#> incidence trials -#> 1 2 12 -#> 2 3 9 -#> 3 4 5 -#> 4 0 5 -#> 5 3 19 -#> 6 1 17-resp_var(m, combine = TRUE)#> [1] "cbind(incidence, trials)"resp_var(m, combine = FALSE)#> [1] "incidence" "trials"-# get random effects grouping factor from mixed models -library(lme4) -data(sleepstudy) -m <- lmer(Reaction ~ Days + (1 + Days | Subject), data = sleepstudy) -re_grp_var(m)#> [1] "Subject"-# get model predictors, with and w/o dispersion formula -# NOT RUN { -library(glmmTMB) -data("Salamanders") -m <- glmmTMB( - count ~ spp + cover + mined + poly(DOP, 3) + (1 | site), - ziformula = ~spp + mined, - dispformula = ~DOY, - data = Salamanders, - family = nbinom2 -) - -pred_vars(m) -pred_vars(m, fe.only = TRUE) -pred_vars(m, disp = TRUE) -# }-
calculates the proportion of a value or category
- in a variable. props()
does the same, but allows for
- multiple logical conditions in one statement. It is similar
- to mean()
with logical predicates, however, both
- prop()
and props()
work with grouped data frames.
prop(data, ..., weights = NULL, na.rm = TRUE, digits = 4) - -props(data, ..., na.rm = TRUE, digits = 4)- -
data | -A data frame. May also be a grouped data frame (see 'Examples'). |
... | -One or more value pairs of comparisons (logical predicates). Put -variable names on the left-hand side and values to match on the -right-hand side. Expressions may be quoted or unquoted. See -'Examples'. |
weights | -Vector of weights that will be applied to weight all observations.
-Must be a vector of same length as the input vector. Default is
- |
na.rm | -Logical, whether to remove NA values from the vector when the
-proportion is calculated. |
digits | -Amount of digits for returned values. |
For one condition, a numeric value with the proportion of the values - inside a vector. For more than one condition, a tibble with one column - of conditions and one column with proportions. For grouped data frames, - returns a tibble with one column per group with grouping categories, - followed by one column with proportions per condition.
only allows one logical statement per comparison,
- while props()
allows multiple logical statements per comparison.
- However, prop()
supports weighting of variables before calculating
- proportions, and comparisons may also be quoted. Hence, prop()
- also processes comparisons, which are passed as character vector
- (see 'Examples').
-#> [1] 0.0733-# expression may also be completely quoted -prop(efc, "e42dep == 1")#> [1] 0.0733-# use "props()" for multiple logical statements -props(efc, e17age > 70 & e17age < 80)#> [1] 0.3199-# proportion of value 1 in e42dep, and all values greater -# than 2 in e42dep, including missing values. will return a tibble -prop(efc, e42dep == 1, e42dep > 2, na.rm = FALSE)#> condition prop -#> 1 e42dep==1 0.0727 -#> 2 e42dep>2 0.6718-# for factors or character vectors, use quoted or unquoted values -library(sjmisc) -# convert numeric to factor, using labels as factor levels -efc$e16sex <- to_label(efc$e16sex) -efc$n4pstu <- to_label(efc$n4pstu) - -# get proportion of female older persons -prop(efc, e16sex == female)#> [1] 0.6715-# get proportion of male older persons -prop(efc, e16sex == "male")#> [1] 0.3285-# "props()" needs quotes around non-numeric factor levels -props(efc, - e17age > 70 & e17age < 80, - n4pstu == 'Care Level 1' | n4pstu == 'Care Level 3' -)#> condition prop -#> 1 e17age>70&e17age<80 0.3199 -#> 2 n4pstu==CareLevel1|n4pstu==CareLevel3 0.3137#> [1] 0.8092efc %>% prop(e17age > 70, e16sex == 1)#> condition prop -#> 1 e17age>70 0.8092 -#> 2 e16sex==1 0.0000#> Warning: Factor `e16sex` contains implicit NA, consider using `forcats::fct_explicit_na`#> Warning: Factor `e16sex` contains implicit NA, consider using `forcats::fct_explicit_na`#> elder's gender e42dep>2 -#> 1 male 0.6847 -#> 2 female 0.6744-efc %>% - select(e42dep, c161sex, c172code, e16sex) %>% - group_by(c161sex, c172code) %>% - prop(e42dep > 2, e16sex == 1)#> carer's gender carer's level of education e42dep>2 e16sex==1 -#> 1 Male low level of education 0.6829 0 -#> 5 Male intermediate level of education 0.6590 0 -#> 3 Male high level of education 0.7872 0 -#> 4 Female low level of education 0.7101 0 -#> 2 Female intermediate level of education 0.5929 0 -#> 6 Female high level of education 0.6881 0-# same for "props()" -efc %>% - select(e42dep, c161sex, c172code, c12hour, n4pstu) %>% - 
group_by(c161sex, c172code) %>% - props( - e42dep > 2, - c12hour > 20 & c12hour < 40, - n4pstu == 'Care Level 1' | n4pstu == 'Care Level 3' - )#> carer's gender carer's level of education e42dep>2 c12hour>20&c12hour<40 -#> 1 Male low level of education 0.6829 0.2439 -#> 5 Male intermediate level of education 0.6590 0.1756 -#> 3 Male high level of education 0.7872 0.1489 -#> 4 Female low level of education 0.7101 0.1957 -#> 2 Female intermediate level of education 0.5929 0.1504 -#> 6 Female high level of education 0.6881 0.2018 -#> n4pstu==CareLevel1|n4pstu==CareLevel3 -#> 1 0.2250 -#> 5 0.3111 -#> 3 0.3191 -#> 4 0.3433 -#> 2 0.3540 -#> 6 0.2752-
These functions extract random effect variances as well as
- random-intercept-slope-correlation of mixed effects models.
- Currently, merMod
, glmmTMB
- stanreg
and brmsfit
- objects are supported.
re_var(x, adjusted = FALSE) - -get_re_var(x, comp = c("tau.00", "tau.01", "tau.11", "rho.01", - "sigma_2"))- -
x | -Fitted mixed effects model (of class |
adjusted | -Logical, if |
comp | -Name of the variance component to be returned. See 'Details'. |
returns the value of the requested variance component,
- re_var()
returns all random effects variances.
The random effect variances indicate the between- and within-group
- variances as well as random-slope variance and random-slope-intercept
- correlation. Use following values for comp
to get the particular
- variance component:
Within-group (residual) variance
Between-group-variance (variation between individual intercepts and average intercept)
Random-slope-variance (variation between individual slopes and average slope)
The within-group-variance is affected by factors at level one, i.e.
- by the lower-level direct effects. Level two factors (i.e. cross-level
- direct effects) affect the between-group-variance. Cross-level
- interaction effects are group-level factors that explain the
- variance in random slopes (Aguinis et al. 2013).
- If adjusted = TRUE
, the variance of the fixed and random
- effects as well as of the additive dispersion and
- distribution-specific variance are returned (see Johnson et al. 2014
- and Nakagawa et al. 2017):
variance attributable to the fixed effects
(mean) variance of random effects
variance due to additive dispersion
distribution-specific variance
sum of dispersion and distribution
Aguinis H, Gottfredson RK, Culpepper SA. 2013. Best-Practice Recommendations for Estimating Cross-Level Interaction Effects Using Multilevel Modeling. Journal of Management 39(6): 1490-1528 (doi: 10.1177/0149206313478188 -)
Johnson PC, O'Hara RB. 2014. Extension of Nakagawa & Schielzeth's R2GLMM to random slopes models. Methods Ecol Evol, 5: 944-946. (doi: 10.1111/2041-210X.12225 -)
Nakagawa S, Johnson P, Schielzeth H (2017) The coefficient of determination R2 and intra-class correlation coefficient from generalized linear mixed-effects models revisited and expanded. J. R. Soc. Interface 14. doi: 10.1098/rsif.2017.0213
-library(lme4) -fit1 <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy) - -# all random effect variance components -re_var(fit1)#> Within-group-variance: 654.941 -#> Between-group-variance: 611.898 (Subject) -#> Random-slope-variance: 35.081 (Subject.Days) -#> Slope-Intercept-covariance: 9.614 (Subject.(Intercept)) -#> Slope-Intercept-correlation: 0.066 (Subject)re_var(fit1, adjusted = TRUE)#> -#> Variance Components of Mixed Models -#> -#> Family : gaussian (identity) -#> Formula: list(conditional = Reaction ~ Days, random = ~Days | Subject) -#> -#> fixed: 908.953 -#> random: 1698.233 -#> residual: 654.941 -#> dispersion: 0.000 -#> distribution: 654.941 -#> -#>-# just the rand. slope-intercept covariance -get_re_var(fit1, "tau.01")#> Subject.(Intercept) -#> 9.613886-sleepstudy$mygrp <- sample(1:45, size = 180, replace = TRUE) -fit2 <- lmer(Reaction ~ Days + (1 | mygrp) + (Days | Subject), sleepstudy) -re_var(fit2)#> Within-group-variance: 605.912 -#> Between-group-variance: 44.917 (mygrp) -#> Between-group-variance: 615.511 (Subject) -#> Random-slope-variance: 38.301 (Subject.Days) -#> Slope-Intercept-covariance: 1.108 (Subject.(Intercept)) -#> Slope-Intercept-correlation: 0.007 (Subject)-
- reliab_test.Rd
These functions compute various measures of internal consistency - for tests or item-scales of questionnaires.
- -reliab_test(x, scale.items = FALSE, digits = 3, out = c("txt", - "viewer", "browser")) - -split_half(x, digits = 3) - -cronb(x) - -difficulty(x) - -mic(x, cor.method = c("pearson", "spearman", "kendall"))- -
x | -Depending on the function, |
scale.items | -Logical, if |
digits | -Amount of digits for returned values. |
out | -Character vector, indicating whether the results should be printed
-to console ( |
cor.method | -Correlation computation method. May be one of
- |
A data frame with the corrected item-total correlations (item
- discrimination, column item.discr
) and Cronbach's alpha
- (if item deleted, column alpha.if.deleted
) for each item
- of the scale, or NULL
if the data frame had too few columns.
A list with two values: the split-half reliability splithalf
- the Spearman-Brown corrected split-half reliability spearmanbrown
The Cronbach's Alpha value for x
The mean inter-item-correlation value for x
The item difficulty value for x
This function calculates the item discriminations (corrected item-total
- correlations for each item of x
with the remaining items) and
- the Cronbach's alpha for each item, if it was deleted from the scale.
- The absolute value of the item discrimination indices should be
- above 0.1. An index between 0.1 and 0.3 is considered as "fair",
- while an index above 0.3 (or below -0.3) is "good". Items with
- low discrimination indices are often ambiguously worded and
- should be examined. Items with negative indices should be
- examined to determine why a negative value was obtained (e.g.
- reversed answer categories regarding positive and negative poles).
This function calculates the split-half reliability for items in
- the data frame x
, including the Spearman-Brown adjustment.
- Splitting is done by selecting odd versus even columns in x
- A value closer to 1 indicates greater internal consistency.
The Cronbach's Alpha value for x
. A value closer to 1
- indicates greater internal consistency, where usually following
- rule of thumb is applied to interpret the results:
- α < 0.5 is unacceptable,
- 0.5 < α < 0.6 is poor,
- 0.6 < α < 0.7 is questionable,
- 0.7 < α < 0.8 is acceptable,
- and everything > 0.8 is good or excellent.
This function calculates a mean inter-item-correlation, i.e.
- a correlation matrix of x
will be computed (unless
- x
is already a matrix as returned by the
- cor
-function) and the mean
- of the sum of all item's correlation values is returned.
- Requires either a data frame or a computed cor
- “Ideally, the average inter-item correlation for a set of
- items should be between .20 and .40, suggesting that while the
- items are reasonably homogenous, they do contain sufficiently
- unique variance so as to not be isomorphic with each other.
- When values are lower than .20, then the items may not be
- representative of the same content domain. If values are higher than
- .40, the items may be only capturing a small bandwidth of the construct.”
- (Piedmont 2014)
This function calculates the item difficulty, which should
- range between 0.2 and 0.8. Lower values are a signal for
- more difficult items, while higher values close to one
- are a sign for easier items. The ideal value for item difficulty
- is p + (1 - p) / 2
, where p = 1 / max(x)
. In most
- cases, the ideal item difficulty lies between 0.5 and 0.8.
Spearman C. 1910. Correlation calculated from faulty data. British Journal of Psychology (3): 271-295. doi: 10.1111/j.2044-8295.1910.tb00206.x
- Brown W. 1910. Some experimental results in the correlation of mental abilities. British Journal of Psychology (3): 296-322. doi: 10.1111/j.2044-8295.1910.tb00207.x
- Piedmont RL. 2014. Inter-item Correlations. In: Michalos AC (eds) Encyclopedia of Quality of Life and Well-Being Research. Dordrecht: Springer, 3303-3304. doi: 10.1007/978-94-007-0753-5_1493
-library(sjlabelled) -# Data from the EUROFAMCARE sample dataset -data(efc) - -# retrieve variable and value labels -varlabs <- get_label(efc) - -# recveive first item of COPE-index scale -start <- which(colnames(efc) == "c82cop1") -# recveive last item of COPE-index scale -end <- which(colnames(efc) == "c90cop9") - -# create data frame with COPE-index scale -x <- efc[, c(start:end)] -colnames(x) <- varlabs[c(start:end)] - -# reliability tests -reliab_test(x)#> term -#> 1 do you feel you cope well as caregiver? -#> 2 do you find caregiving too demanding? -#> 3 does caregiving cause difficulties in your relationship with your friends? -#> 4 does caregiving have negative effect on your physical health? -#> 5 does caregiving cause difficulties in your relationship with your family? -#> 6 does caregiving cause financial difficulties? -#> 7 do you feel trapped in your role as caregiver? -#> 8 do you feel supported by friends/neighbours? -#> 9 do you feel caregiving worthwhile? -#> alpha.if.deleted item.discr -#> 1 0.539 -0.241 -#> 2 0.384 0.329 -#> 3 0.339 0.408 -#> 4 0.324 0.441 -#> 5 0.380 0.357 -#> 6 0.366 0.416 -#> 7 0.353 0.368 -#> 8 0.534 -0.029 -#> 9 0.556 -0.112-# split-half-reliability -split_half(x)#> -#> # Internal Consistency -#> -#> Split-Half Reliability: 0.410 -#> Spearman-Brown Adjustment: 0.581-# cronbach's alpha -cronb(x)#> [1] 0.459369-# mean inter-item-correlation -mic(x)#> [1] 0.09176831-# item difficulty -difficulty(x)#> -#> # Item Difficulty -#> -#> difficulty ideal -#> do you feel you cope well as caregiver? 0.78 0.62 -#> do you find caregiving too demanding? 0.51 0.62 -#> does caregiving cause difficulties in your relationship with your friends? 0.41 0.62 -#> does caregiving have negative effect on your physical health? 0.44 0.62 -#> does caregiving cause difficulties in your relationship with your family? 0.35 0.62 -#> does caregiving cause financial difficulties? 0.32 0.62 -#> do you feel trapped in your role as caregiver? 
0.48 0.62 -#> do you feel supported by friends/neighbours? 0.54 0.62 -#> do you feel caregiving worthwhile? 0.73 0.62-# NOT RUN { -library(sjPlot) -sjt.df(reliab_test(x), describe = FALSE, show.cmmn.row = TRUE, - string.cmmn = sprintf("Cronbach's α=%.2f", cronb(x))) - -# Compute PCA on Cope-Index, and perform a -# reliability check on each extracted factor. -factors <- sjt.pca(x)$factor.index -findex <- sort(unique(factors)) -library(sjPlot) -for (i in seq_len(length(findex))) { - rel.df <- subset(x, select = which(factors == findex[i])) - if (ncol(rel.df) >= 3) { - sjt.df(reliab_test(rel.df), describe = FALSE, show.cmmn.row = TRUE, - use.viewer = FALSE, title = "Item-Total-Statistic", - string.cmmn = sprintf("Scale's overall Cronbach's α=%.2f", - cronb(rel.df))) - } - } -# }-
Compute various measures or tests to assess the model quality, - like root mean squared error, residual standard error or mean square error - of fitted linear (mixed effects) models. For logistic regression models, - or mixed models with binary outcome, the error rate, binned residuals, - Chi-square goodness-of-fit-test or the Hosmer-Lemeshow Goodness-of-fit-test - can be performed.
- -cv(x, ...) - -chisq_gof(x, prob = NULL, weights = NULL) - -hoslem_gof(x, n.bins = 10) - -rmse(x, normalized = FALSE) - -rse(x) - -mse(x) - -error_rate(x) - -binned_resid(x, term = NULL, n.bins = NULL)- -
x | -Fitted linear model of class |
... | -More fitted model objects, to compute multiple coefficients of -variation at once. |
prob | -Vector of probabilities (indicating the population probabilities)
-of the same length as |
weights | -Vector with weights, used to weight |
n.bins | -Numeric, the number of bins to divide the data. For
- |
normalized | -Logical, use |
term | -Name of independent variable from |
rmse(), rse(), mse(), cv()
These functions return a number, the requested statistic.
A list with four values: the error rate of the full and the null model, - as well as the chi-squared and p-value from the Likelihood-Ratio-Test - between the full and null model.
A data frame representing the data that is mapped to the plot, which is - automatically plotted. In case all residuals are inside the error bounds, - points are black. If some of the residuals are outside the error bounds - (indicated by the grey-shaded area), blue points indicate residuals that - are OK, while red points indicate model under- or overfitting for the - related range of estimated probabilities.
For vectors, returns the object of the computed chisq.test
- For glm
-objects, an object of class chisq_gof
- with the following values: p.value
, the p-value for the goodness-of-fit test;
- z.score
, the standardized z-score for the goodness-of-fit test;
- rss
, the residual sums of squares term and chisq
, the pearson
- chi-squared statistic.
An object of class hoslem_test
with following values: chisq
- the Hosmer-Lemeshow chi-squared statistic; df
, degrees of freedom
- and p.value
the p-value for the goodness-of-fit test.
The RMSE is the square root of the variance of the residuals and indicates
- the absolute fit of the model to the data (difference between observed data
- and model's predicted values). “RMSE can be interpreted as the standard
- deviation of the unexplained variance, and has the useful property
- of being in the same units as the response variable. Lower values
- of RMSE indicate better fit. RMSE is a good measure of how accurately
- the model predicts the response, and is the most important criterion
- for fit if the main purpose of the model is prediction.”
- (Grace-Martin K: Assessing the Fit of Regression Models)
- The normalized RMSE is the proportion of the RMSE related to the
- range of the response variable. Hence, lower values indicate
- less residual variance.
The residual standard error is the square root of the residual - sum of squares divided by the residual degrees of freedom.
The mean square error is the mean of the sum of squared residuals, - i.e. it measures the average of the squares of the errors. Lower - values (closer to zero) indicate better fit.
The advantage of the cv is that it is unitless. This allows
- coefficients of variation to be compared to each other in ways
- that other measures, like standard deviations or root mean
- squared residuals, cannot be.
- “It is interesting to note the differences between a model's CV
- and R-squared values. Both are unitless measures that are indicative
- of model fit, but they define model fit in two different ways: CV
- evaluates the relative closeness of the predictions to the actual
- values while R-squared evaluates how much of the variability in the
- actual values is explained by the model.”
- (source: UCLA-FAQ)
The error rate is a crude measure for model fit for logistic regression
- models. It is defined as the proportion of cases for which the
- deterministic prediction is wrong, i.e. the proportion where the
- predicted probability is above 0.5, although y = 0 (and vice versa).
- In general, the error rate should be below 0.5 (i.e. 50%), the
- closer to zero, the better. Furthermore, the error rate of the full
- model should be considerably below the null model's error rate
- (cf. Gelman and Hill 2007, p. 99). The print()
-method also
- prints the results from the Likelihood-Ratio-Test, comparing the full
- to the null model.
Binned residual plots are achieved by “dividing the data into
- categories (bins) based on their fitted values, and then plotting
- the average residual versus the average fitted value for each bin.”
- (Gelman, Hill 2007: 97). If the model were true, one would
- expect about 95% of the residuals to fall inside the error bounds.
- If term
is not NULL
, one can compare the residuals in
- relation to a specific model predictor. This may be helpful to check
- if a term would fit better when transformed, e.g. a rising and falling
- pattern of residuals along the x-axis (the pattern is indicated by
- a green line) is a signal to consider taking the logarithm of the
- predictor (cf. Gelman and Hill 2007, pp. 97ff).
For vectors, this function is a convenience wrapper for
- chisq.test()
, performing goodness-of-fit test. For
- glm
-objects, this function performs a goodness-of-fit test.
- A well-fitting model shows no significant difference between the
- model and the observed data, i.e. the reported p-values should be
- greater than 0.05.
A well-fitting model shows no significant difference between - the model and the observed data, i.e. the reported p-value should be - greater than 0.05.
Gelman A, Hill J (2007) Data Analysis Using Regression and Multilevel/Hierarchical Models. Cambridge, New York: Cambridge University Press
- Everitt, Brian (1998). The Cambridge Dictionary of Statistics. Cambridge, UK New York: Cambridge University Press
- Hosmer, D. W., & Lemeshow, S. (2000). Applied Logistic Regression. Hoboken, NJ, USA: John Wiley & Sons, Inc. doi: 10.1002/0471722146
- Grace-Martin K: Assessing the Fit of Regression Models
for R-squared or pseudo-R-squared values.
-#> [1] 25.46792rse(fit)#> [1] 25.51134cv(fit)#> [1] 0.3948098#> [1] 23.43815mse(fit)#> [1] 549.3468cv(fit)#> [1] 0.07851768-# normalized RMSE -library(nlme) -fit <- lme(distance ~ age, data = Orthodont) -rmse(fit, normalized = TRUE)#> [1] 0.07242178-#coefficient of variation for variable -cv(efc$e17age)#> [1] 0.1023027-# Error Rate -efc$neg_c_7d <- ifelse(efc$neg_c_7 < median(efc$neg_c_7, na.rm = TRUE), 0, 1) -m <- glm( - neg_c_7d ~ c161sex + barthtot + c172code, - data = efc, - family = binomial(link = "logit") -) -error_rate(m)#> -#> # Error Rate of Logistic Regression Model -#> -#> Full model: 29.45% -#> Null model: 45.15% -#> -#> # Likelihood-Ratio-Test -#> -#> Chi-squared: 166.050 -#> p-value: 0.000 -#>-# Binned residuals -binned_resid(m)binned_resid(m, "barthtot")-# goodness-of-fit test for logistic regression -chisq_gof(m)#> -#> # Chi-squared Goodness-of-Fit Test -#> -#> Chi-squared: 852.765 -#> z-score: 1.025 -#> p-value: 0.305 -#>#>-# goodness-of-fit test for logistic regression -hoslem_gof(m)#> -#> # Hosmer-Lemeshow Goodness-of-Fit Test -#> -#> Chi-squared: 18.707 -#> df: 8 -#> p-value: 0.017 -#>#>-# goodness-of-fit test for vectors against probabilities -# differing from population -chisq_gof(efc$e42dep, c(0.3,0.2,0.22,0.28))#> -#> Chi-squared test for given probabilities -#> -#> data: dummy -#> X-squared = 234.76, df = 3, p-value < 2.2e-16 -#>#> -#> Chi-squared test for given probabilities -#> -#> data: dummy -#> X-squared = 0, df = 3, p-value = 1 -#>- -
computes robust standard error for regression models.
- This method calls one of the vcov*()
-functions from the
- sandwich-package for robust covariance matrix estimators. Results are
- returned as tidy data frame.
- svy()
is intended to compute standard errors for survey
- designs (complex samples) fitted with regular lm
- glm
functions, as alternative to the survey-package.
- It simulates sampling weights by adjusting the residual degrees
- of freedom based on the precision weights used to fit x
- and then calls robust()
with the adjusted model.
robust( - x, - vcov.fun = "vcovHC", - vcov.type = c("HC3", "const", "HC", "HC0", "HC1", "HC2", "HC4", "HC4m", "HC5"), - vcov.args = NULL, - conf.int = FALSE, - exponentiate = FALSE -) - -svy( - x, - vcov.fun = "vcovHC", - vcov.type = c("HC1", "const", "HC", "HC0", "HC3", "HC2", "HC4", "HC4m", "HC5"), - vcov.args = NULL, - conf.int = FALSE, - exponentiate = FALSE -)- -
x | -A fitted model of any class that is supported by the |
vcov.fun | -String, indicating the name of the |
vcov.type | -Character vector, specifying the estimation type for the
-robust covariance matrix estimation (see |
vcov.args | -List of named vectors, used as additional arguments that
-are passed down to |
conf.int | -Logical, |
exponentiate | -Logical, whether to exponentiate the coefficient estimates -and confidence intervals (typical for logistic regression). |
A summary of the model, including estimates, robust standard error, - p-value and - optionally - the confidence intervals.
simply calls robust()
, but first adjusts the
- residual degrees of freedom based on the model weights.
- Hence, for svy()
, x
should be fitted with weights.
- This simulates sampling weights like in survey designs, though
- lm
and glm
implement precision weights.
- The results from svy()
are usually more accurate than simple
- weighted standard errors for complex samples. However, results from
- the survey package are still more exact, especially
- regarding the estimates.
- vcov.type
for svy()
defaults to "HC1"
, because
- standard errors with this estimation type come closest to the standard
- errors from the survey-package.
- Currently, svy()
only works for objects of class lm
-#> -#> Call: -#> lm(formula = barthtot ~ c160age + c12hour + c161sex + c172code, -#> data = efc) -#> -#> Residuals: -#> Min 1Q Median 3Q Max -#> -74.639 -15.246 4.251 19.009 73.327 -#> -#> Coefficients: -#> Estimate Std. Error t value Pr(>|t|) -#> (Intercept) 90.06448 6.17237 14.592 <2e-16 *** -#> c160age -0.22156 0.07111 -3.116 0.0019 ** -#> c12hour -0.27810 0.01865 -14.915 <2e-16 *** -#> c161sex -0.26178 2.08649 -0.125 0.9002 -#> c172code -0.76215 1.41971 -0.537 0.5915 -#> --- -#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 -#> -#> Residual standard error: 25.35 on 816 degrees of freedom -#> (87 observations deleted due to missingness) -#> Multiple R-squared: 0.2696, Adjusted R-squared: 0.266 -#> F-statistic: 75.28 on 4 and 816 DF, p-value: < 2.2e-16 -#>robust(fit)#> term estimate std.error statistic p.value -#> 1 (Intercept) 90.0644792 5.58174936 16.1355291 4.832660e-51 -#> 2 c160age -0.2215581 0.07106417 -3.1177189 1.886499e-03 -#> 3 c12hour -0.2781004 0.02119537 -13.1208077 8.012747e-36 -#> 4 c161sex -0.2617783 1.91864853 -0.1364389 8.915080e-01 -#> 5 c172code -0.7621525 1.39456606 -0.5465159 5.848608e-01-confint(fit)#> 2.5 % 97.5 % -#> (Intercept) 77.9488902 102.18006829 -#> c160age -0.3611297 -0.08198647 -#> c12hour -0.3146997 -0.24150107 -#> c161sex -4.3573007 3.83374416 -#> c172code -3.5488594 2.02455439robust(fit, conf.int = TRUE)#> term estimate std.error conf.low conf.high statistic -#> 1 (Intercept) 90.0644792 5.58174936 79.1082006 101.02075786 16.1355291 -#> 2 c160age -0.2215581 0.07106417 -0.3610482 -0.08206799 -3.1177189 -#> 3 c12hour -0.2781004 0.02119537 -0.3197042 -0.23649650 -13.1208077 -#> 4 c161sex -0.2617783 1.91864853 -4.0278463 3.50428977 -0.1364389 -#> 5 c172code -0.7621525 1.39456606 -3.4995120 1.97520692 -0.5465159 -#> p.value -#> 1 4.832660e-51 -#> 2 1.886499e-03 -#> 3 8.012747e-36 -#> 4 8.915080e-01 -#> 5 5.848608e-01robust(fit, vcov.type = "HC1", conf.int = TRUE) # "HC1" should be Stata default#> term estimate 
std.error conf.low conf.high statistic -#> 1 (Intercept) 90.0644792 5.55391758 79.1628309 100.96612755 16.2163874 -#> 2 c160age -0.2215581 0.07066514 -0.3602650 -0.08285124 -3.1353241 -#> 3 c12hour -0.2781004 0.02103734 -0.3193941 -0.23680669 -13.2193675 -#> 4 c161sex -0.2617783 1.90939743 -4.0096876 3.48613101 -0.1370999 -#> 5 c172code -0.7621525 1.38747118 -3.4855856 1.96128056 -0.5493105 -#> p.value -#> 1 1.788307e-51 -#> 2 1.778136e-03 -#> 3 2.730414e-36 -#> 4 8.909856e-01 -#> 5 5.829427e-01-library(sjmisc) -# dichtomozize service usage by "service usage yes/no" -efc$services <- sjmisc::dicho(efc$tot_sc_e, dich.by = 0) -fit <- glm(services ~ neg_c_7 + c161sex + e42dep, - data = efc, family = binomial(link = "logit")) - -robust(fit)#> term estimate std.error statistic p.value -#> 1 (Intercept) -0.5198021 0.38048112 -1.366171 0.172232067 -#> 2 neg_c_7 0.0419026 0.02035790 2.058296 0.039853669 -#> 3 c161sex -0.2189336 0.16263979 -1.346126 0.178606278 -#> 4 e42dep 0.2134784 0.07970607 2.678320 0.007536269robust(fit, conf.int = TRUE, exponentiate = TRUE)#> term estimate std.error conf.low conf.high statistic p.value -#> 1 (Intercept) 0.5946382 0.38048112 0.2818017 1.254764 -1.366171 0.172232067 -#> 2 neg_c_7 1.0427929 0.02035790 1.0019492 1.085302 2.058296 0.039853669 -#> 3 c161sex 0.8033751 0.16263979 0.5838345 1.105470 -1.346126 0.178606278 -#> 4 e42dep 1.2379767 0.07970607 1.0587020 1.447609 2.678320 0.007536269-
Compute an approximated sample size for linear mixed models - (two-level-designs), based on power-calculation for standard - design and adjusted for design effect for 2-level-designs.
-samplesize_mixed( - eff.size, - df.n = NULL, - power = 0.8, - sig.level = 0.05, - k, - n, - icc = 0.05 -) - -smpsize_lmm( - eff.size, - df.n = NULL, - power = 0.8, - sig.level = 0.05, - k, - n, - icc = 0.05 -)- -
eff.size | -Effect size. |
df.n | -Optional argument for the degrees of freedom for numerator. See 'Details'. |
power | -Power of test (1 minus Type II error probability). |
sig.level | -Significance level (Type I error probability). |
k | -Number of cluster groups (level-2-unit) in multilevel-design. |
n | -Optional, number of observations per cluster groups -(level-2-unit) in multilevel-design. |
icc | -Expected intraclass correlation coefficient for multilevel-model. |
A list with two values: The number of subjects per cluster, and the - total sample size for the linear mixed model.
-The sample size calculation is based on a power-calculation for the
- standard design. If df.n
is not specified, a power-calculation
- for an unpaired two-sample t-test will be computed (using
- pwr.t.test
of the pwr-package).
- If df.n
is given, a power-calculation for general linear models
- will be computed (using pwr.f2.test
of the
- pwr-package). The sample size of the standard design
- is then adjusted for the design effect of two-level-designs (see
- design_effect
). Thus, the sample size calculation is appropriate
- in particular for two-level-designs (see Snijders 2005). Models that
- additionally include repeated measures (three-level-designs) may work
- as well, however, the computed sample size may be less accurate.
Cohen J. 1988. Statistical power analysis for the behavioral sciences (2nd ed.). Hillsdale,NJ: Lawrence Erlbaum.
- Hsieh FY, Lavori PW, Cohen HJ, Feussner JR. 2003. An Overview of Variance Inflation Factors for Sample-Size Calculation. Evaluation and the Health Professions 26: 239-257. doi: 10.1177/0163278703255230
- Snijders TAB. 2005. Power and Sample Size in Multilevel Linear Models. In: Everitt BS, Howell DC (Hrsg.). Encyclopedia of Statistics in Behavioral Science. Chichester, UK: John Wiley and Sons, Ltd. doi: 10.1002/0470013192.bsa492
-# Sample size for multilevel model with 30 cluster groups and a small to -# medium effect size (Cohen's d) of 0.3. 27 subjects per cluster and -# hence a total sample size of about 802 observations is needed. -samplesize_mixed(eff.size = .3, k = 30)#> $`Subjects per Cluster` -#> [1] 27 -#> -#> $`Total Sample Size` -#> [1] 802 -#>-# Sample size for multilevel model with 20 cluster groups and a medium -# to large effect size for linear models of 0.2. Five subjects per cluster and -# hence a total sample size of about 107 observations is needed. -samplesize_mixed(eff.size = .2, df.n = 5, k = 20, power = .9)#> $`Subjects per Cluster` -#> [1] 5 -#> -#> $`Total Sample Size` -#> [1] 107 -#>
- scale_weights.Rd
Most functions to fit multilevel and mixed effects models only
- allow specifying frequency weights, but not design (i.e. sampling or probability)
- weights, which should be used when analyzing complex samples and survey data.
- scale_weights()
implements an algorithm proposed by Asparouhov (2006)
- and Carle (2009) to rescale design weights in survey data to account for
- the grouping structure of multilevel models, which then can be used for
- multilevel modelling.
scale_weights(x, cluster.id, pweight)- -
x | -A data frame. |
cluster.id | -Variable indicating the grouping structure (strata) of -the survey data (level-2-cluster variable). |
pweight | -Variable indicating the probability (design or sampling) -weights of the survey data (level-1-weight). |
, with two new variables: svywght_a
and svywght_b
- which represent the rescaled design weights to use in multilevel models
- (use these variables for the weights
Rescaling is based on two methods: For svywght_a
, the sample
- weights pweight
are adjusted by a factor that represents the proportion
- of cluster size divided by the sum of sampling weights within each cluster.
- The adjustment factor for svywght_b
is the sum of sample weights
- within each cluster divided by the sum of squared sample weights within
- each cluster (see Carle (2009), Appendix B).
- Regarding the choice between scaling methods A and B, Carle suggests
- that "analysts who wish to discuss point estimates should report results
- based on weighting method A. For analysts more interested in residual
- between-cluster variance, method B may generally provide the least biased
- estimates". In general, it is recommended to fit a non-weighted model
- and weighted models with both scaling methods and when comparing the
- models, see whether the "inferential decisions converge", to gain
- confidence in the results.
- Though the bias of scaled weights decreases with increasing cluster size,
- method A is preferred when insufficient or low cluster size is a concern.
- The cluster ID and probably PSU may be used as random effects (e.g.
- nested design, or cluster and PSU as varying intercepts), depending
- on the survey design that should be mimicked.
Carle AC. Fitting multilevel models in complex survey data with design weights: Recommendations BMC Medical Research Methodology 2009, 9(49): 1-13
- Asparouhov T. General Multi-Level Modeling with Sampling Weights Communications in Statistics - Theory and Methods 2006, 35: 439-460
-#>#> # A tibble: 2,992 x 9 -#> total age RIAGENDR RIDRETH1 SDMVPSU SDMVSTRA WTINT2YR svywght_a svywght_b -#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> -#> 1 1 2.2 1 3 2 31 97594. 1.57 1.20 -#> 2 7 2.08 2 3 1 29 39599. 0.623 0.525 -#> 3 3 1.48 2 1 2 42 26620. 0.898 0.544 -#> 4 4 1.32 2 4 2 33 34999. 0.708 0.550 -#> 5 1 2 2 1 1 41 14746. 0.422 0.312 -#> 6 6 2.2 2 4 1 38 28232. 0.688 0.516 -#> 7 350 1.6 1 3 2 33 93162. 1.89 1.46 -#> 8 NA 1.48 2 3 1 29 82276. 1.29 1.09 -#> 9 3 2.28 2 4 1 41 24726. 0.707 0.523 -#> 10 30 0.84 1 3 2 35 39895. 0.760 0.594 -#> # ... with 2,982 more rows-if (require("lme4")) { - nhanes_sample <- scale_weights(nhanes_sample, SDMVSTRA, WTINT2YR) - glmer( - total ~ factor(RIAGENDR) * (log(age) + factor(RIDRETH1)) + (1 | SDMVPSU), - family = poisson(), - data = nhanes_sample, - weights = svywght_a - ) -}#>#> Generalized linear mixed model fit by maximum likelihood (Laplace -#> Approximation) [glmerMod] -#> Family: poisson ( log ) -#> Formula: total ~ factor(RIAGENDR) * (log(age) + factor(RIDRETH1)) + (1 | -#> SDMVPSU) -#> Data: nhanes_sample -#> Weights: svywght_a -#> AIC BIC logLik deviance df.resid -#> 78844.27 78920.47 -39409.14 78818.27 2582 -#> Random effects: -#> Groups Name Std.Dev. -#> SDMVPSU (Intercept) 0.1018 -#> Number of obs: 2595, groups: SDMVPSU, 2 -#> Fixed Effects: -#> (Intercept) factor(RIAGENDR)2 -#> 2.491801 -1.021308 -#> log(age) factor(RIDRETH1)2 -#> 0.838726 -0.088627 -#> factor(RIDRETH1)3 factor(RIDRETH1)4 -#> -0.013333 0.722511 -#> factor(RIDRETH1)5 factor(RIAGENDR)2:log(age) -#> -0.106521 -1.012695 -#> factor(RIAGENDR)2:factor(RIDRETH1)2 factor(RIAGENDR)2:factor(RIDRETH1)3 -#> -0.009086 0.732985 -#> factor(RIAGENDR)2:factor(RIDRETH1)4 factor(RIAGENDR)2:factor(RIDRETH1)5 -#> 0.275967 0.542074
Compute standard error for a variable, for all variables - of a data frame, for joint random and fixed effects - coefficients of (non-/linear) mixed models, the adjusted - standard errors for generalized linear (mixed) models, or - for intraclass correlation coefficients (ICC).
- -se(x, ...)- -
x | -(Numeric) vector, a data frame, an |
... | -Currently not used. |
The standard error of x
Standard error for variables
- For variables and data frames, the standard error is the square root of the
- variance divided by the number of observations (length of vector).
- Standard error for mixed models
- For linear mixed models, and generalized linear mixed models, this
- function computes the standard errors for joint (sums of) random and fixed
- effects coefficients (unlike se.coef
, which returns the
- standard error for fixed and random effects separately). Hence, se()
- returns the appropriate standard errors for coef.merMod
- Standard error for generalized linear models
- For generalized linear models, approximated standard errors, using the delta
- method for transformed regression parameters are returned (Oehlert 1992).
- Standard error for proportions and mean value
- To compute the standard error for relative frequencies (i.e. proportions, or
- mean value if x
has only two categories), this vector must be supplied
- as table, e.g. se(table(iris$Species))
. se()
then computes the
- relative frequencies (proportions) for each value and the related standard
- error for each value. This might be useful to add standard errors or confidence
- intervals to descriptive statistics. If standard errors for weighted variables
- are required, use xtabs()
, e.g. se(xtabs(weights ~ variable))
- Standard error for regression coefficient and p-value
- se()
also returns the standard error of an estimate (regression
- coefficient) and p-value, assuming a normal distribution to compute
- the z-score from the p-value (formula in short: b / qnorm(p / 2)
- See 'Examples'.
Computation of standard errors for coefficients of mixed models
- is based on this code.
- Standard errors for generalized linear (mixed) models, if
- type = "re"
, are approximations based on the delta
- method (Oehlert 1992).
- A remark on standard errors:
- “Standard error represents variation in the point estimate, but
- confidence interval has usual Bayesian interpretation only with flat prior.”
- (Gelman 2017)
Oehlert GW. 1992. A note on the delta method. American Statistician 46(1).
- Gelman A 2017. How to interpret confidence intervals? http://andrewgelman.com/2017/03/04/interpret-confidence-intervals/
-#> [1] 0.09589874#> c12hour e15relat e16sex -#> 1.69162290 0.06942207 0.01565588-# compute standard error for merMod-coefficients -library(lme4) -fit <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy) -se(fit)#> $Subject -#> (Intercept) Days -#> 1 13.86572 2.775269 -#> 2 13.86572 2.775269 -#> 3 13.86572 2.775269 -#> 4 13.86572 2.775269 -#> 5 13.86572 2.775269 -#> 6 13.86572 2.775269 -#> 7 13.86572 2.775269 -#> 8 13.86572 2.775269 -#> 9 13.86572 2.775269 -#> 10 13.86572 2.775269 -#> 11 13.86572 2.775269 -#> 12 13.86572 2.775269 -#> 13 13.86572 2.775269 -#> 14 13.86572 2.775269 -#> 15 13.86572 2.775269 -#> 16 13.86572 2.775269 -#> 17 13.86572 2.775269 -#> 18 13.86572 2.775269 -#>-# compute odds-ratio adjusted standard errors, based on delta method -# with first-order Taylor approximation. -data(efc) -efc$services <- sjmisc::dicho(efc$tot_sc_e, dich.by = 0) -fit <- glm( - services ~ neg_c_7 + c161sex + e42dep, - data = efc, - family = binomial(link = "logit") -) -se(fit)#> # A tibble: 4 x 3 -#> term estimate std.error -#> <chr> <dbl> <dbl> -#> 1 (Intercept) 0.595 0.224 -#> 2 neg_c_7 1.04 0.0204 -#> 3 c161sex 0.803 0.130 -#> 4 e42dep 1.24 0.0972-# compute odds-ratio adjusted standard errors for generalized -# linear mixed model, also based on delta method - -# create binary response -sleepstudy$Reaction.dicho <- dicho(sleepstudy$Reaction, dich.by = "median") -fit <- glmer( - Reaction.dicho ~ Days + (Days | Subject), - data = sleepstudy, - family = binomial("logit") -) -se(fit)#> $Subject -#> (Intercept) Days -#> 1 1.859251 0.4700877 -#> 2 2.622321 0.4115004 -#> 3 2.622321 0.4115004 -#> 4 1.633616 0.3289753 -#> 5 1.745522 0.4826391 -#> 6 1.714059 0.3560052 -#> 7 1.715195 0.4646524 -#> 8 2.219966 0.4133501 -#> 9 2.622321 0.4115004 -#> 10 1.806673 0.5077360 -#> 11 2.314690 0.3959941 -#> 12 2.111495 0.4335945 -#> 13 1.830037 0.3388438 -#> 14 1.686571 0.4933028 -#> 15 1.988285 0.4534415 -#> 16 2.167228 0.4013762 -#> 17 2.314690 0.3959941 -#> 18 1.778644 0.4321839 
-#>-# compute standard error for proportions -efc$e42dep <- to_label(efc$e42dep) -se(table(efc$e42dep))#> value proportion std.error -#> 1 independent 0.07325194 0.008680166 -#> 2 slightly dependent 0.24972253 0.014420404 -#> 3 moderately dependent 0.33962264 0.015777276 -#> 4 severely dependent 0.33740289 0.015752039#> value proportion std.error -#> 1 independent 0.07061821 0.008552969 -#> 2 slightly dependent 0.25359865 0.014525167 -#> 3 moderately dependent 0.33761265 0.015787989 -#> 4 severely dependent 0.33817049 0.015794372-# compute standard error from regression coefficient and p-value -se(list(estimate = .3, p.value = .002))#> [1] 0.09708008-# NOT RUN { -# compute standard error of ICC for the linear mixed model -icc(fit) -se(icc(fit)) - -# the standard error for the ICC can be computed manually in this way, -# taking the fitted model example from above -library(dplyr) -library(purrr) -dummy <- sleepstudy %>% - # generate 100 bootstrap replicates of dataset - bootstrap(100) %>% - # run mixed effects regression on each bootstrap replicate - # and compute ICC for each "bootstrapped" regression - mutate( - models = map(strap, ~lmer(Reaction ~ Days + (Days | Subject), data = .x)), - icc = map_dbl(models, ~icc(.x)) - ) - -# now compute SE and p-values for the bootstrapped ICC, values -# may differ from above example due to random seed -boot_se(dummy, icc) -boot_p(dummy, icc) -# }- -
Compute the standard error for the sample mean for mixed models, - regarding the extent to which clustering affects the standard errors. - May be used as part of the multilevel power calculation for cluster sampling - (see Gelman and Hill 2007, 447ff).
-se_ybar(fit)- -
fit | -Fitted mixed effects model ( |
The standard error of the sample mean of fit
Gelman A, Hill J. 2007. Data analysis using regression and multilevel/hierarchical models. Cambridge, New York: Cambridge University Press
- --#> Subject -#> 9.049936
Collection of convenient functions for common statistical computations, which are not directly provided by R's base or stats packages.
-This package aims at providing, first, shortcuts for statistical measures, which otherwise could only be calculated with additional effort (like standard errors or root mean squared errors).
-Second, these shortcut functions are generic (if appropriate), and can be applied not only to vectors, but also to other objects (e.g., the Coefficient of Variation can be computed for vectors, linear models, or linear mixed models; the r2()
-function returns the r-squared value for lm
, glm
, merMod
, glmmTMB
, or lme
and other objects).
Most functions of this package are designed as summary functions, i.e. they do not transform the input vector; rather, they return a summary, which is sometimes a vector and sometimes a tidy data frame. The focus of most functions lies on summary statistics or fit measures for regression models, including generalized linear models, mixed effects models or Bayesian models. However, some of the functions deal with other statistical measures, like Cronbach's Alpha, Cramer's V, Phi etc.
-The comprised tools include:
For regression and mixed models: Coefficient of Variation, Root Mean Squared Error, Residual Standard Error, Coefficient of Discrimination, R-squared and pseudo-R-squared values, standardized beta values
Especially for mixed models: Design effect, ICC, sample size calculation and convergence tests
Especially for Bayesian models: Highest Density Interval, region of practical equivalence (rope), Monte Carlo Standard Errors, ratio of number of effective samples, mediation analysis, Test for Practical Equivalence
Fit and accuracy measures for regression models: Overdispersion tests, accuracy of predictions, test/training-error comparisons, error rate and binned residual plots for logistic regression models
For anova-tables: Eta-squared, Partial Eta-squared, Omega-squared and Partial Omega-squared statistics
Furthermore, sjstats has functions to access information from model objects, which either support more model objects than their stats counterparts, or provide easy access to model attributes, like:
to get the model frame
to get information about the model family, link functions etc.
to get the link-inverse function
and resp_var()
to get the names of either the dependent or independent variables, or
to get the "cleaned" variable names from a model object (cleaned means that things like s()
or log()
are removed from the returned character vector with variable names.)
Other statistics:
Cramer's V, Cronbach's Alpha, Mean Inter-Item-Correlation, Mann-Whitney-U-Test, Item-scale reliability tests
Compute an approximated sample size for linear mixed models - (two-level-designs), based on power-calculation for standard - design and adjusted for design effect for 2-level-designs.
- -smpsize_lmm(eff.size, df.n = NULL, power = 0.8, sig.level = 0.05, k, - n, icc = 0.05)- -
eff.size | -Effect size. |
df.n | -Optional argument for the degrees of freedom for numerator. See 'Details'. |
power | -Power of test (1 minus Type II error probability). |
sig.level | -Significance level (Type I error probability). |
k | -Number of cluster groups (level-2-unit) in multilevel-design. |
n | -Optional, number of observations per cluster groups -(level-2-unit) in multilevel-design. |
icc | -Expected intraclass correlation coefficient for multilevel-model. |
A list with two values: The number of subjects per cluster, and the - total sample size for the linear mixed model.
- -The sample size calculation is based on a power-calculation for the
- standard design. If df.n
is not specified, a power-calculation
- for an unpaired two-sample t-test will be computed (using
- pwr.t.test
of the pwr-package).
- If df.n
is given, a power-calculation for general linear models
- will be computed (using pwr.f2.test
of the
- pwr-package). The sample size of the standard design
- is then adjusted for the design effect of two-level-designs (see
- deff
). Thus, the sample size calculation is appropriate
- in particular for two-level-designs (see Snijders 2005). Models that
- additionally include repeated measures (three-level-designs) may work
- as well, however, the computed sample size may be less accurate.
Cohen J. 1988. Statistical power analysis for the behavioral sciences (2nd ed.). Hillsdale,NJ: Lawrence Erlbaum.
- Hsieh FY, Lavori PW, Cohen HJ, Feussner JR. 2003. An Overview of Variance Inflation Factors for Sample-Size Calculation. Evaluation and the Health Professions 26: 239-257. doi: 10.1177/0163278703255230
- Snijders TAB. 2005. Power and Sample Size in Multilevel Linear Models. In: Everitt BS, Howell DC (Hrsg.). Encyclopedia of Statistics in Behavioral Science. Chichester, UK: John Wiley and Sons, Ltd. doi: 10.1002/0470013192.bsa492
-# Sample size for multilevel model with 30 cluster groups and a small to -# medium effect size (Cohen's d) of 0.3. 27 subjects per cluster and -# hence a total sample size of about 802 observations is needed. -smpsize_lmm(eff.size = .3, k = 30)#> $`Subjects per Cluster` -#> [1] 27 -#> -#> $`Total Sample Size` -#> [1] 802 -#>-# Sample size for multilevel model with 20 cluster groups and a medium -# to large effect size for linear models of 0.2. Five subjects per cluster and -# hence a total sample size of about 107 observations is needed. -smpsize_lmm(eff.size = .2, df.n = 5, k = 20, power = .9)#> $`Subjects per Cluster` -#> [1] 5 -#> -#> $`Total Sample Size` -#> [1] 107 -#>- -
- std_beta.Rd
Returns the standardized beta coefficients, std. error and confidence intervals - of fitted linear (mixed) models.
-std_beta(fit, ...) - -# S3 method for merMod -std_beta(fit, ci.lvl = 0.95, ...) - -# S3 method for lm -std_beta(fit, type = "std", ci.lvl = 0.95, ...) - -# S3 method for gls -std_beta(fit, type = "std", ci.lvl = 0.95, ...)- -
fit | -Fitted linear (mixed) model of class |
... | -Currently not used. |
ci.lvl | -Numeric, the level of the confidence intervals. |
type | -If |
A tibble
with term names, standardized beta coefficients,
- standard error and confidence intervals of fit
“Standardized coefficients refer to how many standard deviations a dependent variable will change, - per standard deviation increase in the predictor variable. Standardization of the coefficient is - usually done to answer the question of which of the independent variables have a greater effect - on the dependent variable in a multiple regression analysis, when the variables are measured - in different units of measurement (for example, income measured in dollars and family size - measured in number of individuals)” (Source: Wikipedia)
-For gls
-objects, standardized beta coefficients may be wrong
- for categorical variables (factors
), because the model.matrix
- gls
objects returns the original data of the categorical vector,
- and not the 'dummy' coded vectors as for other classes. See, as example:
- head(model.matrix(lm(neg_c_7 ~ as.factor(e42dep), data = efc, na.action = na.omit)))
- head(model.matrix(nlme::gls(neg_c_7 ~ as.factor(e42dep), data = efc, na.action = na.omit)))
- In such cases, use to_dummy
to create dummies from
- factors.
Wikipedia: Standardized coefficient
- Gelman A. 2008. Scaling regression inputs by dividing by two standard deviations. Statistics in Medicine 27: 2865-2873 http://www.stat.columbia.edu/~gelman/research/published/standardizing7.pdf
-# fit linear model -fit <- lm(Ozone ~ Wind + Temp + Solar.R, data = airquality) -# print std. beta coefficients -std_beta(fit)#> term std.estimate std.error conf.low conf.high -#> 1 Wind -0.3564122 0.06996619 -0.4935434 -0.2192810 -#> 2 Temp 0.4731461 0.07260889 0.3308353 0.6154569 -#> 3 Solar.R 0.1638655 0.06351430 0.0393798 0.2883513-# print std. beta coefficients and ci, using -# 2 sd and center binary predictors -std_beta(fit, type = "std2")#> term std.estimate std.error conf.low conf.high -#> 1 Wind -23.71992 4.656386 -32.846272 -14.59358 -#> 2 Temp 31.48879 4.832262 22.017729 40.95985 -#> 3 Solar.R 10.90557 4.226999 2.620802 19.19034-# std. beta for mixed models -library(lme4) -fit1 <- lmer(Reaction ~ Days + (Days | Subject), sleepstudy) -std_beta(fit)#> term std.estimate std.error conf.low conf.high -#> 1 Wind -0.3564122 0.06996619 -0.4935434 -0.2192810 -#> 2 Temp 0.4731461 0.07260889 0.3308353 0.6154569 -#> 3 Solar.R 0.1638655 0.06351430 0.0393798 0.2883513-
- svyglm.nb.Rd
is an extension to the survey-package
- to fit survey-weighted negative binomial models. It uses
- svymle
to fit sampling-weighted
- maximum likelihood estimates, based on starting values provided
- by glm.nb
, as proposed by Lumley
- (2010, pp249).
svyglm.nb(formula, design, ...)- -
formula | -An object of class |
design | -An object of class |
... | -Other arguments passed down to |
An object of class svymle
- with some additional information about the model.
For details on the computation method, see Lumley (2010), Appendix E
- (especially 254ff.)
- sjstats implements the following S3-methods for svyglm.nb
- family()
, model.frame()
, formula()
, print()
- predict()
and residuals()
. However, these functions have some
- limitations:
simply returns the family-object from the
- underlying glm.nb
The predict()
-method just re-fits the svyglm.nb
- with glm.nb
, overwrites the $coefficients
- from this model-object with the coefficients from the returned
- svymle
-object and finally calls
- predict.glm
to compute the predicted values.
re-fits the svyglm.nb
-model with
- glm.nb
and then computes the Pearson-residuals
- from the glm.nb
Lumley T (2010). Complex Surveys: a guide to analysis using R. Wiley
- --# ------------------------------------------ -# This example reproduces the results from -# Lumley 2010, figure E.7 (Appendix E, p256) -# ------------------------------------------ -if (require("survey")) { - data(nhanes_sample) - - # create survey design - des <- svydesign( - id = ~SDMVPSU, - strat = ~SDMVSTRA, - weights = ~WTINT2YR, - nest = TRUE, - data = nhanes_sample - ) - - # fit negative binomial regression - fit <- svyglm.nb(total ~ factor(RIAGENDR) * (log(age) + factor(RIDRETH1)), des) - - # print coefficients and standard errors - fit -}#>#>#>#> -#>#>-#> -#>#>-#> -#>#> term irr std.error conf.low conf.high -#> 2 (Intercept) 9.8463 0.1556 7.2578 13.3580 -#> 3 factor(RIAGENDR)2 0.4511 0.1805 0.3167 0.6426 -#> 4 log(age) 2.9163 0.2331 1.8467 4.6056 -#> 5 factor(RIDRETH1)2 1.0859 0.1477 0.8130 1.4504 -#> 6 factor(RIDRETH1)3 1.0977 0.1779 0.7746 1.5556 -#> 7 factor(RIDRETH1)4 2.2686 0.2974 1.2665 4.0634 -#> 8 factor(RIDRETH1)5 1.0589 0.3789 0.5039 2.2250 -#> 9 factor(RIAGENDR)2:log(age) 0.2947 0.2651 0.1753 0.4955 -#> 10 factor(RIAGENDR)2:factor(RIDRETH1)2 0.8314 0.2611 0.4984 1.3870 -#> 11 factor(RIAGENDR)2:factor(RIDRETH1)3 1.8285 0.1931 1.2523 2.6698 -#> 12 factor(RIAGENDR)2:factor(RIDRETH1)4 1.0668 0.3747 0.5119 2.2232 -#> 13 factor(RIAGENDR)2:factor(RIDRETH1)5 1.4564 0.4427 0.6116 3.4680 -#> p.value -#> 2 <0.001 *** -#> 3 <0.001 *** -#> 4 <0.001 *** -#> 5 0.5769 -#> 6 0.6003 -#> 7 0.0059 ** -#> 8 0.8800 -#> 9 <0.001 *** -#> 10 0.4795 -#> 11 0.0018 ** -#> 12 0.8630 -#> 13 0.3957 -#> -#> Dispersion parameter Theta: 0.8062 -#> Standard Error of Theta: 0.0216#> -#>
is an extension to the survey-package
- to fit survey-weighted zero-inflated Poisson models. It uses
- svymle
to fit sampling-weighted
- maximum likelihood estimates, based on starting values provided
- by zeroinfl
svyglm.zip(formula, design, ...)- -
formula | -An object of class |
design | -An object of class |
... | -Other arguments passed down to |
An object of class svymle
and svyglm.zip
- with some additional information about the model.
Code modified from https://notstatschat.rbind.io/2015/05/26/zero-inflated-poisson-from-complex-samples/.
- --if (require("survey")) { - data(nhanes_sample) - set.seed(123) - nhanes_sample$malepartners <- rpois(nrow(nhanes_sample), 2) - nhanes_sample$malepartners[sample(1:2992, 400)] <- 0 - - # create survey design - des <- svydesign( - id = ~SDMVPSU, - strat = ~SDMVSTRA, - weights = ~WTINT2YR, - nest = TRUE, - data = nhanes_sample - ) - - # fit negative binomial regression - fit <- svyglm.zip( - malepartners ~ age + factor(RIDRETH1) | age + factor(RIDRETH1), - des - ) - - # print coefficients and standard errors - fit -}#> Warning: non-integer #successes in a binomial glm!#> term estimate std.error conf.low conf.high p.value -#> 2 age 0.0149 0.0354 0.9469 1.0879 0.6745 -#> 3 factor(RIDRETH1)2 0.0185 0.0754 0.8787 1.1810 0.8062 -#> 4 factor(RIDRETH1)3 -0.0449 0.0284 0.9043 1.0107 0.1133 -#> 5 factor(RIDRETH1)4 -0.0240 0.0276 0.9250 1.0305 0.3843 -#> 6 factor(RIDRETH1)5 0.0371 0.0617 0.9197 1.1712 0.5470 -#> 7 tp.(Intercept) -1.6694 0.4717 0.0747 0.4748 <0.001 *** -#> 8 tp.age -0.0333 0.2831 0.5553 1.6848 0.9064 -#> 9 tp.factor(RIDRETH1)2 0.1548 0.2571 0.7053 1.9323 0.5472 -#> 10 tp.factor(RIDRETH1)3 -0.3969 0.2111 0.4446 1.0169 0.0601 . -#> 11 tp.factor(RIDRETH1)4 -0.2330 0.3050 0.4357 1.4402 0.4450 -#> 12 tp.factor(RIDRETH1)5 -0.3303 0.4744 0.2836 1.8214 0.4863#> -#>
This function calculates a table's cell, row and column percentages as - well as expected values and returns all results as lists of tables.
-table_values(tab, digits = 2)- -
tab | -Simple |
digits | -Amount of digits for the table percentage values. |
(Invisibly) returns a list with four tables:
a table with cell percentages of tab
a table with row percentages of tab
a table with column percentages of tab
a table with expected values of tab
-tab <- table(sample(1:2, 30, TRUE), sample(1:3, 30, TRUE)) -# show expected values -table_values(tab)$expected#> A B C -#> A 4 5 4 -#> B 5 7 5# show cell percentages -table_values(tab)$cell#> 1 2 3 -#> -#> 1 16.67 10.00 13.33 -#> 2 13.33 30.00 16.67-
Returns a tidy summary output for stan models.
-tidy_stan( - x, - prob = 0.89, - typical = "median", - trans = NULL, - effects = c("all", "fixed", "random"), - component = c("all", "conditional", "zero_inflated", "zi"), - digits = 2 -)- -
x | -A |
prob | -Vector of scalars between 0 and 1, indicating the mass within -the credible interval that is to be estimated. |
typical | -The typical value that will represent the Bayesian point estimate.
-By default, the posterior median is returned. See |
trans | -Name of a function or character vector naming a function, used
-to apply transformations on the estimates and uncertainty intervals. The
-values for standard errors are not transformed! If |
effects | -Should results for fixed effects, random effects or both be returned? -Only applies to mixed models. May be abbreviated. |
component | -Should results for all parameters, parameters for the conditional model -or the zero-inflated part of the model be returned? May be abbreviated. Only -applies to brms-models. |
digits | -Amount of digits to round numerical values in the output. |
A data frame, summarizing x
, with consistent column names.
- To distinguish multiple HDI values, column names for the HDI get a suffix
- when prob
has more than one element.
The returned data frame has an additional class-attribute,
- tidy_stan
, to pass the result to its own print()
- The print()
-method creates a cleaner output, especially for multilevel,
- zero-inflated or multivariate response models, where - for instance -
- the conditional part of a model is printed separately from the zero-inflated
- part, or random and fixed effects are printed separately.
- The returned data frame gives information on:
The Bayesian point estimate (column estimate, which is by
- default the posterior median; other statistics are also possible,
- see argument typical
The standard error (which is actually the median absolute deviation).
The HDI. Computation for HDI is based on the - code from Kruschke 2015, pp. 727f.
The Probability of Direction (pd), which is an index for "effect significance" - (see Makowski et al. 2019). A value of 95% or higher indicates a - "significant" (i.e. statistically clear) effect.
The effective number of samples, ESS.
The Rhat statistics. When Rhat is above 1, it usually indicates that - the chain has not yet converged, indicating that the drawn samples - might not be trustworthy. Drawing more iterations may solve this issue.
The Monte Carlo standard error (see mcse
). It is defined
- as standard deviation of the chains divided by their effective sample
- size and “provides a quantitative suggestion of how big the
- estimation noise is” (Kruschke 2015, p.187).
Kruschke JK. Doing Bayesian Data Analysis: A Tutorial with R, JAGS, and Stan 2nd edition. Academic Press, 2015
-Gelman A, Carlin JB, Stern HS, Dunson DB, Vehtari A, Rubin DB. Bayesian data analysis 3rd ed. Boca Raton: Chapman and Hall/CRC, 2013
-Gelman A, Rubin DB. Inference from iterative simulation using multiple sequences Statistical Science 1992;7: 457-511
-Makowski D, Ben-Shachar MS, Lüdecke D. bayestestR: Describing Effects and their Uncertainty, Existence and Significance within the Bayesian Framework. Journal of Open Source Software 2019;4:1541. doi: 10.21105/joss.01541
-McElreath R. Statistical Rethinking. A Bayesian Course with Examples in R and Stan Chapman and Hall, 2015
This function returns the "typical" value of a variable.
- -typical_value(x, fun = "mean", weights = NULL, ...)- -
x | -A variable. |
fun | -Character vector, naming the function to be applied to
- |
weights | -Name of variable in |
... | -Further arguments, passed down to |
The "typical" value of x
By default, for numeric variables, typical_value()
returns the
- mean value of x
(unless changed with the fun
- For factors, the reference level is returned or the most common value
- (if fun = "mode"
), unless fun
is a named vector. If
- fun
is a named vector, specify the function for numeric
- and categorical variables as element names, e.g.
- fun = c(numeric = "median", factor = "mean")
. In this case,
- factors are converted to numeric values (using to_value
- and the related function is applied. You may abbreviate the names
- fun = c(n = "median", f = "mean")
. See also 'Examples'.
- For character vectors the most common value (mode) is returned.
-#> [1] 5.843333#> $Sepal.Length -#> [1] 5.843333 -#> -#> $Sepal.Width -#> [1] 3.057333 -#> -#> $Petal.Length -#> [1] 3.758 -#> -#> $Petal.Width -#> [1] 1.199333 -#> -#> $Species -#> [1] "setosa" -#>-# example from ?stats::weighted.mean -wt <- c(5, 5, 4, 1) / 15 -x <- c(3.7, 3.3, 3.5, 2.8) - -typical_value(x, fun = "weighted.mean")#> [1] 3.325typical_value(x, fun = "weighted.mean", weights = wt)#> [1] 3.453333-# for factors, return either reference level or mode value -set.seed(123) -x <- sample(iris$Species, size = 30, replace = TRUE) -typical_value(x)#> [1] "setosa"typical_value(x, fun = "mode")#> [1] "virginica"-# for factors, use a named vector to apply other functions than "mode" -map(iris, ~ typical_value(.x, fun = c(n = "median", f = "mean")))#> $Sepal.Length -#> [1] 5.8 -#> -#> $Sepal.Width -#> [1] 3 -#> -#> $Petal.Length -#> [1] 4.35 -#> -#> $Petal.Width -#> [1] 1.3 -#> -#> $Species -#> [1] 2 -#>- -
Calculate the population variance or standard deviation of a vector.
-var_pop(x) - -sd_pop(x)- -
x | -(Numeric) vector. |
The population variance or standard deviation of x
Unlike var
, which returns the sample variance,
- var_pop()
returns the population variance. sd_pop()
- returns the standard deviation based on the population variance.
-#> [1] 2581.152# population variance -var_pop(efc$c12hour)#> [1] 2578.291#> [1] 50.80504# population sd -sd_pop(efc$c12hour)#> [1] 50.77687-
These functions weight the variable x
- a specific vector of weights
weight(x, weights, digits = 0) - -weight2(x, weights)- -
x | -(Unweighted) variable. |
weights | -Vector with same length as |
digits | -Numeric value indicating the number of decimal places to be
-used for rounding the weighted values. By default, this value is
- |
The weighted x
sums up all weights
values of the associated
- categories of x
, whereas weight()
uses a
- xtabs
formula to weight cases. Thus, weight()
- may return a vector of different length than x
The values of the returned vector are in sorted order, whereas the values'
- order of the original x
may be spread randomly. Hence, x
can't be
- used, for instance, for further cross tabulation. In case you want to have
- weighted contingency tables or (grouped) box plots etc., use the weightBy
- argument of most functions.
-#> v -#> 1 2 3 4 -#> 3 6 5 6#> -#> 1 2 3 4 -#> 2 5 5 3#> -#> 1 2 3 4 -#> 2 5 5 3#> x -#> a b c d e -#> 6 4 3 1 6#> -#> a b c e -#> 3 3 2 3-
, R/wtd_chisqtest.R
Weighted statistics for variables
- weighted_sd()
, weighted_se()
, weighted_mean()
and weighted_median()
- compute weighted standard deviation, standard error, mean or median for a
- variable or for all variables of a data frame. survey_median()
computes the
- median for a variable in a survey-design (see svydesign
- weighted_correlation()
computes a weighted correlation for a two-sided alternative
- hypothesis.
- Weighted tests
- weighted_ttest()
computes a weighted t-test, while weighted_mannwhitney()
- computes a weighted Mann-Whitney-U test or a Kruskal-Wallis test
- (for more than two groups). weighted_chisqtest()
computes a weighted
- Chi-squared test for contingency tables.
survey_median(x, design) - -weighted_chisqtest(data, ...) - -# S3 method for default -weighted_chisqtest(data, x, y, weights, ...) - -# S3 method for formula -weighted_chisqtest(formula, data, ...) - -weighted_correlation(data, ...) - -# S3 method for default -weighted_correlation(data, x, y, weights, ci.lvl = 0.95, ...) - -# S3 method for formula -weighted_correlation(formula, data, ci.lvl = 0.95, ...) - -weighted_mean(x, weights = NULL) - -weighted_median(x, weights = NULL) - -weighted_mannwhitney(data, ...) - -# S3 method for default -weighted_mannwhitney(data, x, grp, weights, ...) - -# S3 method for formula -weighted_mannwhitney(formula, data, ...) - -weighted_sd(x, weights = NULL) - -wtd_sd(x, weights = NULL) - -weighted_se(x, weights = NULL) - -weighted_ttest(data, ...) - -# S3 method for default -weighted_ttest( - data, - x, - y = NULL, - weights, - mu = 0, - paired = FALSE, - ci.lvl = 0.95, - alternative = c("two.sided", "less", "greater"), - ... -) - -# S3 method for formula -weighted_ttest( - formula, - data, - mu = 0, - paired = FALSE, - ci.lvl = 0.95, - alternative = c("two.sided", "less", "greater"), - ... -)- -
x | -(Numeric) vector or a data frame. For |
design | -An object of class |
data | -A data frame. |
... | -For |
y | -Optional, bare (unquoted) variable name, or a character vector with -the variable name. |
weights | -Bare (unquoted) variable name, or a character vector with
-the variable name of the numeric vector of weights. If |
formula | -A formula of the form |
ci.lvl | -Confidence level of the interval. |
grp | -Bare (unquoted) name of the cross-classifying variable, where
- |
mu | -A number indicating the true value of the mean (or difference in -means if you are performing a two sample test). |
paired | -Logical, whether to compute a paired t-test. |
alternative | -A character string specifying the alternative hypothesis,
-must be one of |
The weighted (test) statistic.
is a convenient wrapper for crosstable_statistics
- For a weighted one-way Anova, use means_by_group()
- weights
- weighted_ttest()
assumes unequal variance between the two groups.
-#> [1] 0.8498705#> c12hour e15relat e16sex -#> 51.7876181 2.0540843 0.4699551#> c12hour e15relat e16sex -#> 1.66065784 0.06942749 0.01562877-# survey_median ---- - -# median for variables from weighted survey designs -if (require("survey")) { - data(nhanes_sample) - - des <- svydesign( - id = ~SDMVPSU, - strat = ~SDMVSTRA, - weights = ~WTINT2YR, - nest = TRUE, - data = nhanes_sample - ) - - survey_median(total, des) - survey_median("total", des) -}#> [1] 6-# weighted t-test ---- - -efc$weight <- abs(rnorm(nrow(efc), 1, .3)) -weighted_ttest(efc, e17age, weights = weight)#> -#> One Sample t-test (two.sided) -#> # t=292.68 df=890 p-value=0.000 -#> -#> mean of e17age: 79.189 [78.658, 79.720] -#>weighted_ttest(efc, e17age, c160age, weights = weight)#> -#> Two-Sample t-test (two.sided) -#> -#> # comparison between e17age and c160age -#> # t=49.92 df=1469 p-value=0.000 -#> -#> mean of e17age : 79.187 -#> mean of c160age : 53.208 -#> difference of mean: 25.980 [24.959 27.001] -#>weighted_ttest(e17age ~ e16sex + weight, efc)#> -#> Two-Sample t-test (two.sided) -#> -#> # comparison of e17age by e16sex -#> # t=-7.46 df=604 p-value=0.000 -#> -#> mean in group [1] male : 76.401 -#> mean in group [2] female: 80.518 -#> difference of mean : -4.117 [-5.201 -3.034] -#>-# weighted Mann-Whitney-U-test ---- - -weighted_mannwhitney(c12hour ~ c161sex + weight, efc)#> -#> Weighted Mann-Whitney-U test (two.sided) -#> -#> # comparison of c12hour by c161sex -#> # Chisq=3.26 df=899 p-value=0.001 -#> -#> difference in mean rank score: 0.075 -#>-# weighted Chi-squared-test ---- - -weighted_chisqtest(efc, c161sex, e16sex, weights = weight, correct = FALSE)#> -#> # Measure of Association for Contingency Tables -#> -#> Chi-squared: 2.0566 -#> Phi: 0.0479 -#> p-value: 0.1515weighted_chisqtest(c172code ~ c161sex + weight, efc)#> -#> # Measure of Association for Contingency Tables -#> -#> Chi-squared: 4.8005 -#> Cramer's V: 0.0758 -#> p-value: 0.0907
Weighted statistics for variables
- wtd_sd()
, wtd_se()
, wtd_mean()
and wtd_median()
- compute weighted standard deviation, standard error, mean or median for a
- variable or for all variables of a data frame. svy_md()
computes the
- median for a variable in a survey-design (see svydesign
- wtd_cor()
computes a weighted correlation for a two-sided alternative
- hypothesis.
- Weighted tests
- wtd_ttest()
computes a weighted t-test, while wtd_mwu()
- computes a weighted Mann-Whitney-U test or a Kruskal-Wallis test
- (for more than two groups). wtd_chisqtest()
computes a weighted
- Chi-squared test for contingency tables.
svy_md(x, design) - -survey_median(x, design) - -wtd_chisqtest(data, ...) - -# S3 method for default -wtd_chisqtest(data, x, y, weights, ...) - -# S3 method for formula -wtd_chisqtest(formula, data, ...) - -wtd_cor(data, ...) - -# S3 method for default -wtd_cor(data, x, y, weights, ci.lvl = 0.95, ...) - -# S3 method for formula -wtd_cor(formula, data, ci.lvl = 0.95, ...) - -wtd_mean(x, weights = NULL) - -wtd_median(x, weights = NULL) - -wtd_mwu(data, ...) - -# S3 method for default -wtd_mwu(data, x, grp, weights, ...) - -# S3 method for formula -wtd_mwu(formula, data, ...) - -wtd_sd(x, weights = NULL) - -wtd_se(x, weights = NULL) - -wtd_ttest(data, ...) - -# S3 method for default -wtd_ttest( - data, - x, - y = NULL, - weights, - mu = 0, - paired = FALSE, - ci.lvl = 0.95, - alternative = c("two.sided", "less", "greater"), - ... -) - -# S3 method for formula -wtd_ttest( - formula, - data, - mu = 0, - paired = FALSE, - ci.lvl = 0.95, - alternative = c("two.sided", "less", "greater"), - ... -)- -
x | -(Numeric) vector or a data frame. For |
design | -An object of class |
data | -A data frame. |
... | -For |
y | -Optional, bare (unquoted) variable name, or a character vector with -the variable name. |
weights | -Bare (unquoted) variable name, or a character vector with
-the variable name of the numeric vector of weights. If |
formula | -A formula of the form |
ci.lvl | -Confidence level of the interval. |
grp | -Bare (unquoted) name of the cross-classifying variable, where
- |
mu | -A number indicating the true value of the mean (or difference in -means if you are performing a two sample test). |
paired | -Logical, whether to compute a paired t-test. |
alternative | -A character string specifying the alternative hypothesis,
-must be one of |
The weighted (test) statistic.
is a convenient wrapper for xtab_statistics
- For a weighted one-way Anova, use grpmean()
- weights
- wtd_ttest()
assumes unequal variance between the two groups.
-#> [1] 0.8498705#> c12hour e15relat e16sex -#> 51.7876181 2.0540843 0.4699551#> c12hour e15relat e16sex -#> 1.66065784 0.06942749 0.01562877-# svy_md ---- - -# median for variables from weighted survey designs -library(survey) -data(nhanes_sample) - -des <- svydesign( - id = ~SDMVPSU, - strat = ~SDMVSTRA, - weights = ~WTINT2YR, - nest = TRUE, - data = nhanes_sample -) - -svy_md(total, des)#> [1] 6svy_md("total", des)#> [1] 6-# weighted t-test ---- - -efc$weight <- abs(rnorm(nrow(efc), 1, .3)) -wtd_ttest(efc, e17age, weights = weight)#> -#> One Sample t-test (two.sided) -#> # t=292.68 df=890 p-value=0.000 -#> -#> mean of e17age: 79.189 [78.658, 79.720] -#>wtd_ttest(efc, e17age, c160age, weights = weight)#> -#> Two-Sample t-test (two.sided) -#> -#> # comparison between e17age and c160age -#> # t=49.92 df=1469 p-value=0.000 -#> -#> mean of e17age : 79.187 -#> mean of c160age : 53.208 -#> difference of mean: 25.980 [24.959 27.001] -#>wtd_ttest(e17age ~ e16sex + weight, efc)#> -#> Two-Sample t-test (two.sided) -#> -#> # comparison of e17age by e16sex -#> # t=-7.46 df=604 p-value=0.000 -#> -#> mean in group [1] male : 76.401 -#> mean in group [2] female: 80.518 -#> difference of mean : -4.117 [-5.201 -3.034] -#>-# weighted Mann-Whitney-U-test ---- - -wtd_mwu(c12hour ~ c161sex + weight, efc)#> -#> Weighted Mann-Whitney-U test (two.sided) -#> -#> # comparison of c12hour by c161sex -#> # Chisq=3.26 df=899 p-value=0.001 -#> -#> difference in mean rank score: 0.075 -#>-# weighted Chi-squared-test ---- - -wtd_chisqtest(efc, c161sex, e16sex, weights = weight, correct = FALSE)#> -#> # Measure of Association for Contingency Tables -#> -#> Chi-squared: 2.0566 -#> Phi: 0.0479 -#> p-value: 0.1515wtd_chisqtest(c172code ~ c161sex + weight, efc)#> -#> # Measure of Association for Contingency Tables -#> -#> Chi-squared: 4.8005 -#> Cramer's V: 0.0758 -#> p-value: 0.0907-
This function calculates various measures of association for - contingency tables and returns the statistic and p-value. - Supported measures are Cramer's V, Phi, Spearman's rho, - Kendall's tau and Pearson's r.
-cramer(tab, ...) - -# S3 method for formula -cramer( - formula, - data, - ci.lvl = NULL, - n = 1000, - method = c("dist", "quantile"), - ... -) - -phi(tab, ...) - -xtab_statistics( - data, - x1 = NULL, - x2 = NULL, - statistics = c("auto", "cramer", "phi", "spearman", "kendall", "pearson", "fisher"), - weights = NULL, - ... -) - -crosstable_statistics( - data, - x1 = NULL, - x2 = NULL, - statistics = c("auto", "cramer", "phi", "spearman", "kendall", "pearson", "fisher"), - weights = NULL, - ... -)- -
tab | -A |
... | -Other arguments, passed down to the statistic functions
- |
formula | -A formula of the form |
data | -A data frame or a table object. If a table object, |
ci.lvl | -Scalar between 0 and 1. If not |
n | -Number of bootstraps to be generated. |
method | -Character vector, indicating if confidence intervals should be
-based on bootstrap standard error, multiplied by the value of the
-quantile function of the t-distribution (default), or on sample
-quantiles of the bootstrapped values. See 'Details' in |
x1 | -Name of first variable that should be used to compute the
-contingency table. If |
x2 | -Name of second variable that should be used to compute the
-contingency table. If |
statistics | -Name of measure of association that should be computed. May
-be one of |
weights | -Name of variable in |
For phi()
, the table's Phi value. For cramer()
, the
- table's Cramer's V.
- For crosstable_statistics()
, a list with the following components:
the value of the estimated measure of association.
the p-value for the test.
the value of the test statistic.
the name of the test statistic.
if applicable, the name of the test statistic, in HTML-format.
the degrees of freedom for the contingency table.
character string indicating the name of the measure of association.
if applicable, the name of the measure of association, in HTML-format.
the short form of association measure, equals the statistics
logical, if Fisher's exact test was used to calculate the p-value.
The p-values for Cramer's V and the Phi coefficient are based
- on chisq.test()
. If any expected value of a table cell is
- smaller than 5, or smaller than 10 and the df is 1, then fisher.test()
- is used to compute the p-value, unless statistics = "fisher"
; in
- this case, the use of fisher.test()
is forced to compute the
- p-value. The test statistic is calculated with cramer()
- phi()
- Both test statistic and p-value for Spearman's rho, Kendall's tau
- and Pearson's r are calculated with cor.test()
- When statistics = "auto"
, only Cramer's V or Phi are calculated,
- based on the dimension of the table (i.e. if the table has more than
- two rows or columns, Cramer's V is calculated, else Phi).
-# Phi coefficient for 2x2 tables -tab <- table(sample(1:2, 30, TRUE), sample(1:2, 30, TRUE)) -phi(tab)#> [1] 0.1443376-# Cramer's V for nominal variables with more than 2 categories -tab <- table(sample(1:2, 30, TRUE), sample(1:3, 30, TRUE)) -cramer(tab)#> [1] 0.3795188#> [1] 0.05258249-# bootstrapped confidence intervals -cramer(e16sex ~ c161sex, data = efc, ci.lvl = .95, n = 100)#> cramer conf.low conf.high -#> 1 0.05258249 -0.005303236 0.1088931-# 2x2 table, compute Phi automatically -crosstable_statistics(efc, e16sex, c161sex)#> -#> # Measure of Association for Contingency Tables -#> -#> Chi-squared: 2.2327 -#> Phi: 0.0526 -#> p-value: 0.1351-# more dimensions than 2x2, compute Cramer's V automatically -crosstable_statistics(efc, c172code, c161sex)#> -#> # Measure of Association for Contingency Tables -#> -#> Chi-squared: 4.1085 -#> Cramer's V: 0.0699 -#> p-value: 0.1282-# ordinal data, use Kendall's tau -crosstable_statistics(efc, e42dep, quol_5, statistics = "kendall")#> -#> # Measure of Association for Contingency Tables -#> -#> z: -9.5951 -#> Kendall's tau: -0.2496 -#> p-value: <0.001-# calcilate Spearman's rho, with continuity correction -crosstable_statistics(efc, - e42dep, - quol_5, - statistics = "spearman", - exact = FALSE, - continuity = TRUE -)#> -#> # Measure of Association for Contingency Tables -#> -#> S: 157974157.4198 -#> Spearman's rho: -0.3177 -#> p-value: <0.001-