diff --git a/.Rbuildignore b/.Rbuildignore index 96c5763c..d124fd41 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -9,3 +9,10 @@ ^pkgdown$ ^.github$ \.code-workspace$ +^README\.rmd +^\.git$ +^WIP$ +\.lintr$ +^CRAN-SUBMISSION$ +^cran-comments\.md$ +^LICENSE\.md$ diff --git a/.github/workflows/R-check.yaml b/.github/workflows/R-check.yaml index 93fd9037..7c504b9b 100644 --- a/.github/workflows/R-check.yaml +++ b/.github/workflows/R-check.yaml @@ -32,8 +32,6 @@ jobs: ## This is because they are already run in `R-CMD-check-strict` workflow. ## ##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - { os: macOS-latest, r: "release" } - - { os: windows-latest, r: "devel" } - { os: windows-latest, r: "release" } diff --git a/DESCRIPTION b/DESCRIPTION index e25170ae..358379d0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -20,33 +20,21 @@ Depends: Imports: bayestestR, datawizard, - dplyr, - effectsize, + effectsize (>= 0.8.8), insight, - lme4, - magrittr, - MASS, - modelr, parameters, performance, - purrr, - rlang, - sjlabelled, - sjmisc, - stats, - tidyr + stats Suggests: brms, - broom, car, coin, ggplot2, - graphics, + lme4, + MASS, pscl, pwr, - sjPlot, survey, - rstan, testthat URL: https://strengejacke.github.io/sjstats/ BugReports: https://github.com/strengejacke/sjstats/issues diff --git a/NAMESPACE b/NAMESPACE index 5a224af0..e410b8f3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,52 +12,31 @@ S3method(formula,svyglm.nb) S3method(formula,svyglm.zip) S3method(model.frame,svyglm.nb) S3method(model.frame,svyglm.zip) -S3method(model.matrix,gls) S3method(phi,formula) S3method(phi,ftable) S3method(phi,table) S3method(plot,sj_inequ_trend) S3method(predict,svyglm.nb) S3method(print,sj_anova_stat) -S3method(print,sj_check_assump) S3method(print,sj_chi2gof) -S3method(print,sj_grpmean) -S3method(print,sj_grpmeans) S3method(print,sj_htest_chi) S3method(print,sj_htest_kw) S3method(print,sj_htest_mwu) -S3method(print,sj_mwu) -S3method(print,sj_outliers) -S3method(print,sj_pval) +S3method(print,sj_htest_t) +S3method(print,sj_htest_wilcox) S3method(print,sj_resample) S3method(print,sj_ttest) S3method(print,sj_wcor) -S3method(print,sj_wmwu) S3method(print,sj_xtab_stat) -S3method(print,sj_xtab_stat2) S3method(print,svyglm.nb) S3method(print,svyglm.zip) -S3method(print,tidy_stan) S3method(residuals,svyglm.nb) -S3method(summary,sj_pval) S3method(terms,svyglm.nb) S3method(weighted_correlation,default) S3method(weighted_correlation,formula) -S3method(weighted_mannwhitney,default) -S3method(weighted_mannwhitney,formula) -S3method(weighted_mean,data.frame) -S3method(weighted_mean,default) -S3method(weighted_median,data.frame) -S3method(weighted_median,default) -S3method(weighted_sd,data.frame) -S3method(weighted_sd,default) -S3method(weighted_sd,matrix) S3method(weighted_se,data.frame) S3method(weighted_se,default) S3method(weighted_se,matrix) -S3method(weighted_ttest,default) -S3method(weighted_ttest,formula) -export("%>%") export(anova_stats) export(auto_prior) export(boot_ci) @@ -110,94 +89,24 @@ export(smpsize_lmm) export(survey_median) export(svyglm.nb) export(svyglm.zip) +export(t_test) export(table_values) -export(typical_value) export(var_pop) export(weight) export(weight2) export(weighted_correlation) -export(weighted_mannwhitney) export(weighted_mean) export(weighted_median) -export(weighted_ranktest) export(weighted_sd) export(weighted_se) -export(weighted_ttest) -export(wtd_sd) +export(wilcoxon_test) export(xtab_statistics) -importFrom(MASS,glm.nb) importFrom(bayestestR,ci) 
importFrom(bayestestR,equivalence_test) -importFrom(dplyr,case_when) -importFrom(dplyr,filter) -importFrom(dplyr,group_vars) -importFrom(dplyr,mutate) -importFrom(dplyr,quos) -importFrom(dplyr,select) -importFrom(dplyr,select_if) -importFrom(dplyr,summarise) -importFrom(insight,export_table) -importFrom(insight,find_formula) -importFrom(insight,find_response) -importFrom(insight,format_p) -importFrom(insight,format_value) -importFrom(insight,get_data) -importFrom(insight,get_response) +importFrom(datawizard,weighted_mean) +importFrom(datawizard,weighted_median) +importFrom(datawizard,weighted_sd) importFrom(insight,link_inverse) -importFrom(insight,print_color) -importFrom(lme4,ngrps) -importFrom(magrittr,"%>%") -importFrom(modelr,crossv_kfold) importFrom(performance,mse) importFrom(performance,rmse) -importFrom(purrr,flatten_df) -importFrom(purrr,map) -importFrom(purrr,map2) -importFrom(purrr,map_dbl) -importFrom(purrr,map_df) -importFrom(purrr,map_lgl) -importFrom(purrr,walk) -importFrom(rlang,.data) -importFrom(rlang,enquo) -importFrom(rlang,quo_name) -importFrom(sjlabelled,as_numeric) -importFrom(sjmisc,is_empty) -importFrom(sjmisc,is_float) -importFrom(sjmisc,is_num_fac) -importFrom(sjmisc,str_contains) -importFrom(sjmisc,trim) -importFrom(sjmisc,typical_value) -importFrom(stats,as.formula) -importFrom(stats,chisq.test) -importFrom(stats,coef) -importFrom(stats,complete.cases) -importFrom(stats,dpois) importFrom(stats,family) -importFrom(stats,fitted) -importFrom(stats,formula) -importFrom(stats,kruskal.test) -importFrom(stats,lm) -importFrom(stats,model.frame) -importFrom(stats,model.matrix) -importFrom(stats,na.omit) -importFrom(stats,na.pass) -importFrom(stats,nobs) -importFrom(stats,pf) -importFrom(stats,pnorm) -importFrom(stats,predict.glm) -importFrom(stats,pt) -importFrom(stats,qf) -importFrom(stats,qnorm) -importFrom(stats,resid) -importFrom(stats,sd) -importFrom(stats,setNames) -importFrom(stats,terms) -importFrom(stats,update) -importFrom(stats,var) -importFrom(stats,vcov) -importFrom(stats,weighted.mean) -importFrom(stats,weights) -importFrom(stats,xtabs) -importFrom(tidyr,gather) -importFrom(tidyr,nest) -importFrom(tidyr,unnest) diff --git a/R/S3-methods.R b/R/S3-methods.R index 434b700a..568d22a6 100644 --- a/R/S3-methods.R +++ b/R/S3-methods.R @@ -1,24 +1,16 @@ -#' @importFrom stats model.matrix -#' @importFrom insight get_data -#' @export -model.matrix.gls <- function(object, ...) { - stats::model.matrix(object, data = insight::get_data(object)) -} - - -#' @importFrom stats coef vcov pnorm -#' @importFrom dplyr case_when #' @export print.svyglm.nb <- function(x, se = c("robust", "model"), digits = 4, ...) { se <- match.arg(se) sm <- tidy_svyglm.nb(x, digits, v_se = se)[-1, -2] - pan <- dplyr::case_when( - sm$p.value < 0.001 ~ "<0.001 ***", - sm$p.value < 0.01 ~ sprintf("%.*f ** ", digits, sm$p.value), - sm$p.value < 0.05 ~ sprintf("%.*f * ", digits, sm$p.value), - sm$p.value < 0.1 ~ sprintf("%.*f . ", digits, sm$p.value), - TRUE ~ sprintf("%.*f ", digits, sm$p.value) + pan <- ifelse(sm$p.value < 0.001, "<0.001 ***", + ifelse(sm$p.value < 0.01, sprintf("%.*f ** ", digits, sm$p.value), # nolint + ifelse(sm$p.value < 0.05, sprintf("%.*f * ", digits, sm$p.value), # nolint + ifelse(sm$p.value < 0.1, sprintf("%.*f . ", digits, sm$p.value), # nolint + sprintf("%.*f ", digits, sm$p.value) + ) + ) + ) ) sm$p.value <- pan @@ -32,20 +24,19 @@ print.svyglm.nb <- function(x, se = c("robust", "model"), digits = 4, ...) 
{ } - -#' @importFrom stats coef vcov pnorm -#' @importFrom dplyr case_when #' @export print.svyglm.zip <- function(x, se = c("robust", "model"), digits = 4, ...) { se <- match.arg(se) sm <- tidy_svyglm.zip(x, digits, v_se = se)[-1, ] - pan <- dplyr::case_when( - sm$p.value < 0.001 ~ "<0.001 ***", - sm$p.value < 0.01 ~ sprintf("%.*f ** ", digits, sm$p.value), - sm$p.value < 0.05 ~ sprintf("%.*f * ", digits, sm$p.value), - sm$p.value < 0.1 ~ sprintf("%.*f . ", digits, sm$p.value), - TRUE ~ sprintf("%.*f ", digits, sm$p.value) + pan <- ifelse(sm$p.value < 0.001, "<0.001 ***", + ifelse(sm$p.value < 0.01, sprintf("%.*f ** ", digits, sm$p.value), # nolint + ifelse(sm$p.value < 0.05, sprintf("%.*f * ", digits, sm$p.value), # nolint + ifelse(sm$p.value < 0.1, sprintf("%.*f . ", digits, sm$p.value), # nolint + sprintf("%.*f ", digits, sm$p.value) + ) + ) + ) ) sm$p.value <- pan @@ -55,8 +46,6 @@ print.svyglm.zip <- function(x, se = c("robust", "model"), digits = 4, ...) { } - -#' @importFrom stats qnorm coef pnorm vcov tidy_svyglm.nb <- function(x, digits = 4, v_se = c("robust", "model")) { v_se <- match.arg(v_se) @@ -72,15 +61,13 @@ tidy_svyglm.nb <- function(x, digits = 4, v_se = c("robust", "model")) { estimate = round(est, digits), irr = round(exp(est), digits), std.error = round(se, digits), - conf.low = round(exp(est - stats::qnorm(.975) * se), digits), - conf.high = round(exp(est + stats::qnorm(.975) * se), digits), + conf.low = round(exp(est - stats::qnorm(0.975) * se), digits), + conf.high = round(exp(est + stats::qnorm(0.975) * se), digits), p.value = round(2 * stats::pnorm(abs(est / se), lower.tail = FALSE), digits) ) } - -#' @importFrom stats qnorm coef pnorm vcov tidy_svyglm.zip <- function(x, digits = 4, v_se = c("robust", "model")) { v_se <- match.arg(v_se) @@ -95,63 +82,52 @@ tidy_svyglm.zip <- function(x, digits = 4, v_se = c("robust", "model")) { term = substring(names(stats::coef(x)), 5), estimate = round(est, digits), std.error = round(se, digits), - conf.low = round(exp(est - stats::qnorm(.975) * se), digits), - conf.high = round(exp(est + stats::qnorm(.975) * se), digits), + conf.low = round(exp(est - stats::qnorm(0.975) * se), digits), + conf.high = round(exp(est + stats::qnorm(0.975) * se), digits), p.value = round(2 * stats::pnorm(abs(est / se), lower.tail = FALSE), digits) ) } - -#' @importFrom dplyr select #' @export model.frame.svyglm.nb <- function(formula, ...) { - pred <- attr(formula, "nb.terms", exact = T) - dplyr::select(formula$design$variables, string_one_of(pattern = pred, x = colnames(formula$design$variables))) + pred <- attr(formula, "nb.terms", exact = TRUE) + formula$design$variables[intersect(pred, colnames(formula$design$variables))] } - -#' @importFrom dplyr select #' @export model.frame.svyglm.zip <- function(formula, ...) { - pred <- attr(formula, "zip.terms", exact = T) - dplyr::select(formula$design$variables, string_one_of(pattern = pred, x = colnames(formula$design$variables))) + pred <- attr(formula, "zip.terms", exact = TRUE) + formula$design$variables[intersect(pred, colnames(formula$design$variables))] } - +#' @importFrom stats family #' @export family.svyglm.nb <- function(object, ...) { attr(object, "family", exact = TRUE) } - #' @export formula.svyglm.nb <- function(x, ...) { attr(x, "nb.formula", exact = TRUE) } - #' @export formula.svyglm.zip <- function(x, ...) 
{ attr(x, "zip.formula", exact = TRUE) } - -#' @importFrom MASS glm.nb -#' @importFrom stats coef setNames predict.glm #' @export predict.svyglm.nb <- function(object, newdata = NULL, type = c("link", "response", "terms"), se.fit = FALSE, dispersion = NULL, terms = NULL, - na.action = na.pass, ...) { - - if (!isNamespaceLoaded("survey")) - requireNamespace("survey", quietly = TRUE) + na.action = stats::na.pass, ...) { + insight::check_if_installed(c("survey", "MASS")) fnb <- MASS::glm.nb( attr(object, "nb.formula", exact = TRUE), @@ -178,9 +154,6 @@ predict.svyglm.nb <- function(object, newdata = NULL, } -#' @importFrom MASS glm.nb -#' @importFrom stats coef setNames predict.glm -#' @importFrom insight get_response #' @export residuals.svyglm.nb <- function(object, ...) { @@ -201,7 +174,6 @@ residuals.svyglm.nb <- function(object, ...) { } -#' @importFrom stats terms formula #' @export terms.svyglm.nb <- function(x, ...) { @@ -212,14 +184,11 @@ terms.svyglm.nb <- function(x, ...) { } -#' @importFrom purrr map flatten_df #' @export AIC.svyglm.nb <- function(object, ...) { ## FIXME this one just returns the AIC of the underlying glm.nb() model - list(object, ...) %>% - purrr::map(~ getaic(.x)) %>% - purrr::flatten_df() %>% - as.data.frame() + aics <- lapply(list(object, ...), getaic) + as.data.frame(do.call(rbind, aics)) } @@ -235,53 +204,18 @@ deviance.svyglm.nb <- function(object, ...) { } -#' @importFrom insight print_color -#' @export -print.tidy_stan <- function(x, ...) { - insight::print_color("\nSummary Statistics of Stan-Model\n\n", "blue") - digits <- attr(x, "digits") - - for (i in x) { - insight::print_color(paste0("# ", attr(i, "main_title")), "blue") - cat(" ") - insight::print_color(attr(i, "sub_title"), "red") - cat("\n\n") - rem <- which(colnames(i) %in% c("Parameter", "Group", "Response", "Function")) - i <- i[, -rem] - colnames(i)[1] <- "Parameter" - i$ESS <- as.character(i$ESS) - i$pd <- sprintf("%.1f%%", 100 * i$pd) - i[] <- lapply(i, function(.j) { - if (is.numeric(.j)) .j <- sprintf("%.*f", digits, .j) - .j - }) - print.data.frame(i, quote = FALSE, row.names = FALSE) - cat("\n\n") - } -} - - -#' @importFrom sjmisc trim -clean_term_name <- function(x) { - x <- sjmisc::trim(x) - format(x, width = max(nchar(x))) -} - - #' @export as.integer.sj_resample <- function(x, ...) { x$id } - #' @export as.data.frame.sj_resample <- function(x, ...) { x$data[x$id, , drop = FALSE] } - #' @export print.sj_resample <- function(x, ...) { n <- length(x$id) @@ -290,20 +224,20 @@ print.sj_resample <- function(x, ...) { else id10 <- x$id - cat("<", paste0("id's of resample [", prettyNum(nrow(x$data), big.mark = ","), " x ", - prettyNum(ncol(x$data), big.mark = ","), "]"), "> ", - paste(id10, collapse = ", "), "\n", sep = "") + cat("<", paste0( + "id's of resample [", prettyNum(nrow(x$data), big.mark = ","), " x ", + prettyNum(ncol(x$data), big.mark = ","), "]" + ), "> ", + toString(id10), "\n", + sep = "" + ) } - -#' @importFrom tidyr gather -#' @importFrom rlang .data #' @export plot.sj_inequ_trend <- function(x, ...) { - if (!requireNamespace("ggplot2", quietly = TRUE)) { - stop("Package `ggplot2` required for plotting inequalities trends.", call. = F) - } + .data <- NULL + insight::check_if_installed("ggplot2") # add time indicator x$data$zeit <- seq_len(nrow(x$data)) @@ -312,14 +246,11 @@ plot.sj_inequ_trend <- function(x, ...) 
{

  gather.cols1 <- colnames(x$data)[!colnames(x$data) %in% c("zeit", "lo", "hi")]
  gather.cols2 <- colnames(x$data)[!colnames(x$data) %in% c("zeit", "rr", "rd")]

-  key_col <- "grp"
-  value_col <- "y"
-
  # gather data to plot rr and rd
-  dat1 <- tidyr::gather(x$data, !! key_col, !! value_col, !! gather.cols1)
+  dat1 <- datawizard::data_to_long(x$data, select = gather.cols1, names_to = "grp", values_to = "y")

  # gather data for raw prevalences
-  dat2 <- tidyr::gather(x$data, !! key_col, !! value_col, !! gather.cols2)
+  dat2 <- datawizard::data_to_long(x$data, select = gather.cols2, names_to = "grp", values_to = "y")

  # Proper value names, for facet labels
  dat1$grp[dat1$grp == "rr"] <- "Rate Ratios"
@@ -327,7 +258,7 @@ plot.sj_inequ_trend <- function(x, ...) {

  # plot prevalences
  gp1 <- ggplot2::ggplot(dat2, ggplot2::aes_string(x = "zeit", y = "y", colour = "grp")) +
-    ggplot2::geom_smooth(method = "loess", se = F) +
+    ggplot2::geom_smooth(method = "loess", se = FALSE) +
    ggplot2::labs(title = "Prevalance Rates for Lower and Higher SES Groups",
                  y = "Prevalances", x = "Time", colour = "") +
    ggplot2::scale_color_manual(values = c("darkblue", "darkred"), labels = c("High SES", "Low SES"))
@@ -335,90 +266,17 @@ plot.sj_inequ_trend <- function(x, ...) {

  # plot rr and rd
  gp2 <- ggplot2::ggplot(dat1, ggplot2::aes_string(x = "zeit", y = "y", colour = "grp")) +
-    ggplot2::geom_smooth(method = "loess", se = F) +
+    ggplot2::geom_smooth(method = "loess", se = FALSE) +
    ggplot2::facet_wrap(~grp, ncol = 1, scales = "free") +
    ggplot2::labs(title = "Proportional Change in Rate Ratios and Rate Differences",
                  colour = NULL, y = NULL, x = "Time") +
-    ggplot2::guides(colour = FALSE)
+    ggplot2::guides(colour = "none")

  suppressMessages(graphics::plot(gp1))
  suppressMessages(graphics::plot(gp2))
}


-#' @importFrom stats kruskal.test na.omit
-#' @export
-print.sj_mwu <- function(x, ...) {
-  insight::print_color("\n# Mann-Whitney-U-Test\n\n", "blue")
-  # get data
-  .dat <- x$df
-  # print to console
-  for (i in seq_len(nrow(.dat))) {
-    # get value labels
-    l1 <- .dat[i, "grp1.label"]
-    l2 <- .dat[i, "grp2.label"]
-    # do we have value labels?
- if (!is.null(l1) && !is.na(l1) %% !is.null(l2) && !is.na(l2)) { - insight::print_color( - sprintf( - "Groups %i = %s (n = %i) | %i = %s (n = %i):\n", - .dat[i, "grp1"], - l1, - .dat[i, "grp1.n"], - .dat[i, "grp2"], - l2, - .dat[i, "grp2.n"] - ), "cyan" - ) - } else { - insight::print_color( - sprintf("Groups (%i|%i), n = %i/%i:\n", - .dat[i, "grp1"], .dat[i, "grp2"], - .dat[i, "grp1.n"], .dat[i, "grp2.n"]), - "cyan" - ) - } - - cat(sprintf( - " U = %.3f, W = %.3f, %s, Z = %.3f\n", - .dat[i, "u"], .dat[i, "w"], insight::format_p(.dat[i, "p"]), .dat[i, "z"] - )) - - string_es <- "effect-size r" - string_r <- sprintf("%.3f", .dat[i, "r"]) - string_group1 <- sprintf("rank-mean(%i)", .dat[i, "grp1"]) - string_group2 <- sprintf("rank-mean(%i)", .dat[i, "grp2"]) - string_rm1 <- sprintf("%.2f", .dat[i, "rank.mean.grp1"]) - string_rm2 <- sprintf("%.2f", .dat[i, "rank.mean.grp2"]) - - space1 <- max(nchar(c(string_es, string_group1, string_group2))) - space2 <- max(nchar(c(string_r, string_rm1, string_rm2))) - - cat( - sprintf(" %*s = %*s\n", space1, string_es, space2 + 1, string_r), - sprintf(" %*s = %*s\n", space1, string_group1, space2, string_rm1), - sprintf(" %*s = %*s\n\n", space1, string_group2, space2, string_rm2) - ) - } - - # if we have more than 2 groups, also perfom kruskal-wallis-test - if (length(unique(stats::na.omit(x$data$grp))) > 2) { - insight::print_color("# Kruskal-Wallis-Test\n\n", "blue") - kw <- stats::kruskal.test(x$data$dv, x$data$grp) - cat(sprintf("chi-squared = %.3f\n", kw$statistic)) - cat(sprintf("df = %i\n", kw$parameter)) - cat(paste(insight::format_p(kw$p.value, stars = TRUE), "\n")) - } -} - - -#' @export -print.sj_outliers <- function(x, ...) { - print(x$result, ...) -} - - -#' @importFrom insight format_p #' @export print.sj_xtab_stat <- function(x, ...) { # get length of method name, to align output @@ -442,105 +300,6 @@ print.sj_xtab_stat <- function(x, ...) { } - -#' @export -print.sj_xtab_stat2 <- function(x, ...) { - # get length of method name, to align output - l <- max(nchar(c(x$stat.name, "p-value", "Observations"))) - - # headline - insight::print_color(paste0("\n# ", x$method, "\n\n"), "blue") - - # print test statistic - cat(sprintf(" %*s: %.4f\n", l, x$stat.name, x$estimate)) - cat(sprintf(" %*s: %g\n", l, "df", x$df)) - cat(sprintf(" %*s: %s\n", l, "p-value", insight::format_p(x$p.value, stars = TRUE, name = NULL))) - cat(sprintf(" %*s: %g\n", l, "Observations", x$n_obs)) -} - - - -#' @export -print.sj_grpmean <- function(x, ...) { - cat("\n") - print_grpmean(x, digits = attributes(x)$digits, ...) -} - - -#' @importFrom insight export_table print_color format_value format_p -print_grpmean <- function(x, digits = NULL, ...) 
{ - # headline - insight::print_color(sprintf( - "# Grouped Means for %s by %s\n\n", - attr(x, "dv.label", exact = TRUE), - attr(x, "grp.label", exact = TRUE) - ), "blue") - - if (is.null(digits)) { - digits <- 2 - } - - x$mean <- insight::format_value(x$mean, digits = digits) - x$std.dev <- insight::format_value(x$std.dev, digits = digits) - x$std.error <- insight::format_value(x$std.error, digits = digits) - x$p.value <- insight::format_p(x$p.value, name = NULL) - - colnames(x) <- c("Category", "Mean", "N", "SD", "SE", "p") - cat(insight::export_table(x)) - - # statistics - cat(sprintf( - "\nAnova: R2=%.3f; adj.R2=%.3f; F=%.3f; p=%.3f\n", - attr(x, "r2", exact = TRUE), - attr(x, "adj.r2", exact = TRUE), - attr(x, "fstat", exact = TRUE), - attr(x, "p.value", exact = TRUE) - )) -} - - -#' @importFrom purrr walk -#' @export -print.sj_grpmeans <- function(x, ...) { - - cat("\n") - purrr::walk(x, function(dat) { - # get grouping title label - grp <- attr(dat, "group", exact = T) - - # print title for grouping - insight::print_color(sprintf("Grouped by:\n%s\n\n", grp), "cyan") - - # print grpmean-table - print_grpmean(dat, digits = attributes(x)$digits, ...) - - cat("\n\n") - }) -} - - -#' @export -print.sj_pval <- function(x, digits = 3, summary = FALSE, ...) { - - if (summary) { - df.kr <- attr(x, "df.kr", exact = TRUE) - t.kr <- attr(x, "t.kr", exact = TRUE) - - if (!is.null(df.kr)) x$df <- df.kr - if (!is.null(t.kr)) x$statistic <- t.kr - } - - x <- purrr::map_if(x, is.numeric, round, digits = digits) - print.data.frame(as.data.frame(x), ..., row.names = TRUE) -} - - -#' @export -summary.sj_pval <- function(object, digits = 3, summary = FALSE, ...) { - print(object, digits, summary = TRUE) -} - - #' @export print.sj_chi2gof <- function(x, ...) { insight::print_color("\n# Chi-squared Goodness-of-Fit Test\n\n", "blue") @@ -562,30 +321,6 @@ print.sj_chi2gof <- function(x, ...) { } -#' @export -print.sj_check_assump <- function(x, ...) { - insight::print_color("\n# Checking Model-Assumptions\n\n", "blue") - cat(sprintf(" Model: %s", attr(x, "formula", exact = TRUE))) - - insight::print_color("\n\n violated statistic\n", "red") - - v1 <- ifelse(x$heteroskedasticity < 0.05, "yes", "no") - v2 <- ifelse(x$multicollinearity > 4, "yes", "no") - v3 <- ifelse(x$non.normal.resid < 0.05, "yes", "no") - v4 <- ifelse(x$autocorrelation < 0.05, "yes", "no") - - s1 <- sprintf("p = %.3f", x$heteroskedasticity) - s2 <- sprintf("vif = %.3f", x$multicollinearity) - s3 <- sprintf("p = %.3f", x$non.normal.resid) - s4 <- sprintf("p = %.3f", x$autocorrelation) - - cat(sprintf(" Heteroskedasticity %8s %11s\n", v1, s1)) - cat(sprintf(" Non-normal residuals %8s %11s\n", v3, s3)) - cat(sprintf(" Autocorrelated residuals%8s %11s\n", v4, s4)) - cat(sprintf(" Multicollinearity %8s %11s\n", v2, s2)) -} - - #' @export print.sj_ttest <- function(x, ...) { insight::print_color(sprintf("\n%s (%s)\n", x$method, x$alternative), "blue") @@ -632,18 +367,6 @@ print.sj_ttest <- function(x, ...) { } -#' @export -print.sj_wmwu <- function(x, ...) { - insight::print_color(sprintf("\n# %s\n", x$method), "blue") - - group <- attr(x, "group.name", exact = TRUE) - xn <- attr(x, "x.name", exact = TRUE) - - insight::print_color(sprintf("\n comparison of %s by %s\n", xn, group), "cyan") - cat(sprintf(" Chisq=%.2f df=%i p-value=%.3f\n\n", x$statistic, as.integer(x$parameter), x$p.value)) -} - - #' @export print.sj_wcor <- function(x, ...) 
{ insight::print_color(sprintf("\nWeighted %s\n\n", x$method), "blue") @@ -659,7 +382,6 @@ print.sj_wcor <- function(x, ...) { } -#' @importFrom insight export_table #' @export print.sj_anova_stat <- function(x, digits = 3, ...) { x$p.value <- insight::format_p(x$p.value, name = NULL) diff --git a/R/anova_stats.R b/R/anova_stats.R index f7917687..d1e6e640 100644 --- a/R/anova_stats.R +++ b/R/anova_stats.R @@ -15,7 +15,7 @@ #' \cr \cr #' Tippey K, Longnecker MT (2016): An Ad Hoc Method for Computing Pseudo-Effect Size for Mixed Model. #' -#' @examples +#' @examplesIf requireNamespace("car") && requireNamespace("pwr") #' # load sample data #' data(efc) #' @@ -24,9 +24,7 @@ #' c12hour ~ as.factor(e42dep) + as.factor(c172code) + c160age, #' data = efc #' ) -#' \dontrun{ #' anova_stats(car::Anova(fit, type = 2)) -#' } #' @export anova_stats <- function(model, digits = 3) { insight::check_if_installed("pwr") @@ -47,39 +45,44 @@ anova_stats <- function(model, digits = 3) { cohens.f <- sqrt(partial.etasq / (1 - partial.etasq)) # bind as data frame - as <- dplyr::bind_rows( + anov_stat <- rbind( data.frame(etasq, partial.etasq, omegasq, partial.omegasq, epsilonsq, cohens.f), data.frame(etasq = NA, partial.etasq = NA, omegasq = NA, partial.omegasq = NA, epsilonsq = NA, cohens.f = NA) - ) %>% - sjmisc::add_columns(aov.sum) + ) + anov_stat <- cbind(anov_stat, data.frame(aov.sum)) # get nr of terms - nt <- nrow(as) - 1 + nt <- nrow(anov_stat) - 1 # finally, compute power - power <- tryCatch( - { - c( - pwr::pwr.f2.test(u = as$df[1:nt], v = as$df[nrow(as)], f2 = as$cohens.f[1:nt]^2)[["power"]], - NA - ) - }, + as_power <- tryCatch( + c( + pwr::pwr.f2.test( + u = anov_stat$df[1:nt], + v = anov_stat$df[nrow(anov_stat)], + f2 = anov_stat$cohens.f[1:nt]^2 + )[["power"]], + NA + ), error = function(x) { NA } ) - out <- sjmisc::add_variables(as, power = power) %>% - sjmisc::round_num(digits = digits) %>% - as.data.frame() + out <- cbind(anov_stat, data.frame(power = as_power)) + out[] <- lapply(out, function(i) { + if (is.numeric(i)) { + round(i, digits) + } else { + i + } + }) class(out) <- c("sj_anova_stat", class(out)) out } - -#' @importFrom rlang .data aov_stat <- function(model, type) { aov.sum <- aov_stat_summary(model) aov.res <- aov_stat_core(aov.sum, type) @@ -92,7 +95,7 @@ aov_stat <- function(model, type) { aov_stat_summary <- function(model) { - insight::check_if_installed("broom") + insight::check_if_installed("parameters") # check if we have a mixed model mm <- is_merMod(model) ori.model <- model @@ -103,12 +106,12 @@ aov_stat_summary <- function(model) { model <- stats::anova(model) # get summary table - aov.sum <- as.data.frame(broom::tidy(model)) + aov.sum <- insight::standardize_names(as.data.frame(parameters::model_parameters(model)), style = "broom") # for mixed models, add information on residuals if (mm) { res <- stats::residuals(ori.model) - aov.sum <- dplyr::bind_rows( + aov.sum <- rbind( aov.sum, data_frame( term = "Residuals", @@ -132,8 +135,14 @@ aov_stat_summary <- function(model) { colnames(aov.sum) <- c("term", "df", "sumsq", "meansq", "statistic", "p.value") # for car::Anova, the meansq-column might be missing, so add it manually - if (!obj_has_name(aov.sum, "meansq")) - aov.sum <- sjmisc::add_variables(aov.sum, meansq = aov.sum$sumsq / aov.sum$df, .after = "sumsq") + if (!obj_has_name(aov.sum, "meansq")) { + pos_sumsq <- which(colnames(aov.sum) == "sumsq") + aov.sum <- cbind( + aov.sum[1:pos_sumsq], + data.frame(meansq = aov.sum$sumsq / aov.sum$df), + aov.sum[(pos_sumsq + 
1):ncol(aov.sum)] + ) + } intercept <- .which_intercept(aov.sum$term) if (length(intercept) > 0) { @@ -165,36 +174,35 @@ aov_stat_core <- function(aov.sum, type) { N <- sum(aov.sum[["df"]]) + 1 - if (type == "omega") { + aovstat <- switch(type, # compute omega squared for each model term - aovstat <- purrr::map_dbl(1:n_terms, function(x) { + omega = unlist(lapply(1:n_terms, function(x) { ss.term <- aov.sum[["sumsq"]][x] df.term <- aov.sum[["df"]][x] (ss.term - df.term * meansq.resid) / (ss.total + meansq.resid) - }) - } else if (type == "pomega") { + })), # compute partial omega squared for each model term - aovstat <- purrr::map_dbl(1:n_terms, function(x) { + pomega = unlist(lapply(1:n_terms, function(x) { df.term <- aov.sum[["df"]][x] meansq.term <- aov.sum[["meansq"]][x] (df.term * (meansq.term - meansq.resid)) / (df.term * meansq.term + (N - df.term) * meansq.resid) - }) - } else if (type == "epsilon") { + })), # compute epsilon squared for each model term - aovstat <- purrr::map_dbl(1:n_terms, function(x) { + epsilon = unlist(lapply(1:n_terms, function(x) { ss.term <- aov.sum[["sumsq"]][x] df.term <- aov.sum[["df"]][x] (ss.term - df.term * meansq.resid) / ss.total - }) - } else if (type == "eta") { + })), # compute eta squared for each model term - aovstat <- - purrr::map_dbl(1:n_terms, ~ aov.sum[["sumsq"]][.x] / sum(aov.sum[["sumsq"]])) - } else if (type %in% c("cohens.f", "peta")) { + eta = unlist(lapply(1:n_terms, function(x) { + aov.sum[["sumsq"]][x] / sum(aov.sum[["sumsq"]]) + })), # compute partial eta squared for each model term - aovstat <- - purrr::map_dbl(1:n_terms, ~ aov.sum[["sumsq"]][.x] / (aov.sum[["sumsq"]][.x] + ss.resid)) - } + cohens.f = , + peta = unlist(lapply(1:n_terms, function(x) { + aov.sum[["sumsq"]][x] / (aov.sum[["sumsq"]][x] + ss.resid) + })) + ) # compute Cohen's F if (type == "cohens.f") aovstat <- sqrt(aovstat / (1 - aovstat)) diff --git a/R/auto_prior.R b/R/auto_prior.R index 7442c09d..1e036369 100644 --- a/R/auto_prior.R +++ b/R/auto_prior.R @@ -34,20 +34,16 @@ #' formula used in \code{brms::brm()} must be rewritten to something like #' \code{y ~ 0 + intercept ...}, see \code{\link[brms]{set_prior}}. 
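# A minimal sketch (assuming sjstats and brms are installed) of how the
# priors created by auto_prior() are consumed downstream: the returned
# brmsprior object is passed via the `prior` argument of brms::brm(). The
# brm() call stays commented out, as in the examples below, because it
# would compile and sample a Stan model.
library(sjstats)
data(efc)
p <- auto_prior(neg_c_7 ~ c161sex + c160age, efc, gaussian = TRUE)
# brms::brm(neg_c_7 ~ c161sex + c160age, data = efc, prior = p)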
#' -#' @examples -#' library(sjmisc) +#' @examplesIf requireNamespace("brms") #' data(efc) #' efc$c172code <- as.factor(efc$c172code) -#' efc$c161sex <- to_label(efc$c161sex) +#' efc$c161sex <- as.factor(efc$c161sex) #' #' mf <- formula(neg_c_7 ~ c161sex + c160age + c172code) -#' -#' if (requireNamespace("brms", quietly = TRUE)) -#' auto_prior(mf, efc, TRUE) +#' auto_prior(mf, efc, TRUE) #' #' ## compare to -#' # library(rstanarm) -#' # m <- stan_glm(mf, data = efc, chains = 2, iter = 200) +#' # m <- rstanarm::stan_glm(mf, data = efc, chains = 2, iter = 200) #' # ps <- prior_summary(m) #' # ps$prior_intercept$adjusted_scale #' # ps$prior$adjusted_scale @@ -59,22 +55,16 @@ #' # add informative priors #' mf <- formula(neg_c_7 ~ c161sex + c172code) #' -#' if (requireNamespace("brms", quietly = TRUE)) { -#' auto_prior(mf, efc, TRUE) + -#' brms::prior(normal(.1554, 40), class = "b", coef = "c160age") -#' } +#' auto_prior(mf, efc, TRUE) + +#' brms::prior(normal(.1554, 40), class = "b", coef = "c160age") #' #' # example with binary response #' efc$neg_c_7d <- ifelse(efc$neg_c_7 < median(efc$neg_c_7, na.rm = TRUE), 0, 1) #' mf <- formula(neg_c_7d ~ c161sex + c160age + c172code + e17age) -#' -#' if (requireNamespace("brms", quietly = TRUE)) -#' auto_prior(mf, efc, FALSE) +#' auto_prior(mf, efc, FALSE) #' @export auto_prior <- function(formula, data, gaussian, locations = NULL) { - - if (!requireNamespace("brms", quietly = TRUE)) - stop("Package `brms` required.", call. = FALSE) + insight::check_if_installed("brms") scale.b <- 2.5 scale.y <- 10 @@ -82,20 +72,14 @@ auto_prior <- function(formula, data, gaussian, locations = NULL) { pred <- insight::find_predictors(formula, effects = "all", flatten = TRUE) y.name <- insight::find_response(formula, combine = TRUE) - cols <- c(y.name, pred) - - data <- data %>% - dplyr::select(!! cols) %>% - stats::na.omit() %>% - as.data.frame() - + data <- stats::na.omit(data[c(y.name, pred)]) y <- data[[y.name]] # check if response is binary - if (missing(gaussian) && dplyr::n_distinct(y, na.rm = TRUE) == 2) gaussian <- FALSE + if (missing(gaussian) && insight::n_unique(y) == 2) gaussian <- FALSE - if (isTRUE(gaussian) && dplyr::n_distinct(y, na.rm = TRUE) == 2) - warning("Priors were calculated based on assumption that the response is Gaussian, however it seems to be binary.", call. = F) + if (isTRUE(gaussian) && insight::n_unique(y) == 2) + insight::format_alert("Priors were calculated based on assumption that the response is Gaussian, however it seems to be binary.") # nolint if (gaussian) { @@ -135,7 +119,7 @@ auto_prior <- function(formula, data, gaussian, locations = NULL) { term.names <- c(term.names, i) } - for (i in 1:length(term.names)) { + for (i in seq_along(term.names)) { if (!is.null(locations) && length(locations) >= (i + 1)) location.b <- locations[i + 1] diff --git a/R/boot_ci.R b/R/boot_ci.R index a1ad3bdd..33b8b9f3 100644 --- a/R/boot_ci.R +++ b/R/boot_ci.R @@ -6,66 +6,55 @@ #' replicate estimates. #' #' @param data A data frame that containts the vector with bootstrapped -#' estimates, or directly the vector (see 'Examples'). +#' estimates, or directly the vector (see 'Examples'). #' @param ci.lvl Numeric, the level of the confidence intervals. -#' @param ... Optional, unquoted names of variables with bootstrapped estimates. -#' Required, if either \code{data} is a data frame (and no vector), -#' and only selected variables from \code{data} should be processed. 
-#' You may also use functions like \code{:} or tidyselect's
-#' \code{select_helpers()}.
+#' @param select Optional, unquoted names of variables (as character vector)
+#' with bootstrapped estimates. Required, if either `data` is a data frame
+#' (and no vector), and only selected variables from `data` should be processed.
 #' @param method Character vector, indicating if confidence intervals should be
-#' based on bootstrap standard error, multiplied by the value of the
-#' quantile function of the t-distribution (default), or on sample
-#' quantiles of the bootstrapped values. See 'Details' in \code{boot_ci()}.
-#' May be abbreviated.
-#'
-#' @return A data frame with either bootstrap estimate,
-#' standard error, the lower and upper confidence intervals or the
-#' p-value for all bootstrapped estimates.
-#'
-#' @details The methods require one or more vectors of bootstrap replicate estimates
-#' as input.
-#' \itemize{
-#' \item{
-#' \code{boot_est()} returns the bootstrapped estimate, simply by
-#' computing the mean value of all bootstrap estimates.
-#' }
-#' \item{
-#' \code{boot_se()} computes the nonparametric bootstrap standard
-#' error by calculating the standard deviation of the input vector.
-#' }
-#' \item{
-#' The mean value of the input vector and its standard error is used
-#' by \code{boot_ci()} to calculate the lower and upper confidence
-#' interval, assuming a t-distribution of bootstrap estimate replicates
-#' (for \code{method = "dist"}, the default, which is
-#' \code{mean(x) +/- qt(.975, df = length(x) - 1) * sd(x)}); for
-#' \code{method = "quantile"}, 95\% sample quantiles are used to compute
-#' the confidence intervals (\code{quantile(x, probs = c(.025, .975))}).
-#' Use \code{ci.lvl} to change the level for the confidence interval.
-#' }
-#' \item{
-#' P-values from \code{boot_p()} are also based on t-statistics,
-#' assuming normal distribution.
-#' }
-#' }
+#' based on bootstrap standard error, multiplied by the value of the quantile
+#' function of the t-distribution (default), or on sample quantiles of the
+#' bootstrapped values. See 'Details' in `boot_ci()`. May be abbreviated.
+#'
+#' @return A data frame with either bootstrap estimate, standard error, the
+#' lower and upper confidence intervals or the p-value for all bootstrapped
+#' estimates.
+#'
+#' @details The methods require one or more vectors of bootstrap replicate
+#' estimates as input.
+#'
+#' - `boot_est()`: returns the bootstrapped estimate, simply by computing
+#'   the mean value of all bootstrap estimates.
+#' - `boot_se()`: computes the nonparametric bootstrap standard error by
+#'   calculating the standard deviation of the input vector.
+#' - The mean value of the input vector and its standard error is used by
+#'   `boot_ci()` to calculate the lower and upper confidence interval,
+#'   assuming a t-distribution of bootstrap estimate replicates (for
+#'   `method = "dist"`, the default, which is
+#'   `mean(x) +/- qt(0.975, df = length(x) - 1) * sd(x)`); for
+#'   `method = "quantile"`, 95\% sample quantiles are used to compute the
+#'   confidence intervals (`quantile(x, probs = c(0.025, 0.975))`). Use
+#'   `ci.lvl` to change the level for the confidence interval.
+#' - P-values from `boot_p()` are also based on t-statistics, assuming normal
+#'   distribution.
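# A worked sketch of the two interval methods described above, with a plain
# numeric vector standing in for a column of bootstrap replicates:
x <- rnorm(1000, mean = 2, sd = 0.5)
# method = "dist": mean +/- t-quantile times the bootstrap standard error
mean(x) + c(-1, 1) * qt(0.975, df = length(x) - 1) * sd(x)
# method = "quantile": empirical 2.5% and 97.5% sample quantiles
quantile(x, probs = c(0.025, 0.975))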
#'
#' @references Carpenter J, Bithell J. Bootstrap confdence intervals: when, which, what? A practical guide for medical statisticians. Statist. Med. 2000; 19:1141-1164
-#'
-#' @seealso \code{\link{bootstrap}} to generate nonparametric bootstrap samples.
+#' @seealso [`bootstrap()`] to generate nonparametric bootstrap samples.
 #'
 #' @examples
-#' library(dplyr)
-#' library(purrr)
 #' data(efc)
 #' bs <- bootstrap(efc, 100)
 #'
 #' # now run models for each bootstrapped sample
-#' bs$models <- map(bs$strap, ~lm(neg_c_7 ~ e42dep + c161sex, data = .x))
+#' bs$models <- lapply(
+#'   bs$strap,
+#'   function(.x) lm(neg_c_7 ~ e42dep + c161sex, data = .x)
+#' )
 #'
 #' # extract coefficient "dependency" and "gender" from each model
-#' bs$dependency <- map_dbl(bs$models, ~coef(.x)[2])
-#' bs$gender <- map_dbl(bs$models, ~coef(.x)[3])
+#' bs$dependency <- vapply(bs$models, function(x) coef(x)[2], numeric(1))
+#' bs$gender <- vapply(bs$models, function(x) coef(x)[3], numeric(1))
 #'
 #' # get bootstrapped confidence intervals
 #' boot_ci(bs$dependency)
@@ -76,67 +65,26 @@
 #'
 #' # alternative function calls.
 #' boot_ci(bs$dependency)
-#' boot_ci(bs, dependency)
-#' boot_ci(bs, dependency, gender)
-#' boot_ci(bs, dependency, gender, method = "q")
+#' boot_ci(bs, "dependency")
+#' boot_ci(bs, c("dependency", "gender"))
+#' boot_ci(bs, c("dependency", "gender"), method = "q")
 #'
 #'
 #' # compare coefficients
 #' mean(bs$dependency)
 #' boot_est(bs$dependency)
 #' coef(fit)[2]
-#'
-#'
-#' # bootstrap() and boot_ci() work fine within pipe-chains
-#' efc %>%
-#'   bootstrap(100) %>%
-#'   mutate(
-#'     models = map(strap, ~lm(neg_c_7 ~ e42dep + c161sex, data = .x)),
-#'     dependency = map_dbl(models, ~coef(.x)[2])
-#'   ) %>%
-#'   boot_ci(dependency)
-#'
-#' # check p-value
-#' boot_p(bs$gender)
-#' summary(fit)$coefficients[3, ]
-#'
-#' \dontrun{
-#' # 'spread_coef()' from the 'sjmisc'-package makes it easy to generate
-#' # bootstrapped statistics like confidence intervals or p-values
-#' library(dplyr)
-#' library(sjmisc)
-#' efc %>%
-#'   # generate bootstrap replicates
-#'   bootstrap(100) %>%
-#'   # apply lm to all bootstrapped data sets
-#'   mutate(
-#'     models = map(strap, ~lm(neg_c_7 ~ e42dep + c161sex + c172code, data = .x))
-#'   ) %>%
-#'   # spread model coefficient for all 100 models
-#'   spread_coef(models) %>%
-#'   # compute the CI for all bootstrapped model coefficients
-#'   boot_ci(e42dep, c161sex, c172code)
-#'
-#' # or...
-#' efc %>% -#' # generate bootstrap replicates -#' bootstrap(100) %>% -#' # apply lm to all bootstrapped data sets -#' mutate( -#' models = map(strap, ~lm(neg_c_7 ~ e42dep + c161sex + c172code, data = .x)) -#' ) %>% -#' # spread model coefficient for all 100 models -#' spread_coef(models, append = FALSE) %>% -#' # compute the CI for all bootstrapped model coefficients -#' boot_ci()} -#' @importFrom rlang .data #' @export -boot_ci <- function(data, ..., method = c("dist", "quantile"), ci.lvl = .95) { +boot_ci <- function(data, select = NULL, method = c("dist", "quantile"), ci.lvl = 0.95) { # match arguments method <- match.arg(method) # evaluate arguments, generate data - .dat <- get_dot_data(data, dplyr::quos(...)) + if (is.null(select)) { + .dat <- as.data.frame(data) + } else { + .dat <- data[select] + } # compute confidence intervals for all values transform_boot_result(lapply(.dat, function(x) { @@ -144,9 +92,9 @@ boot_ci <- function(data, ..., method = c("dist", "quantile"), ci.lvl = .95) { # bootstrap values or quantiles if (method == "dist") { # get bootstrap standard error - bootse <- stats::qt((1 + ci.lvl) / 2, df = length(x) - 1) * stats::sd(x, na.rm = T) + bootse <- stats::qt((1 + ci.lvl) / 2, df = length(x) - 1) * stats::sd(x, na.rm = TRUE) # lower and upper confidence interval - ci <- mean(x, na.rm = T) + c(-bootse, bootse) + ci <- mean(x, na.rm = TRUE) + c(-bootse, bootse) } else { # CI based on quantiles of bootstrapped values ci <- stats::quantile(x, probs = c((1 - ci.lvl) / 2, (1 + ci.lvl) / 2)) @@ -159,16 +107,18 @@ boot_ci <- function(data, ..., method = c("dist", "quantile"), ci.lvl = .95) { #' @rdname boot_ci -#' @importFrom stats sd #' @export -boot_se <- function(data, ...) { +boot_se <- function(data, select = NULL) { # evaluate arguments, generate data - .dat <- get_dot_data(data, dplyr::quos(...)) - + if (is.null(select)) { + .dat <- as.data.frame(data) + } else { + .dat <- data[select] + } # compute confidence intervalls for all values transform_boot_result(lapply(.dat, function(x) { # get bootstrap standard error - se <- stats::sd(x, na.rm = T) + se <- stats::sd(x, na.rm = TRUE) names(se) <- "std.err" se })) @@ -176,16 +126,18 @@ boot_se <- function(data, ...) { #' @rdname boot_ci -#' @importFrom stats sd pt #' @export -boot_p <- function(data, ...) { +boot_p <- function(data, select = NULL) { # evaluate arguments, generate data - .dat <- get_dot_data(data, dplyr::quos(...)) - + if (is.null(select)) { + .dat <- as.data.frame(data) + } else { + .dat <- data[select] + } # compute confidence intervalls for all values transform_boot_result(lapply(.dat, function(x) { # compute t-statistic - t.stat <- mean(x, na.rm = T) / stats::sd(x, na.rm = T) + t.stat <- mean(x, na.rm = TRUE) / stats::sd(x, na.rm = TRUE) # compute p-value p <- 2 * stats::pt(abs(t.stat), df = length(x) - 1, lower.tail = FALSE) names(p) <- "p.value" @@ -196,34 +148,23 @@ boot_p <- function(data, ...) { #' @rdname boot_ci #' @export -boot_est <- function(data, ...) 
{ +boot_est <- function(data, select = NULL) { # evaluate arguments, generate data - .dat <- get_dot_data(data, dplyr::quos(...)) - + if (is.null(select)) { + .dat <- as.data.frame(data) + } else { + .dat <- data[select] + } # compute mean for all values (= bootstrapped estimate) transform_boot_result(lapply(.dat, function(x) { - estimate <- mean(x, na.rm = T) + estimate <- mean(x, na.rm = TRUE) names(estimate) <- "estimate" estimate })) } - transform_boot_result <- function(res) { # transform a bit, so we have each estimate in a row, and ci's as columns... - res %>% - as.data.frame() %>% - t() %>% - as.data.frame() %>% - rownames_as_column(var = "term") -} - - -#' @importFrom dplyr select -get_dot_data <- function(x, qs) { - if (sjmisc::is_empty(qs)) - as.data.frame(x) - else - suppressWarnings(dplyr::select(x, !!!qs)) + rownames_as_column(as.data.frame(t(as.data.frame(res))), var = "term") } diff --git a/R/bootstrap.R b/R/bootstrap.R index c3d3635f..444896a2 100644 --- a/R/bootstrap.R +++ b/R/bootstrap.R @@ -61,17 +61,11 @@ #' mean(as.data.frame(x)$c12hour, na.rm = TRUE) #' })) #' -#' # or as tidyverse-approach -#' if (require("dplyr") && require("purrr")) { -#' bs <- efc %>% -#' bootstrap(100) %>% -#' mutate( -#' c12hour = map_dbl(strap, ~mean(as.data.frame(.x)$c12hour, na.rm = TRUE)) -#' ) +#' # bootstrapped standard error +#' boot_se(bs, "c12hour") #' -#' # bootstrapped standard error -#' boot_se(bs, c12hour) -#' } +#' # bootstrapped CI +#' boot_ci(bs, "c12hour") #' @export bootstrap <- function(data, n, size) { if (!missing(size) && !is.null(size)) { diff --git a/R/chi_squared_test.R b/R/chi_squared_test.R index 426883a6..474ecadd 100644 --- a/R/chi_squared_test.R +++ b/R/chi_squared_test.R @@ -1,6 +1,6 @@ #' @title Chi-Squared test #' @name chi_squared_test -#' @description This function performs a \eqn{chi}^2 test for contingency +#' @description This function performs a \eqn{\chi^2} test for contingency #' tables or tests for given probabilities. The returned effects sizes are #' Cramer's V for tables with more than two rows and columns, Phi (\eqn{\phi}) #' for 2x2 tables, and \ifelse{latex}{\eqn{Fei}}{פ (Fei)} for tests against @@ -17,6 +17,10 @@ #' @param ... Additional arguments passed down to [`chisq.test()`]. #' @inheritParams mann_whitney_test #' +#' @inheritSection mann_whitney_test Which test to use +#' +#' @inherit mann_whitney_test seealso +#' #' @return A data frame with test results. The returned effects sizes are #' Cramer's V for tables with more than two rows and columns, Phi (\eqn{\phi}) #' for 2x2 tables, and \ifelse{latex}{\eqn{Fei}}{פ (Fei)} for tests against @@ -33,12 +37,23 @@ #' The weighted version of the chi-squared test is based on the a weighted #' table, using [`xtabs()`] as input for `chisq.test()`. #' -#' @references Ben-Shachar, M.S., Patil, I., Thériault, R., Wiernik, B.M., -#' Lüdecke, D. (2023). Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data -#' That Use the Chi‑Squared Statistic. Mathematics, 11, 1982. -#' \doi{10.3390/math11091982} +#' Interpretation of effect sizes are based on rules described in +#' [`effectsize::interpret_phi()`], [`effectsize::interpret_cramers_v()`], +#' and [`effectsize::interpret_fei()`]. +#' +#' @references +#' - Ben-Shachar, M.S., Patil, I., Thériault, R., Wiernik, B.M., +#' Lüdecke, D. (2023). Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data +#' That Use the Chi‑Squared Statistic. Mathematics, 11, 1982. +#' \doi{10.3390/math11091982} #' -#' @examples +#' - Bender, R., Lange, S., Ziegler, A. 
Wichtige Signifikanztests.
+#'   Dtsch Med Wochenschr 2007; 132: e24–e25
+#'
+#' - du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer
+#'   Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8
+#'
+#' @examplesIf requireNamespace("effectsize")
 #' data(efc)
 #' efc$weight <- abs(rnorm(nrow(efc), 1, 0.3))
 #'
@@ -58,6 +73,12 @@ chi_squared_test <- function(data, weights = NULL, paired = FALSE, ...) {
+  # sanity check - if "select" specifies only one variable, and "by" and
+  # "probabilities" are NULL, set default probabilities
+  if (is.null(probabilities) && !is.null(select) && is.null(by) && length(select) == 1) {
+    probabilities <- rep(1 / length(data[[select]]), length(data[[select]]))
+  }
+
   if (is.null(probabilities)) {
     .calculate_chisq(data, select, by, weights, paired, ...)
   } else {
@@ -260,8 +281,29 @@ print.sj_htest_chi <- function(x, ...) {
   eff_symbol <- .format_symbols(x$effect_size_name)
   stat_symbol <- .format_symbols(x$statistic_name)

+  # string for effectsizes
+  eff_string <- switch(x$effect_size_name,
+    Fei = sprintf(
+      "%s = %.3f (%s effect)",
+      eff_symbol,
+      x$effect_size,
+      effectsize::interpret_fei(x$effect_size)
+    ),
+    Phi = sprintf(
+      "%s = %.3f (%s effect)",
+      eff_symbol,
+      x$effect_size,
+      effectsize::interpret_phi(x$effect_size)
+    ),
+    sprintf(
+      "Cramer's V = %.3f (%s effect)",
+      x$effect_size,
+      effectsize::interpret_cramers_v(x$effect_size)
+    )
+  )
+
   cat(sprintf(
-    "\n  %s = %.4f, %s = %.4f, df = %i, %s\n\n",
-    stat_symbol, x$statistic, eff_symbol, x$effect_size, round(x$df), insight::format_p(x$p)
+    "\n  %s = %.3f, %s, df = %i, %s\n\n",
+    stat_symbol, x$statistic, eff_string, round(x$df), insight::format_p(x$p)
  ))
}
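# A short sketch of the interpretation helpers the new eff_string code above
# relies on (they are also referenced in the function's @details text); the
# input value is arbitrary, and the returned labels depend on effectsize's
# default interpretation rules:
library(effectsize)
interpret_phi(0.3)
interpret_cramers_v(0.3)
interpret_fei(0.3)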
diff --git a/R/confint_ncg.R b/R/confint_ncg.R index 6996d7da..abf8a305 100644 --- a/R/confint_ncg.R +++ b/R/confint_ncg.R @@ -3,7 +3,6 @@
 # Author: Ken Kelley
 # License: GPL-3

-#' @importFrom stats pf qf
 confint_ncg <- function(F.value = NULL, conf.level = 0.95, df.1 = NULL, df.2 = NULL) {
   alpha.lower <- alpha.upper <- (1 - conf.level) / 2
   tol <- 1e-09
@@ -40,12 +39,12 @@ confint_ncg <- function(F.value = NULL, conf.level = 0.95, df.1 = NULL, df.2 = N
       Diff.2 <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.Bounds[2]) - (1 - alpha.lower) > tol
       Diff.3 <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = LL.Bounds[3]) - (1 - alpha.lower) > tol

-      if (isTRUE(Diff.1) & isTRUE(Diff.2) & !isTRUE(Diff.3)) {
+      if (isTRUE(Diff.1) && isTRUE(Diff.2) && !isTRUE(Diff.3)) {
        LL.Bounds <- c(LL.Bounds[2], (LL.Bounds[2] + LL.Bounds[3]) / 2, LL.Bounds[3])
       }

-      if (isTRUE(Diff.1) & !isTRUE(Diff.2) & !isTRUE(Diff.3)) {
+      if (isTRUE(Diff.1) && !isTRUE(Diff.2) && !isTRUE(Diff.3)) {
         LL.Bounds <- c(LL.Bounds[1], (LL.Bounds[1] + LL.Bounds[2]) / 2, LL.Bounds[2])
       }

@@ -89,14 +88,13 @@ confint_ncg <- function(F.value = NULL, conf.level = 0.95, df.1 = NULL, df.2 = N
      Diff.2 <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.Bounds[2]) - alpha.upper > tol
      Diff.3 <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.Bounds[3]) - alpha.upper > tol

-      if (isTRUE(Diff.1) & isTRUE(Diff.2) & !isTRUE(Diff.3)) {
+      if (isTRUE(Diff.1) && isTRUE(Diff.2) && !isTRUE(Diff.3)) {
        UL.Bounds <- c(UL.Bounds[2], (UL.Bounds[2] + UL.Bounds[3]) / 2, UL.Bounds[3])
      }

-      if (isTRUE(Diff.1) & !isTRUE(Diff.2) & !isTRUE(Diff.3)) {
-        UL.Bounds <- c(UL.Bounds[1], (UL.Bounds[1] +
-          UL.Bounds[2])/2, UL.Bounds[2])
+      if (isTRUE(Diff.1) && !isTRUE(Diff.2) && !isTRUE(Diff.3)) {
+        UL.Bounds <- c(UL.Bounds[1], (UL.Bounds[1] + UL.Bounds[2]) / 2, UL.Bounds[2])
      }

      Diff <- stats::pf(q = F.value, df1 = df.1, df2 = df.2, ncp = UL.Bounds[2]) - alpha.upper
diff --git a/R/cramer.R b/R/cramer.R index a623da44..5c12251b 100644 --- a/R/cramer.R +++ b/R/cramer.R @@ -23,19 +23,19 @@ cramers_v.ftable <- function(tab, ...) {
 #' @rdname crosstable_statistics
 #' @export
 cramers_v.formula <- function(formula, data, ci.lvl = NULL, n = 1000, method = c("dist", "quantile"), ...) {
-  terms <- all.vars(formula)
-  tab <- table(data[[terms[1]]], data[[terms[2]]])
+  fterms <- all.vars(formula)
+  tab <- table(data[[fterms[1]]], data[[fterms[2]]])
   method <- match.arg(method)

   if (is.null(ci.lvl) || is.na(ci.lvl)) {
     .cramers_v(tab)
   } else {
-    straps <- sjstats::bootstrap(data[terms], n)
+    straps <- sjstats::bootstrap(data[fterms], n)
     tables <- lapply(straps$strap, function(x) {
       dat <- as.data.frame(x)
       table(dat[[1]], dat[[2]])
     })
-    cramers <- sapply(tables, function(x) .cramers_v(x))
+    cramers <- sapply(tables, .cramers_v)
     ci <- boot_ci(cramers, ci.lvl = ci.lvl, method = method)

     data_frame(
diff --git a/R/cv.R b/R/cv.R index 38d6ece0..c68844e1 100644 --- a/R/cv.R +++ b/R/cv.R @@ -20,7 +20,6 @@
 #' fit <- lm(barthtot ~ c160age + c12hour, data = efc)
 #' cv(fit)
 #'
-#' @importFrom stats sd
 #' @export
 cv <- function(x, ...) {
   # return value
@@ -37,9 +36,8 @@
 }

-#' @importFrom performance rmse
-#' @importFrom insight get_response
 cv_helper <- function(x) {
+  insight::check_if_installed("performance")
   # check if we have a fitted linear model
   if (inherits(x, c("lm", "lmerMod", "lme", "merModLmerTest")) && !inherits(x, "glm")) {
     # get response
diff --git a/R/cv_error.R b/R/cv_error.R index c4880b12..557b8d15 100644 --- a/R/cv_error.R +++ b/R/cv_error.R @@ -31,41 +31,33 @@
 #'   neg_c_7 ~ barthtot + c12hour
 #' ))
 #'
-#' @importFrom modelr crossv_kfold
-#' @importFrom dplyr mutate summarise
-#' @importFrom purrr map map2 map_dbl map_df
-#' @importFrom tidyr unnest
-#' @importFrom rlang .data
-#' @importFrom insight find_response
-#' @importFrom performance rmse
 #' @export
 cv_error <- function(data, formula, k = 5) {
-  insight::check_if_installed("broom")
-
-  # compute cross validation data
-  cv_data <- data %>%
-    modelr::crossv_kfold(k = k) %>%
-    dplyr::mutate(
-      trained.models = purrr::map(.data$train, ~ stats::lm(formula, data = .x)),
-      predicted = purrr::map2(.data$trained.models, .data$test, ~ broom::augment(.x, newdata = .y)),
-      residuals = purrr::map(.data$predicted, ~.x[[insight::find_response(formula)]] - .x$.fitted),
-      rmse.train = purrr::map_dbl(.data$trained.models, ~ performance::rmse(.x))
-    )
+  insight::check_if_installed("datawizard")
+  # response
+  resp <- insight::find_response(formula)

-  # Training error
-  train.error <- dplyr::summarise(cv_data, train.error = mean(.data$rmse.train, na.rm = TRUE))
+  # compute cross validation data, i.e. k random 80/20 train-test splits
+  cv_data <- lapply(seq_len(k), function(i) {
+    datawizard::data_partition(data, proportion = 0.8)
+  })
+  # get train and test datasets
+  train_data <- lapply(cv_data, function(cvdat) cvdat[[1]])
+  test_data <- lapply(cv_data, function(cvdat) cvdat[[2]])
+  # fit models on datasets
+  trained_models <- lapply(train_data, function(x) stats::lm(formula, data = x))
+  test_models <- lapply(test_data, function(x) stats::lm(formula, data = x))

-  # Test error
-  test.error <- cv_data %>%
-    tidyr::unnest(.data$predicted, .data$residuals) %>%
-    dplyr::summarise(test.error = sqrt(mean(.data$residuals^2, na.rm = TRUE)))
+  # RMSE
+  train_error <- mean(vapply(trained_models, performance::rmse, numeric(1)), na.rm = TRUE)
+  test_error <- mean(vapply(test_models, performance::rmse,
numeric(1)), na.rm = TRUE) data_frame( model = deparse(formula), - train.error = round(train.error[[1]], 4), - test.error = round(test.error[[1]], 4) + train.error = round(train_error, 4), + test.error = round(test_error, 4) ) } @@ -74,5 +66,5 @@ cv_error <- function(data, formula, k = 5) { #' @rdname cv_error #' @export cv_compare <- function(data, formulas, k = 5) { - purrr::map_df(formulas, ~ cv_error(data, formula = .x, k = k)) + do.call(rbind, lapply(formulas, function(f) cv_error(data, formula = f, k = k))) } diff --git a/R/find_beta.R b/R/find_beta.R index 1ae733cc..66d1529a 100644 --- a/R/find_beta.R +++ b/R/find_beta.R @@ -137,24 +137,24 @@ find_beta2 <- function(x, se, ci, n) { # need to compute proportion x <- x / n - p2 <- .95 + p2 <- 0.95 x2 <- x + bvar } # for standard errors, we assume a 68% quantile if (!missing(se)) { - p2 <- .68 + p2 <- 0.68 x2 <- x + se } # for CI, we assume a 68% quantile if (!missing(ci)) { - p2 <- .95 + p2 <- 0.95 x2 <- ci } # the probability is assumed to be the median - p1 <- .5 + p1 <- 0.5 x1 <- x find_beta(x1, p1, x2, p2) @@ -182,4 +182,3 @@ find_normal <- function(x1, p1, x2, p2) { list(mean = mw, sd = stddev) } - diff --git a/R/gmd.R b/R/gmd.R index c7c3eaf4..dcfe9678 100644 --- a/R/gmd.R +++ b/R/gmd.R @@ -1,45 +1,42 @@ #' @title Gini's Mean Difference #' @name gmd -#' @description \code{gmd()} computes Gini's mean difference for a numeric vector +#' @description `gmd()` computes Gini's mean difference for a numeric vector #' or for all numeric vectors in a data frame. #' #' @param x A vector or data frame. -#' @param ... Optional, unquoted names of variables that should be selected for -#' further processing. Required, if \code{x} is a data frame (and no vector) -#' and only selected variables from \code{x} should be processed. You may also -#' use functions like \code{:} or tidyselect's \code{select_helpers()}. +#' @param select Optional, names of variables as character vector that should be +#' selected for further processing. Required, if `x` is a data frame (and no vector) +#' and only selected variables from `x` should be processed. #' #' @return For numeric vectors, Gini's mean difference. For non-numeric vectors -#' or vectors of length < 2, returns \code{NA}. +#' or vectors of length < 2, returns `NA`. #' #' @note Gini's mean difference is defined as the mean absolute difference between -#' any two distinct elements of a vector. Missing values from \code{x} are -#' silently removed. +#' any two distinct elements of a vector. Missing values from `x` are silently +#' removed. #' #' @references David HA. Gini's mean difference rediscovered. Biometrika 1968(55): 573-575 #' #' @examples #' data(efc) #' gmd(efc$e17age) -#' gmd(efc, e17age, c160age, c12hour) +#' gmd(efc, c("e17age", "c160age", "c12hour")) #' -#' @importFrom dplyr quos select -#' @importFrom purrr map_df -#' @importFrom sjmisc is_empty #' @export -gmd <- function(x, ...) { - # evaluate dots - qs <- dplyr::quos(...) 
- if (!sjmisc::is_empty(qs)) x <- suppressMessages(dplyr::select(x, !!!qs)) - - if (is.data.frame(x)) - purrr::map_df(x, gmd_helper) - else +gmd <- function(x, select = NULL) { + if (is.data.frame(x)) { + do.call(rbind, lapply(select, function(i) { + data.frame( + variable = i, + gmd = gmd_helper(x[[i]]) + ) + })) + } else { gmd_helper(x) + } } -#' @importFrom stats na.omit gmd_helper <- function(x) { if (!is.numeric(x)) return(NA) diff --git a/R/gof.R b/R/gof.R index 3699a235..9b6d614e 100644 --- a/R/gof.R +++ b/R/gof.R @@ -48,7 +48,6 @@ #' # equal to population #' chisq_gof(efc$e42dep, prop.table(table(efc$e42dep))) #' -#' @importFrom stats na.omit fitted resid formula as.formula lm pnorm chisq.test #' @export chisq_gof <- function(x, prob = NULL, weights = NULL) { if (inherits(x, "glm")) { diff --git a/R/helpfunctions.R b/R/helpfunctions.R index ce36f369..134ad328 100644 --- a/R/helpfunctions.R +++ b/R/helpfunctions.R @@ -12,60 +12,8 @@ is_merMod <- function(fit) { } -is_stan_model <- function(fit) { - inherits(fit, c("stanreg", "stanfit", "brmsfit")) -} - - -#' @importFrom sjmisc str_contains -get_glm_family <- function(fit) { - c.f <- class(fit) - - # do we have glm? if so, get link family. make exceptions - # for specific models that don't have family function - if (any(c.f %in% c("lme", "plm"))) { - fitfam <- "" - logit_link <- FALSE - } else { - fitfam <- stats::family(fit)$family - logit_link <- stats::family(fit)$link == "logit" - } - - # create logical for family - binom_fam <- fitfam %in% c("binomial", "quasibinomial") - poisson_fam <- fitfam %in% c("poisson", "quasipoisson") || - sjmisc::str_contains(fitfam, "negative binomial", ignore.case = TRUE) - - list(is_bin = binom_fam, is_pois = poisson_fam, is_logit = logit_link) -} - -# return names of objects passed as ellipses argument -dot_names <- function(dots) unname(unlist(lapply(dots, as.character))) - - -#' @importFrom tidyr nest -#' @importFrom dplyr select filter group_vars -#' @importFrom stats complete.cases -#' @importFrom rlang .data -get_grouped_data <- function(x) { - # retain observations that are complete wrt grouping vars, then nest - grps <- x %>% - dplyr::group_modify(~ dplyr::filter(.x, stats::complete.cases(.y))) %>% - tidyr::nest() - - # arrange data - if (length(dplyr::group_vars(x)) == 1) - reihe <- order(grps[[1]]) - else - reihe <- order(grps[[1]], grps[[2]]) - grps <- grps[reihe, ] - - grps -} - - .compact_character <- function(x) { - x[!sapply(x, function(i) is.null(i) || nchar(i) == 0 || is.na(i) || any(i == "NULL", na.rm = TRUE))] + x[!sapply(x, function(i) is.null(i) || !nzchar(i, keepNA = TRUE) || is.na(i) || any(i == "NULL", na.rm = TRUE))] } @@ -106,3 +54,60 @@ get_grouped_data <- function(x) { ) l10n_info()[["UTF-8"]] && ((win_os && getRversion() >= "4.2") || (!win_os && getRversion() >= "4.0")) } + + +.is_pseudo_numeric <- function(x) { + (is.character(x) && !anyNA(suppressWarnings(as.numeric(stats::na.omit(x[nzchar(x, keepNA = TRUE)]))))) || (is.factor(x) && !anyNA(suppressWarnings(as.numeric(levels(x))))) # nolint +} + + +.misspelled_string <- function(source, searchterm, default_message = NULL) { + if (is.null(searchterm) || length(searchterm) < 1) { + return(default_message) + } + # used for many matches + more_found <- "" + # init default + msg <- "" + # remove matching strings + same <- intersect(source, searchterm) + searchterm <- setdiff(searchterm, same) + source <- setdiff(source, same) + # guess the misspelled string + possible_strings <- unlist(lapply(searchterm, function(s) { + 
source[.fuzzy_grep(source, s)] # nolint
+  }), use.names = FALSE)
+  if (length(possible_strings)) {
+    msg <- "Did you mean "
+    if (length(possible_strings) > 1) {
+      # make sure we don't print dozens of alternatives for larger data frames
+      if (length(possible_strings) > 5) {
+        more_found <- sprintf(
+          " We even found %i more possible matches, not shown here.",
+          length(possible_strings) - 5
+        )
+        possible_strings <- possible_strings[1:5]
+      }
+      msg <- paste0(msg, "one of ", toString(paste0("\"", possible_strings, "\"")))
+    } else {
+      msg <- paste0(msg, "\"", possible_strings, "\"")
+    }
+    msg <- paste0(msg, "?", more_found)
+  } else {
+    msg <- default_message
+  }
+  # no double white space
+  insight::trim_ws(msg)
+}
+
+
+.fuzzy_grep <- function(x, pattern, precision = NULL) {
+  if (is.null(precision)) {
+    precision <- round(nchar(pattern) / 3)
+  }
+  if (precision > nchar(pattern)) {
+    return(NULL)
+  }
+  # TRE approximate matching: "(pattern){~n}" matches with up to n errors
+  # (insertions, deletions or substitutions)
+  p <- sprintf("(%s){~%i}", pattern, precision)
+  grep(pattern = p, x = x, ignore.case = FALSE)
+}
diff --git a/R/inequ_trends.R b/R/inequ_trends.R
index e4415375..3b94a669 100644
--- a/R/inequ_trends.R
+++ b/R/inequ_trends.R
@@ -26,7 +26,7 @@
 #' in changes in rate differences and rate ratios. The function implements
 #' the algorithm proposed by \emph{Mackenbach et al. 2015}.
 #'
-#' @examples
+#' @examplesIf requireNamespace("ggplot2")
 #' # This example reproduces Fig. 1 of Mackenbach et al. 2015, p.5
 #'
 #' # 40 simulated time points, with an initial rate ratio of 2 and
@@ -47,27 +47,21 @@
 #' prev.data <- data.frame(lo, hi)
 #'
 #' # print values
-#' inequ_trend(prev.data, lo, hi)
+#' inequ_trend(prev.data, "lo", "hi")
 #'
 #' # plot trends - here we see that the relative inequalities
 #' # are increasing over time, while the absolute inequalities
 #' # are first increasing as well, but later are decreasing
 #' # (while rel. inequ. are still increasing)
-#' plot(inequ_trend(prev.data, lo, hi))
+#' plot(inequ_trend(prev.data, "lo", "hi"))
 #'
-#' @importFrom dplyr select
-#' @importFrom rlang quo_name enquo .data
 #' @export
 inequ_trend <- function(data, prev.low, prev.hi) {
   # prepare data for prevalence rates for low and hi status groups
   if (is.null(data) || missing(data)) {
     dat <- data.frame(prev.low, prev.hi)
   } else {
-    # get variable names
-    # create quosures
-    low <- rlang::quo_name(rlang::enquo(prev.low))
-    high <- rlang::quo_name(rlang::enquo(prev.hi))
-    dat <- dplyr::select(data, !! low, !! high)
+    dat <- data[c(prev.low, prev.hi)]
   }
 
   # ensure common column names
@@ -96,7 +90,7 @@ inequ_trend <- function(data, prev.low, prev.hi) {
   for (t in 2:nrow(dat)) {
     delta.low <- (dat$lo[t] - dat$lo[t - 1]) / dat$lo[t - 1]
     delta.hi <- (dat$hi[t] - dat$hi[t - 1]) / dat$hi[t - 1]
-    dat$rd[t] <- dat$rd[t - 1] + (dat$lo[t - 1 ] * delta.low - dat$hi[t - 1] * delta.hi)
+    dat$rd[t] <- dat$rd[t - 1] + (dat$lo[t - 1] * delta.low - dat$hi[t - 1] * delta.hi)
   }
 
   # return
diff --git a/R/is_prime.R b/R/is_prime.R
index 213fcbb5..f5b5ecef 100644
--- a/R/is_prime.R
+++ b/R/is_prime.R
@@ -2,23 +2,21 @@
 #' @name is_prime
 #'
 #' @description This function checks whether a number is, or the numbers in a
-#'    vector are prime numbers.
+#'   vector are, prime numbers.
 #'
 #' @param x An integer, or a vector of integers.
 #'
-#' @return \code{TRUE} for each prime number in \code{x}, \code{FALSE} otherwise.
+#' @return `TRUE` for each prime number in `x`, `FALSE` otherwise.
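+#'
+#' @details Primality is checked by trial division, as sketched in the code
+#'   below: an integer p is prime if p equals 2, or if no integer from 2 to
+#'   floor(sqrt(p)) divides it. For example, 89 is prime because none of
+#'   2, 3, ..., 9 divide it.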
 #'
 #' @examples
 #' is_prime(89)
 #' is_prime(15)
 #' is_prime(c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
 #'
-#' @importFrom purrr map_lgl
-#' @importFrom sjmisc is_float
 #' @export
 is_prime <- function(x) {
-  if (sjmisc::is_float(x))
-    stop("`x` needs to be an integer value.", call. = F)
-
-  purrr::map_lgl(x, ~ .x == 2L || all(.x %% 2L:max(2, floor(sqrt(.x))) != 0))
+  if (is.numeric(x) && !all(x %% 1 == 0, na.rm = TRUE)) {
+    insight::format_error("`x` needs to be an integer value.")
+  }
+  # values below 2 (including 1) are not prime by definition
+  vapply(x, function(.x) .x >= 2L && (.x == 2L || all(.x %% 2L:max(2, floor(sqrt(.x))) != 0)), logical(1))
 }
diff --git a/R/kruskal_wallis_test.R b/R/kruskal_wallis_test.R
index a66e3c15..24bc2cea 100644
--- a/R/kruskal_wallis_test.R
+++ b/R/kruskal_wallis_test.R
@@ -2,12 +2,24 @@
 #' @name kruskal_wallis_test
 #' @description This function performs a Kruskal-Wallis rank sum test, to test
 #' the null hypothesis that the population medians of all groups are
-#' equal. The alternative is that they differ in at least one.
+#' equal. The alternative is that at least one group differs. Unlike the
+#' underlying base R function `kruskal.test()`, this function allows for
+#' weighted tests.
 #'
 #' @inheritParams mann_whitney_test
+#' @inherit mann_whitney_test seealso
 #'
 #' @return A data frame with test results.
 #'
+#' @inheritSection mann_whitney_test Which test to use
+#'
+#' @references
+#' - Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests.
+#'   Dtsch Med Wochenschr 2007; 132: e24–e25
+#'
+#' - du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer
+#'   Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8
+#'
 #' @details The function is simply a wrapper around [`kruskal.test()`]. The
 #' weighted version of the Kruskal-Wallis test is based on the **survey** package,
 #' using [`survey::svyranktest()`].
@@ -33,6 +45,8 @@
 #'   groups = rep(c("A", "B", "C"), each = 20)
 #' )
 #' kruskal_wallis_test(long_data, select = "scales", by = "groups")
+#' # base R equivalent
+#' kruskal.test(scales ~ groups, data = long_data)
 #' @export
 kruskal_wallis_test <- function(data,
                                 select = NULL,
                                 by = NULL,
                                 weights = NULL) {
@@ -41,13 +55,10 @@ kruskal_wallis_test <- function(data,
   insight::check_if_installed("datawizard")
 
   # sanity checks
-  .sanitize_htest_input(data, select, by, weights)
+  .sanitize_htest_input(data, select, by, weights, test = "kruskal_wallis_test")
 
   # does select indicate more than one variable?
   if (length(select) > 1) {
-    if (!is.null(by)) {
-      insight::format_error("If `select` specifies more than one variable, `by` must be `NULL`.")
-    }
     # we convert the data into long format, and create a grouping variable
     data <- datawizard::data_to_long(data[select], names_to = "group", values_to = "scale")
     by <- select[2]
@@ -68,16 +79,16 @@
     insight::format_error("At least two groups are required, i.e.
data must have at least two unique levels in `by` for `kruskal_wallis_test()`.") # nolint
   }
 
   if (is.null(weights)) {
-    .calculate_kw(dv, grp)
+    .calculate_kw(dv, grp, group_labels = c(select, by))
   } else {
-    .calculate_weighted_kw(dv, grp, data[[weights]])
+    .calculate_weighted_kw(dv, grp, data[[weights]], group_labels = c(select, by))
   }
 }
 
 
 # Kruskal-Wallis-Test --------------------------------------------
 
-.calculate_kw <- function(dv, grp, paired = FALSE) {
+.calculate_kw <- function(dv, grp, paired = FALSE, group_labels = NULL) {
   # prepare data
   wcdat <- data.frame(dv, grp)
   if (paired) {
@@ -95,7 +106,7 @@ kruskal_wallis_test <- function(data,
   )
 
   out <- data.frame(
-    data = wt$data.name,
+    data = paste(group_labels[1], "by", group_labels[2]),
     Chi2 = wt$statistic,
     df = wt$parameter,
     p = as.numeric(wt$p.value),
@@ -113,7 +124,7 @@
 
 # Weighted Kruskal-Wallis test (via survey) -----------------------------------
 
-.calculate_weighted_kw <- function(dv, grp, weights, paired = FALSE) {
+.calculate_weighted_kw <- function(dv, grp, weights, paired = FALSE, group_labels = NULL) {
   # check if pkg survey is available
   insight::check_if_installed("survey")
 
@@ -133,7 +144,7 @@
   }
 
   out <- data.frame(
-    data = paste(dv, "by", grp),
+    data = paste(group_labels[1], "by", group_labels[2]),
     Chi2 = result$statistic,
     df = result$parameter,
     p = as.numeric(result$p.value),
@@ -182,7 +193,7 @@ print.sj_htest_kw <- function(x, ...) {
   stat_symbol <- .format_symbols("Chi2")
   cat(sprintf(
-    "\n %s = %.3f, df = %i, %s\n\n",
+    "\n %s = %.2f, df = %i, %s\n\n",
     stat_symbol, x$Chi2, as.integer(round(x$df)), insight::format_p(x$p)
   ))
 }
diff --git a/R/mann_whitney_test.R b/R/mann_whitney_test.R
index d16f1788..7874f5a6 100644
--- a/R/mann_whitney_test.R
+++ b/R/mann_whitney_test.R
@@ -1,29 +1,98 @@
-#' @title Mann-Whitney-Test
+#' @title Mann-Whitney test
 #' @name mann_whitney_test
-#' @description This function performs a Mann-Whitney-Test (or Wilcoxon rank
-#'   sum test for _unpaired_ samples.
+#' @description This function performs a Mann-Whitney test (or Wilcoxon rank
+#'   sum test for _unpaired_ samples). Unlike the underlying base R function
+#'   `wilcox.test()`, this function allows for weighted tests and automatically
+#'   calculates effect sizes. For _paired_ (dependent) samples, or for one-sample
+#'   tests, please use the `wilcoxon_test()` function.
 #'
-#'   A Mann-Whitney-Test is a non-parametric test for the null hypothesis that two
-#'   independent samples have identical continuous distributions. It can be used
-#'   when the two continuous variables are not normally distributed.
+#'   A Mann-Whitney test is a non-parametric test for the null hypothesis that two
+#'   _independent_ samples have identical continuous distributions. It can be used
+#'   for ordinal scales or when the two continuous variables are not normally
+#'   distributed. For large samples, or approximately normally distributed variables,
+#'   the `t_test()` function can be used.
 #'
 #' @param data A data frame.
-#' @param select Name of the dependent variable (as string) to be used for the
-#'   test. `select` can also be a character vector, specifying the names of
-#'   multiple continuous variables. In this case, `by` is ignored and variables
-#'   specified in `select` are used to compute the test. This can be useful if
-#'   the data is in wide-format and no grouping variable is available.
-#' @param by Name of the grouping variable to be used for the test.
If `by` is
-#'   not a factor, it will be coerced to a factor. For `chi_squared_test()`, if
-#'   `probabilities` is provided, `by` must be `NULL`.
+#' @param select Name(s) of the continuous variable(s) (as character vector)
+#'   to be used as samples for the test. `select` can be one of the following:
+#'
+#'   - `select` can be used in combination with `by`, in which case `select` is
+#'     the name of the continuous variable (and `by` indicates a grouping factor).
+#'   - `select` can also be a character vector of length two or more (more than
+#'     two names only apply to `kruskal_wallis_test()`), in which case the two
+#'     continuous variables are treated as samples to be compared. `by` must be
+#'     `NULL` in this case.
+#'   - If `select` is of length **two** and `paired = TRUE`, the two samples
+#'     are considered as *dependent* and a paired test is carried out.
+#'   - If `select` specifies **one** variable and `by = NULL`, a one-sample test
+#'     is carried out (only applicable for `t_test()` and `wilcoxon_test()`).
+#' @param by Name of the variable indicating the groups. Required if `select`
+#'   specifies only one variable that contains all samples to be compared in the
+#'   test. If `by` is not a factor, it will be coerced to a factor. For
+#'   `chi_squared_test()`, if `probabilities` is provided, `by` must be `NULL`.
 #' @param weights Name of an (optional) weighting variable to be used for the test.
-#' @param distribution Indicates how the null distribution of the test statistic
-#'   should be computed. May be one of `"exact"`, `"approximate"` or `"asymptotic"`
-#'   (default). See [`coin::wilcox_test()`] for details.
+#' @param alternative A character string specifying the alternative hypothesis,
+#'   must be one of `"two.sided"` (default), `"greater"` or `"less"`. See `?t.test`
+#'   and `?wilcox.test`.
+#' @param mu The hypothesized difference in means (for `t_test()`) or location
+#'   shift (for `mann_whitney_test()`). The default is 0.
+#' @param ... Additional arguments passed to `wilcox.test()` (for unweighted
+#'   tests, i.e. when `weights = NULL`).
+#'
+#' @section Which test to use:
+#' The following table provides an overview of which test to use for different
+#' types of data. The choice of test depends on the scale of the outcome
+#' variable and the number of samples to compare.
+#'
+#' | **Samples**     | **Scale of Outcome**   | **Significance Test**           |
+#' |-----------------|------------------------|---------------------------------|
+#' | 1               | binary / nominal       | `chi_squared_test()`            |
+#' | 1               | continuous, not normal | `wilcoxon_test()`               |
+#' | 1               | continuous, normal     | `t_test()`                      |
+#' | 2, independent  | binary / nominal       | `chi_squared_test()`            |
+#' | 2, independent  | continuous, not normal | `mann_whitney_test()`           |
+#' | 2, independent  | continuous, normal     | `t_test()`                      |
+#' | 2, dependent    | binary (only 2x2)      | `chi_squared_test(paired=TRUE)` |
+#' | 2, dependent    | continuous, not normal | `wilcoxon_test()`               |
+#' | 2, dependent    | continuous, normal     | `t_test(paired=TRUE)`           |
+#' | >2, independent | continuous, not normal | `kruskal_wallis_test()`         |
+#' | >2, independent | continuous, normal     | `datawizard::means_by_group()`  |
+#' | >2, dependent   | continuous, not normal | _not yet implemented_ (1)       |
+#' | >2, dependent   | continuous, normal     | _not yet implemented_ (2)       |
+#'
+#' (1) More than two dependent samples are considered as _repeated measurements_.
+#'   For ordinal or non-normally distributed outcomes, these samples are
+#'   usually tested using [`friedman.test()`], which requires the samples
+#'   in one variable, the groups to compare in another variable, and a third
+#'   variable indicating the repeated measurements (subject IDs) - see the
+#'   sketch below.
+#'
+#' (2) More than two dependent samples are considered as _repeated measurements_.
+#'   For normally distributed outcomes, these samples are usually tested using
+#'   an ANOVA for repeated measurements. A more sophisticated approach would
+#'   be using a linear mixed model.
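+#'
+#' As a minimal sketch of case (1) (illustration only; `outcome`, `treatment`
+#' and `subject` are hypothetical column names in a long-format data frame `d`):
+#'
+#' ```
+#' # Friedman test: outcome by treatment, blocked by subject (repeated measures)
+#' friedman.test(outcome ~ treatment | subject, data = d)
+#' ```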
+#'
+#' @seealso
+#' - [`mann_whitney_test()`] for unpaired (independent) samples.
+#' - [`t_test()`] for parametric t-tests.
+#' - [`kruskal_wallis_test()`] for non-parametric ANOVA (i.e. more than two samples).
+#' - [`wilcoxon_test()`] for Wilcoxon signed rank tests for paired (dependent) samples.
+#' - [`chi_squared_test()`] for chi-squared tests (two categorical variables).
 #'
 #' @return A data frame with test results. The function returns p and Z-values
 #'   as well as effect size r and group-rank-means.
 #'
+#' @references
+#' - Ben-Shachar, M.S., Patil, I., Thériault, R., Wiernik, B.M.,
+#'   Lüdecke, D. (2023). Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data
+#'   That Use the Chi-Squared Statistic. Mathematics, 11, 1982.
+#'   \doi{10.3390/math11091982}
+#'
+#' - Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests.
+#'   Dtsch Med Wochenschr 2007; 132: e24–e25
+#'
+#' - du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer
+#'   Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8
+#'
 #' @details This function is based on [`wilcox.test()`] and [`coin::wilcox_test()`]
 #'   (the latter to extract effect sizes). The weighted version of the test is
 #'   based on [`survey::svyranktest()`].
@@ -34,49 +103,51 @@
 #'   - medium effect >= 0.3
 #'   - large effect >= 0.5
 #'
-#' **r** is calcuated as:
-#'
-#' ```
-#' r = |Z| / sqrt(n1 + n2)
-#' ```
+#' **r** is calculated as \eqn{r = \frac{|Z|}{\sqrt{n1 + n2}}}.
 #'
-#' @examples
+#' @examplesIf requireNamespace("coin") && requireNamespace("survey")
 #' data(efc)
-#' # Mann-Whitney-U-Tests for elder's age by elder's sex.
+#' # Mann-Whitney-U tests for elder's age by elder's sex.
 #' mann_whitney_test(efc, "e17age", by = "e16sex")
+#' # base R equivalent
+#' wilcox.test(e17age ~ e16sex, data = efc)
 #'
 #' # when data is in wide-format, specify all relevant continuous
 #' # variables in `select` and omit `by`
 #' set.seed(123)
 #' wide_data <- data.frame(scale1 = runif(20), scale2 = runif(20))
 #' mann_whitney_test(wide_data, select = c("scale1", "scale2"))
-#'
+#' # base R equivalent
+#' wilcox.test(wide_data$scale1, wide_data$scale2)
 #' # same as if we had data in long format, with grouping variable
 #' long_data <- data.frame(
 #'   scales = c(wide_data$scale1, wide_data$scale2),
-#'   groups = rep(c("A", "B"), each = 20)
+#'   groups = as.factor(rep(c("A", "B"), each = 20))
 #' )
 #' mann_whitney_test(long_data, select = "scales", by = "groups")
+#' # base R equivalent
+#' wilcox.test(scales ~ groups, long_data)
 #' @export
 mann_whitney_test <- function(data,
                               select = NULL,
                               by = NULL,
                               weights = NULL,
-                              distribution = "asymptotic") {
+                              mu = 0,
+                              alternative = "two.sided",
+                              ...)
{
   insight::check_if_installed("datawizard")
+  alternative <- match.arg(alternative, choices = c("two.sided", "less", "greater"))
 
   # sanity checks
-  .sanitize_htest_input(data, select, by, weights)
+  .sanitize_htest_input(data, select, by, weights, test = "mann_whitney_test")
+
+  # alternative only if weights are NULL
+  if (!is.null(weights) && alternative != "two.sided") {
+    insight::format_error("Argument `alternative` must be `two.sided` if `weights` are specified.")
+  }
 
   # does select indicate more than one variable?
   if (length(select) > 1) {
-    # sanity check - may only specify two variable names
-    if (length(select) > 2) {
-      insight::format_error("You may only specify two variables for Mann-Whitney test.")
-    }
-    if (!is.null(by)) {
-      insight::format_error("If `select` specifies more than one variable, `by` must be `NULL`.")
-    }
     # we convert the data into long format, and create a grouping variable
     data <- datawizard::data_to_long(data[select], names_to = "group", values_to = "scale")
     by <- select[2]
@@ -104,7 +175,7 @@ mann_whitney_test <- function(data,
   }
 
   if (is.null(weights)) {
-    .calculate_mwu(dv, grp, distribution, group_labels)
+    .calculate_mwu(dv, grp, alternative, mu, group_labels, ...)
   } else {
     .calculate_weighted_mwu(dv, grp, data[[weights]], group_labels)
   }
 }
 
 
 # Mann-Whitney-Test for two groups --------------------------------------------
 
-.calculate_mwu <- function(dv, grp, distribution, group_labels) {
+.calculate_mwu <- function(dv, grp, alternative, mu, group_labels, ...) {
   insight::check_if_installed("coin")
   # prepare data
   wcdat <- data.frame(dv, grp)
   # perform Wilcoxon rank sum test (the coin version, to extract Z later)
-  wt <- coin::wilcox_test(dv ~ grp, data = wcdat, distribution = distribution)
+  wt <- coin::wilcox_test(dv ~ grp, data = wcdat)
 
   # for rank mean
   group_levels <- levels(grp)
@@ -126,9 +197,16 @@
   # compute statistics
   u <- as.numeric(coin::statistic(wt, type = "linear"))
   z <- as.numeric(coin::statistic(wt, type = "standardized"))
-  p <- coin::pvalue(wt)
   r <- abs(z / sqrt(length(dv)))
-  w <- suppressWarnings(stats::wilcox.test(dv ~ grp, data = wcdat)$statistic)
+  htest <- suppressWarnings(stats::wilcox.test(
+    dv ~ grp,
+    data = wcdat,
+    alternative = alternative,
+    mu = mu,
+    ...
+ )) + w <- htest$statistic + p <- htest$p.value # group means dat_gr1 <- stats::na.omit(dv[grp == group_levels[1]]) @@ -149,7 +227,9 @@ mann_whitney_test <- function(data, w = w, z = z, r = r, - p = as.numeric(p) + p = as.numeric(p), + mu = mu, + alternative = alternative ) attr(out, "rank_means") <- stats::setNames( c(rank_mean_1, rank_mean_2), @@ -211,13 +291,14 @@ mann_whitney_test <- function(data, z <- result$statistic r <- abs(z / sqrt(sum(n_grp1, n_grp2))) - out <- data.frame( + out <- data_frame( group1 = group_levels[1], group2 = group_levels[2], estimate = result$estimate, z = z, r = r, - p = as.numeric(result$p.value) + p = as.numeric(result$p.value), + alternative = "two.sided" ) attr(out, "rank_means") <- stats::setNames( @@ -238,66 +319,25 @@ mann_whitney_test <- function(data, # helper ---------------------------------------------------------------------- -.misspelled_string <- function(source, searchterm, default_message = NULL) { - if (is.null(searchterm) || length(searchterm) < 1) { - return(default_message) - } - # used for many matches - more_found <- "" - # init default - msg <- "" - # remove matching strings - same <- intersect(source, searchterm) - searchterm <- setdiff(searchterm, same) - source <- setdiff(source, same) - # guess the misspelled string - possible_strings <- unlist(lapply(searchterm, function(s) { - source[.fuzzy_grep(source, s)] # nolint - }), use.names = FALSE) - if (length(possible_strings)) { - msg <- "Did you mean " - if (length(possible_strings) > 1) { - # make sure we don't print dozens of alternatives for larger data frames - if (length(possible_strings) > 5) { - more_found <- sprintf( - " We even found %i more possible matches, not shown here.", - length(possible_strings) - 5 - ) - possible_strings <- possible_strings[1:5] - } - msg <- paste0(msg, "one of ", toString(paste0("\"", possible_strings, "\""))) - } else { - msg <- paste0(msg, "\"", possible_strings, "\"") - } - msg <- paste0(msg, "?", more_found) - } else { - msg <- default_message +.sanitize_htest_input <- function(data, select, by, weights, test = NULL) { + # check if arguments are NULL + if (is.null(select)) { + insight::format_error("Argument `select` is missing.") } - # no double white space - insight::trim_ws(msg) -} - - -.fuzzy_grep <- function(x, pattern, precision = NULL) { - if (is.null(precision)) { - precision <- round(nchar(pattern) / 3) + # sanity check - may only specify two variable names + if (identical(test, "mann_whitney_test") && length(select) > 2) { + insight::format_error("You may only specify two variables for Mann-Whitney test.") } - if (precision > nchar(pattern)) { - return(NULL) + if (identical(test, "mann_whitney_test") && length(select) == 1 && is.null(by)) { + insight::format_error("Only one variable provided in `select`, but none in `by`. 
You need to specify a second continuous variable in `select`, or a grouping variable in `by` for Mann-Whitney test.") # nolint
+  }
-  p <- sprintf("(%s){~%i}", pattern, precision)
-  grep(pattern = p, x = x, ignore.case = FALSE)
-}
-
-.sanitize_htest_input <- function(data, select, by, weights) {
-  # check if arguments are NULL
-  if (is.null(select)) {
-    insight::format_error("Argument `select` is missing.")
+  # sanity check - may only specify two variable names
+  if (identical(test, "t_test") && length(select) > 2) {
+    insight::format_error("You may only specify two variables for Student's t test.")
   }
-  # `by` is only allowed to be NULL if `select` specifies more than one variable
-  if (is.null(by) && length(select) == 1) {
-    insight::format_error("Arguments `by` is missing.")
+  if ((!is.null(test) && test %in% c("t_test", "kruskal_wallis_test", "mann_whitney_test")) && length(select) > 1 && !is.null(by)) { # nolint
+    insight::format_error("If `select` specifies more than one variable, `by` must be `NULL`.")
   }
 
   # check if arguments have correct length or are of correct type
@@ -307,7 +347,7 @@ mann_whitney_test <- function(data,
   if (!is.null(by) && (length(by) != 1 || !is.character(by))) {
     insight::format_error("Argument `by` must be a character string with the name of a single variable.")
   }
-  if (!is.null(weights) && length(weights) != 1) {
+  if (!is.null(weights) && (length(weights) != 1 || !is.character(weights))) {
    insight::format_error("Argument `weights` must be a character string with the name of a single variable.")
   }
 
@@ -374,5 +414,21 @@ print.sj_htest_mwu <- function(x, ...) {
     ), "cyan"
   )
-  cat(sprintf("\n r = %.3f, Z = %.3f, %s\n\n", x$r, x$z, insight::format_p(x$p)))
+  # alternative hypothesis
+  if (!is.null(x$alternative) && !is.null(x$mu)) {
+    alt_string <- switch(x$alternative,
+      two.sided = "not equal to",
+      less = "less than",
+      greater = "greater than"
+    )
+    alt_string <- paste("true location shift is", alt_string, x$mu)
+    insight::print_color(sprintf(" Alternative hypothesis: %s\n", alt_string), "cyan")
+  }
+
+  if (!is.null(x$w)) {
+    w_stat <- paste0("W = ", insight::format_value(x$w, protect_integers = TRUE), ", ")
+  } else {
+    w_stat <- ""
+  }
+  cat(sprintf("\n %sr = %.2f, Z = %.2f, %s\n\n", w_stat, x$r, x$z, insight::format_p(x$p)))
 }
diff --git a/R/phi.R b/R/phi.R
index 48a07f1e..eb57406c 100644
--- a/R/phi.R
+++ b/R/phi.R
@@ -19,19 +19,19 @@ phi.ftable <- function(tab, ...) {
 
 #' @export
 phi.formula <- function(formula, data, ci.lvl = NULL, n = 1000, method = c("dist", "quantile"), ...)
{ - terms <- all.vars(formula) - tab <- table(data[[terms[1]]], data[[terms[2]]]) + formula_terms <- all.vars(formula) + tab <- table(data[[formula_terms[1]]], data[[formula_terms[2]]]) method <- match.arg(method) if (is.null(ci.lvl) || is.na(ci.lvl)) { - .cramer(tab) + .cramers_v(tab) } else { - straps <- sjstats::bootstrap(data[terms], n) + straps <- sjstats::bootstrap(data[formula_terms], n) tables <- lapply(straps$strap, function(x) { dat <- as.data.frame(x) table(dat[[1]], dat[[2]]) }) - phis <- sapply(tables, function(x) .phi(x)) + phis <- sapply(tables, .phi) ci <- boot_ci(phis, ci.lvl = ci.lvl, method = method) data_frame( @@ -44,6 +44,7 @@ phi.formula <- function(formula, data, ci.lvl = NULL, n = 1000, method = c("dist .phi <- function(tab) { + insight::check_if_installed("MASS") # convert to flat table if (!inherits(tab, "ftable")) tab <- stats::ftable(tab) diff --git a/R/prop.R b/R/prop.R index 167b3cdd..6c980541 100644 --- a/R/prop.R +++ b/R/prop.R @@ -35,7 +35,7 @@ #' returns a data frame with one column per group with grouping categories, #' followed by one column with proportions per condition. #' -#' @examples +#' @examplesIf getRversion() >= "4.2.0" && requireNamespace("datawizard", quietly = TRUE) #' data(efc) #' #' # proportion of value 1 in e42dep @@ -52,10 +52,10 @@ #' prop(efc, e42dep == 1, e42dep > 2, na.rm = FALSE) #' #' # for factors or character vectors, use quoted or unquoted values -#' library(sjmisc) +#' library(datawizard) #' # convert numeric to factor, using labels as factor levels -#' efc$e16sex <- to_label(efc$e16sex) -#' efc$n4pstu <- to_label(efc$n4pstu) +#' efc$e16sex <- to_factor(efc$e16sex) +#' efc$n4pstu <- to_factor(efc$n4pstu) #' #' # get proportion of female older persons #' prop(efc, e16sex == female) @@ -70,40 +70,16 @@ #' ) #' #' # also works with pipe-chains -#' library(dplyr) -#' efc %>% prop(e17age > 70) -#' efc %>% prop(e17age > 70, e16sex == 1) -#' -#' # and with group_by -#' efc %>% -#' group_by(e16sex) %>% -#' prop(e42dep > 2) -#' -#' efc %>% -#' select(e42dep, c161sex, c172code, e16sex) %>% -#' group_by(c161sex, c172code) %>% -#' prop(e42dep > 2, e16sex == 1) -#' -#' # same for "props()" -#' efc %>% -#' select(e42dep, c161sex, c172code, c12hour, n4pstu) %>% -#' group_by(c161sex, c172code) %>% -#' props( -#' e42dep > 2, -#' c12hour > 20 & c12hour < 40, -#' n4pstu == 'Care Level 1' | n4pstu == 'Care Level 3' -#' ) +#' efc |> prop(e17age > 70) +#' efc |> prop(e17age > 70, e16sex == 1) #' @export prop <- function(data, ..., weights = NULL, na.rm = TRUE, digits = 4) { # check argument if (!is.data.frame(data)) { insight::format_error("`data` needs to be a data frame.") } - - # get dots - dots <- match.call(expand.dots = FALSE)$`...` - - proportions(data, dots, weight.by = weights, na.rm, digits, multi_logical = FALSE) + dots <- match.call(expand.dots = FALSE)[["..."]] + .proportions(data, dots = dots, weight.by = weights, na.rm, digits, multi_logical = FALSE) } @@ -114,16 +90,12 @@ props <- function(data, ..., na.rm = TRUE, digits = 4) { if (!is.data.frame(data)) { insight::format_error("`data` needs to be a data frame.") } - - # get dots - dots <- match.call(expand.dots = FALSE)$`...` - - proportions(data, dots, NULL, na.rm, digits, multi_logical = TRUE) + dots <- match.call(expand.dots = FALSE)[["..."]] + .proportions(data, dots = dots, NULL, na.rm, digits, multi_logical = TRUE) } -#' @importFrom purrr map_df -proportions <- function(data, dots, weight.by, na.rm, digits, multi_logical) { +.proportions <- function(data, dots, weight.by, 
na.rm, digits, multi_logical) { # remember comparisons comparisons <- lapply(dots, function(x) { # to character, and remove spaces and quotes @@ -132,86 +104,35 @@ proportions <- function(data, dots, weight.by, na.rm, digits, multi_logical) { x }) - # do we have a grouped data frame? if (inherits(data, "grouped_df")) { - - # remember order of values - reihenfolge <- NULL - - # get grouped data - grps <- get_grouped_data(data) - - # now get proportions for each subset - fr <- purrr::map_df( - seq_len(nrow(grps)), - function(i) { - # get data from grouped data frame - .d <- grps$data[[i]] - - # iterate dots (comparing conditions) - if (multi_logical) - result <- lapply(dots, get_multiple_proportion, .d, na.rm, digits) - else - result <- lapply(dots, get_proportion, .d, weight.by, na.rm, digits) - - as.data.frame(t(unlist(result))) - } - ) - - - # now we need the values from the groups of the grouped data frame - for (i in (ncol(grps) - 1):1) { - # get value label - var.name <- colnames(grps)[i] - val.labels <- suppressWarnings( - rep(sjlabelled::get_labels(data[[var.name]]), length.out = nrow(fr)) - ) - - # if we have no value labels, use values instead - if (is.null(val.labels)) { - val.labels <- - rep(unique(sort(data[[var.name]])), length.out = nrow(fr)) - } - - # add row order, based on values of grouping variables - reihenfolge <- rep(sort(unique(sjlabelled::as_numeric(data[[var.name]]))), length.out = nrow(fr)) %>% - as.data.frame() %>% - dplyr::bind_cols(reihenfolge) - - # bind values as column - fr <- dplyr::bind_cols(data.frame(val.labels, stringsAsFactors = FALSE), fr) - } - - # get column names. we need variable labels as column names - var.names <- colnames(grps)[seq_len(ncol(grps) - 1)] - var.labels <- sjlabelled::get_label(data[, var.names], def.value = var.names) - - # set variable labels and comparisons as colum names - colnames(fr) <- c(var.labels, comparisons) - - # order rows by values of grouping variables - fr <- fr[do.call(order, reihenfolge), ] - - fr - + grps <- attributes(data)$groups + result <- lapply(grps[[".rows"]], function(x) { + .process_prop(data[x, , drop = FALSE], comparisons, dots, multi_logical, na.rm, digits, weight.by) + }) } else { - # iterate dots (comparing conditions) - if (multi_logical) - result <- lapply(dots, get_multiple_proportion, data, na.rm, digits) - else - result <- lapply(dots, get_proportion, data, weight.by, na.rm, digits) + result <- .process_prop(data, comparisons, dots, multi_logical, na.rm, digits, weight.by) + } + result +} - # if we have more than one proportion, return a data frame. this allows us - # to save more information, the condition and the proportion value - if (length(comparisons) > 1) { - return(data_frame( - condition = as.character(unlist(comparisons)), - prop = unlist(result) - )) - } - unlist(result) +.process_prop <- function(data, comparisons, dots, multi_logical, na.rm, digits, weight.by) { + # iterate dots (comparing conditions) + if (multi_logical) + result <- lapply(dots, get_multiple_proportion, data, na.rm, digits) + else + result <- lapply(dots, get_proportion, data, weight.by, na.rm, digits) + + # if we have more than one proportion, return a data frame. 
this allows us + # to save more information, the condition and the proportion value + if (length(comparisons) > 1) { + return(data_frame( + condition = as.character(unlist(comparisons)), + prop = unlist(result) + )) } + + unlist(result) } @@ -241,19 +162,16 @@ get_proportion <- function(x, data, weight.by, na.rm, digits) { if (!is.null(weight.by)) f <- weight(f, weights = weight.by) # get proportions - if (x.parts[2] == "==") - dummy <- f == v - else if (x.parts[2] == "!=") - dummy <- f != v - else if (x.parts[2] == "<") - dummy <- f < v - else if (x.parts[2] == ">") - dummy <- f > v - else - dummy <- f == v + dummy <- switch(x.parts[2], + "==" = f == v, + "!=" = f != v, + "<" = f < v, + ">" = f > v, + f == v + ) # remove missings? - if (na.rm) dummy <- na.omit(dummy) + if (na.rm) dummy <- stats::na.omit(dummy) # get proportion round(sum(dummy, na.rm = TRUE) / length(dummy), digits = digits) @@ -265,7 +183,7 @@ get_multiple_proportion <- function(x, data, na.rm, digits) { dummy <- with(data, eval(parse(text = deparse(x)))) # remove missings? - if (na.rm) dummy <- na.omit(dummy) + if (na.rm) dummy <- stats::na.omit(dummy) # get proportion round(sum(dummy, na.rm = TRUE) / length(dummy), digits = digits) diff --git a/R/re-exports.R b/R/re-exports.R index f16d9fb6..1b5605ae 100644 --- a/R/re-exports.R +++ b/R/re-exports.R @@ -1,12 +1,3 @@ -#' @importFrom magrittr %>% -#' @export -magrittr::`%>%` - - -#' @importFrom sjmisc typical_value -#' @export -sjmisc::typical_value - #' @importFrom performance mse #' @export performance::mse @@ -26,3 +17,15 @@ bayestestR::equivalence_test #' @importFrom insight link_inverse #' @export insight::link_inverse + +#' @importFrom datawizard weighted_sd +#' @export +datawizard::weighted_sd + +#' @importFrom datawizard weighted_mean +#' @export +datawizard::weighted_mean + +#' @importFrom datawizard weighted_median +#' @export +datawizard::weighted_median diff --git a/R/samplesize_mixed.R b/R/samplesize_mixed.R index 66f170cf..42521118 100644 --- a/R/samplesize_mixed.R +++ b/R/samplesize_mixed.R @@ -36,7 +36,7 @@ #' additionally include repeated measures (three-level-designs) may work #' as well, however, the computed sample size may be less accurate. #' -#' @examples +#' @examplesIf requireNamespace("pwr") #' # Sample size for multilevel model with 30 cluster groups and a small to #' # medium effect size (Cohen's d) of 0.3. 27 subjects per cluster and #' # hence a total sample size of about 802 observations is needed. @@ -47,7 +47,13 @@ #' # hence a total sample size of about 107 observations is needed. #' samplesize_mixed(eff.size = .2, df.n = 5, k = 20, power = .9) #' @export -samplesize_mixed <- function(eff.size, df.n = NULL, power = .8, sig.level = .05, k, n, icc = 0.05) { +samplesize_mixed <- function(eff.size, + df.n = NULL, + power = 0.8, + sig.level = 0.05, + k, + n, + icc = 0.05) { if (!requireNamespace("pwr", quietly = TRUE)) { stop("Package `pwr` needed for this function to work. Please install it.", call. = FALSE) } diff --git a/R/se_ybar.R b/R/se_ybar.R index 0b1bc882..642dc856 100644 --- a/R/se_ybar.R +++ b/R/se_ybar.R @@ -12,14 +12,9 @@ #' #' @references Gelman A, Hill J. 2007. Data analysis using regression and multilevel/hierarchical models. 
Cambridge, New York: Cambridge University Press
 #'
-#' @examples
-#' if (require("lme4")) {
-#'   fit <- lmer(Reaction ~ 1 + (1 | Subject), sleepstudy)
-#'   se_ybar(fit)
-#' }
-#' @importFrom lme4 ngrps
-#' @importFrom stats nobs
-#' @importFrom purrr map_dbl
+#' @examplesIf require("lme4")
+#' fit <- lmer(Reaction ~ 1 + (1 | Subject), sleepstudy)
+#' se_ybar(fit)
 #' @export
 se_ybar <- function(fit) {
   # get model icc
@@ -32,7 +27,7 @@ se_ybar <- function(fit) {
   tot_var <- sum(tau.00, vars$var.residual)
 
   # get number of groups
-  m.cnt <- lme4::ngrps(fit)
+  m.cnt <- vapply(fit@flist, nlevels, integer(1))
 
   # compute number of observations per group (level-2-unit)
   obs <- round(stats::nobs(fit) / m.cnt)
@@ -41,7 +36,9 @@ se_ybar <- function(fit) {
   icc <- tau.00 / tot_var
 
   # compute standard error of sample mean
-  se <- purrr::map_dbl(seq_len(length(m.cnt)), ~ sqrt((tot_var / stats::nobs(fit)) * design_effect(n = obs[.x], icc = icc[.x])))
+  se <- unlist(lapply(seq_along(m.cnt), function(.x) {
+    sqrt((tot_var / stats::nobs(fit)) * design_effect(n = obs[.x], icc = icc[.x]))
+  }))
 
   # give names for se, so user sees, which random effect has what impact
   names(se) <- names(m.cnt)
diff --git a/R/select_helpers.R b/R/select_helpers.R
index de7de9b2..87d6104c 100644
--- a/R/select_helpers.R
+++ b/R/select_helpers.R
@@ -13,9 +13,8 @@ string_ends_with <- function(pattern, x) {
   grep(pattern, x, perl = TRUE)
 }
 
-#' @importFrom purrr map
 string_one_of <- function(pattern, x) {
-  m <- unlist(purrr::map(pattern, ~ grep(., x, fixed = TRUE, useBytes = TRUE)))
+  m <- unlist(lapply(pattern, grep, x = x, fixed = TRUE, useBytes = TRUE))
   x[m]
 }
 
@@ -32,5 +31,5 @@ obj_has_name <- function(x, name) {
 }
 
 obj_has_rownames <- function(x) {
-  !identical(as.character(1:nrow(x)), rownames(x))
+  !identical(as.character(seq_len(nrow(x))), rownames(x))
 }
diff --git a/R/svy_median.R b/R/svy_median.R
index 963aea2c..9a0d68f2 100644
--- a/R/svy_median.R
+++ b/R/svy_median.R
@@ -1,11 +1,8 @@
-#' @rdname weighted_sd
-#' @importFrom stats as.formula
+#' @rdname weighted_se
 #' @export
 survey_median <- function(x, design) {
   # check if pkg survey is available
-  if (!requireNamespace("survey", quietly = TRUE)) {
-    stop("Package `survey` needed to for this function to work. Please install it.", call. = FALSE)
-  }
+  insight::check_if_installed("survey")
 
   # deparse
   v <- stats::as.formula(paste("~", as.character(substitute(x))))
diff --git a/R/svyglmnb.R b/R/svyglmnb.R
index 309b8095..26c84d52 100644
--- a/R/svyglmnb.R
+++ b/R/svyglmnb.R
@@ -65,14 +65,9 @@ utils::globalVariables("scaled.weights")
 #' # print coefficients and standard errors
 #' fit
 #' }
-#' @importFrom MASS glm.nb
-#' @importFrom stats weights update model.frame coef as.formula family
 #' @export
 svyglm.nb <- function(formula, design, ...) {
-  # check if pkg survey is available
-  if (!requireNamespace("survey", quietly = TRUE)) {
-    stop("Package `survey` needed to for this function to work. Please install it.", call. = FALSE)
-  }
+  insight::check_if_installed(c("survey", "MASS"))
 
   # get design weights. we need to scale these weights for the glm.nb() function
   dw <- stats::weights(design)
diff --git a/R/svyglmzip.R b/R/svyglmzip.R
index f9b575b9..ba6dd5f8 100644
--- a/R/svyglmzip.R
+++ b/R/svyglmzip.R
@@ -45,19 +45,9 @@ utils::globalVariables("scaled.weights")
 #' # print coefficients and standard errors
 #' fit
 #' }
-#' @importFrom insight find_formula
-#' @importFrom stats weights update model.frame coef as.formula family
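+#'
+#' @details A note on the implementation (inferred from the function body
+#'   below): the design weights are rescaled to a mean of 1, i.e.
+#'   `scaled.weights = dw / mean(dw)`, so that the rescaled weights sum to the
+#'   number of observations before the zero-inflated model is fitted.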
 #' @export
 svyglm.zip <- function(formula, design, ...)
 {
-  # check if pkg survey is available
-  if (!requireNamespace("survey", quietly = TRUE)) {
-    stop("Package `survey` needed to for this function to work. Please install it.", call. = FALSE)
-  }
-
-  if (!requireNamespace("pscl", quietly = TRUE)) {
-    stop("Package `pscl` needed to for this function to work. Please install it.", call. = FALSE)
-  }
-
+  insight::check_if_installed(c("survey", "pscl"))
 
   # get design weights. we need to scale these weights for the zeroinfl() function
   dw <- stats::weights(design)
@@ -66,7 +56,7 @@ svyglm.zip <- function(formula, design, ...) {
   design <- stats::update(design, scaled.weights = dw / mean(dw, na.rm = TRUE))
 
   # fit ZIP model, with scaled design weights
-  mod <- pscl::zeroinfl(formula, data = stats::model.frame(design), weights = scaled.weights, ...)
+  mod <- suppressWarnings(pscl::zeroinfl(formula, data = stats::model.frame(design), weights = scaled.weights, ...))
   ff <- insight::find_formula(mod)
 
   # fit survey model, using maximum likelihood estimation
@@ -94,7 +84,6 @@ svyglm.zip <- function(formula, design, ...) {
 }
 
-#' @importFrom stats dpois
 # log-likelihood function used in "svymle()"
 sjstats_loglik_zip <- function(y, eta, logitp) {
   mu <- exp(eta)
diff --git a/R/t_test.R b/R/t_test.R
new file mode 100644
index 00000000..4f7e9bbf
--- /dev/null
+++ b/R/t_test.R
@@ -0,0 +1,425 @@
+#' @title Student's t test
+#' @name t_test
+#' @description This function performs a Student's t test for two independent
+#'   samples, for paired samples, or for one sample. Unlike the underlying
+#'   base R function `t.test()`, this function allows for weighted tests and
+#'   automatically calculates effect sizes.
+#'
+#' @inheritParams mann_whitney_test
+#' @param paired Logical, whether to compute a paired t-test for dependent
+#'   samples.
+#' @inherit mann_whitney_test seealso
+#'
+#' @inheritSection mann_whitney_test Which test to use
+#'
+#' @details Interpretation of effect sizes is based on rules described in
+#'   [`effectsize::interpret_cohens_d()`] and [`effectsize::interpret_hedges_g()`].
+#'
+#' @return A data frame with test results.
+#'
+#' @references
+#' - Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests.
+#'   Dtsch Med Wochenschr 2007; 132: e24–e25
+#'
+#' - du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer
+#'   Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8
+#'
+#' @examplesIf requireNamespace("effectsize")
+#' data(sleep)
+#' # one-sample t-test
+#' t_test(sleep, "extra")
+#' # base R equivalent
+#' t.test(extra ~ 1, data = sleep)
+#'
+#' # two-sample t-test, by group
+#' t_test(mtcars, "mpg", by = "am")
+#' # base R equivalent
+#' t.test(mpg ~ am, data = mtcars)
+#'
+#' # paired t-test
+#' t_test(mtcars, c("mpg", "hp"), paired = TRUE)
+#' # base R equivalent
+#' t.test(mtcars$mpg, mtcars$hp, paired = TRUE)
+#' @export
+t_test <- function(data,
+                   select = NULL,
+                   by = NULL,
+                   weights = NULL,
+                   paired = FALSE,
+                   mu = 0,
+                   alternative = "two.sided") {
+  insight::check_if_installed(c("datawizard", "effectsize"))
+  alternative <- match.arg(alternative, choices = c("two.sided", "less", "greater"))
+
+  # sanity checks
+  .sanitize_htest_input(data, select, by, weights, test = "t_test")
+  data_name <- NULL
+
+  # filter and remove NA
+  data <- stats::na.omit(data[c(select, by, weights)])
+
+  # does select indicate more than one variable? We then reshape the data
+  # to have one continuous scale and one grouping variable
+  if (length(select) > 1) {
+    # paired?
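+    # (illustration) a paired t-test is equivalent to a one-sample t-test on
+    # the differences: t.test(mtcars$mpg, mtcars$hp, paired = TRUE) matches
+    # t.test(mtcars$mpg - mtcars$hp, mu = 0). That is why we can simply
+    # subtract the two columns and clear `by` below.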
+    if (paired) {
+      # subtract the two variables for paired t-test, and "set" by to NULL
+      data[[select[1]]] <- data[[select[1]]] - data[[select[2]]]
+      data_name <- paste(select[1], "and", select[2])
+      select <- select[1]
+      by <- NULL
+    } else {
+      # we convert the data into long format, and create a grouping variable
+      data <- datawizard::data_to_long(
+        data[c(select, weights)],
+        select = select,
+        names_to = "group",
+        values_to = "scale"
+      )
+      by <- select[2]
+      select <- select[1]
+      # after converting to long, we have the "grouping" variable first in the data
+      colnames(data) <- c(weights, by, select)
+    }
+  }
+
+  # get data
+  dv <- data[[select]]
+
+  # for two-sample t-test...
+  if (!is.null(by)) {
+    grp <- data[[by]]
+    # coerce to factor
+    grp <- datawizard::to_factor(grp)
+    # only two groups allowed
+    if (insight::n_unique(grp) > 2) {
+      insight::format_error("Only two groups are allowed for Student's t test.") # nolint
+    }
+    # value labels
+    group_labels <- names(attr(data[[by]], "labels", exact = TRUE))
+    if (is.null(group_labels)) {
+      group_labels <- levels(droplevels(grp))
+    }
+    data_name <- paste(select, "by", by)
+  } else {
+    # one-sample t-test...
+    grp <- NULL
+    group_labels <- select
+    if (is.null(data_name)) {
+      data_name <- select
+    }
+  }
+
+  if (is.null(weights)) {
+    .calculate_ttest(dv, grp, mu, paired, alternative, group_labels, data_name)
+  } else {
+    .calculate_weighted_ttest(dv, grp, mu, paired, alternative, data[[weights]], group_labels, data_name)
+  }
+}
+
+
+# t-test for two groups (or one sample) ----------------------------------------
+
+.calculate_ttest <- function(dv, grp, mu, paired, alternative, group_labels, data_name) {
+  insight::check_if_installed("effectsize")
+  # prepare data
+  if (is.null(grp)) {
+    tdat <- data.frame(dv)
+    t_formula <- stats::as.formula("dv ~ 1")
+  } else {
+    tdat <- data.frame(dv, grp)
+    t_formula <- stats::as.formula("dv ~ grp")
+  }
+  # perform t-test
+  htest <- stats::t.test(
+    t_formula,
+    data = tdat,
+    alternative = alternative,
+    mu = mu
+  )
+  test_statistic <- htest$statistic
+  # Cohen's d for larger samples, Hedges' g (bias-corrected) for small samples
+  if (nrow(tdat) > 20) {
+    effect_size <- stats::setNames(
+      effectsize::cohens_d(
+        t_formula,
+        data = tdat,
+        alternative = alternative,
+        mu = mu
+      )$Cohens_d,
+      "Cohens_d"
+    )
+  } else {
+    effect_size <- stats::setNames(
+      effectsize::hedges_g(
+        t_formula,
+        data = tdat,
+        alternative = alternative,
+        mu = mu
+      )$Hedges_g,
+      "Hedges_g"
+    )
+  }
+
+  # return result
+  out <- data.frame(
+    data = data_name,
+    statistic_name = "t",
+    statistic = test_statistic,
+    effect_size_name = names(effect_size),
+    effect_size = as.numeric(effect_size),
+    p = as.numeric(htest$p.value),
+    df = as.numeric(htest$parameter),
+    method = ifelse(paired, "Paired t-test", htest$method),
+    alternative = alternative,
+    mu = mu,
+    stringsAsFactors = FALSE
+  )
+  class(out) <- c("sj_htest_t", "data.frame")
+  attr(out, "group_labels") <- group_labels
+  attr(out, "means") <- as.numeric(htest$estimate)
+  attr(out, "paired") <- isTRUE(paired)
+  attr(out, "one_sample") <- is.null(grp)
+  attr(out, "weighted") <- FALSE
+  if (!is.null(grp)) {
+    attr(out, "n_groups") <- stats::setNames(
+      c(as.numeric(table(grp))),
+      c("N Group 1", "N Group 2")
+    )
+  }
+  out
+}
+
+
+# Weighted t-test for two groups (or one sample) --------------------------------
+
+.calculate_weighted_ttest <- function(dv, grp, mu, paired, alternative, weights, group_labels, data_name) {
+  insight::check_if_installed(c("datawizard", "effectsize"))
+  if (is.null(grp)) {
+    dat <- stats::na.omit(data.frame(dv, weights))
+    colnames(dat) <- c("y", "w")
+    x_values <- dat$y
+    x_weights <- dat$w
+    y_values <- NULL
+    # group N's
+    n_groups <- stats::setNames(round(sum(x_weights)), "N Group 1")
+  } else {
+    dat <- stats::na.omit(data.frame(dv, grp, weights))
+    colnames(dat) <- c("y", "g", "w")
+    # unique groups
+    groups <- unique(dat$g)
+    # values for sample 1
+    x_values <- dat$y[dat$g == groups[1]]
+    x_weights <- dat$w[dat$g == groups[1]]
+    # values for sample 2
+    y_values <- dat$y[dat$g == groups[2]]
+    y_weights <- dat$w[dat$g == groups[2]]
+    # group N's
+    n_groups <- stats::setNames(
+      c(round(sum(x_weights)), round(sum(y_weights))),
+      c("N Group 1", "N Group 2")
+    )
+  }
+
+  mu_x <- stats::weighted.mean(x_values, x_weights, na.rm = TRUE)
+  var_x <- datawizard::weighted_sd(x_values, x_weights)^2
+  se_x <- sqrt(var_x / length(x_values))
+
+  if (paired || is.null(y_values)) {
+    # paired or one-sample: test the (differences') mean against `mu`
+    se <- se_x
+    dof <- length(x_values) - 1
+    test_statistic <- (mu_x - mu) / se
+    estimate <- mu_x
+    method <- if (paired) "Paired t-test" else "One Sample t-test"
+  } else {
+    # unpaired t-test
+    mu_y <- stats::weighted.mean(y_values, y_weights)
+    var_y <- datawizard::weighted_sd(y_values, y_weights)^2
+    se_y <- sqrt(var_y / length(y_values))
+    se <- sqrt(se_x^2 + se_y^2)
+    # Welch-Satterthwaite approximation for the degrees of freedom
+    dof <- se^4 / (se_x^4 / (length(x_values) - 1) + se_y^4 / (length(y_values) - 1))
+    test_statistic <- (mu_x - mu_y - mu) / se
+    estimate <- c(mu_x, mu_y)
+    method <- "Two-Sample t-test"
+  }
+
+  # p-values
+  if (alternative == "less") {
+    pval <- stats::pt(test_statistic, dof)
+  } else if (alternative == "greater") {
+    pval <- stats::pt(test_statistic, dof, lower.tail = FALSE)
+  } else {
+    pval <- 2 * stats::pt(-abs(test_statistic), dof)
+  }
+
+  # effect size
+  dat$y <- dat$y * dat$w
+  if (is.null(y_values)) {
+    t_formula <- stats::as.formula("y ~ 1")
+  } else {
+    t_formula <- stats::as.formula("y ~ g")
+  }
+
+  if (nrow(dat) > 20) {
+    effect_size <- stats::setNames(
+      effectsize::cohens_d(
+        t_formula,
+        data = dat,
+        alternative = alternative,
+        mu = mu,
+        paired = FALSE
+      )$Cohens_d,
+      "Cohens_d"
+    )
+  } else {
+    effect_size <- stats::setNames(
+      effectsize::hedges_g(
+        t_formula,
+        data = dat,
+        alternative = alternative,
+        mu = mu,
+        paired = FALSE
+      )$Hedges_g,
+      "Hedges_g"
+    )
+  }
+
+  # return result
+  out <- data.frame(
+    data = data_name,
+    statistic_name = "t",
+    statistic = test_statistic,
+    effect_size_name = names(effect_size),
+    effect_size = as.numeric(effect_size),
+    p = pval,
+    df = dof,
+    method = method,
+    alternative = alternative,
+    mu = mu,
+    stringsAsFactors = FALSE
+  )
+  class(out) <- c("sj_htest_t", "data.frame")
+  attr(out, "means") <- estimate
+  attr(out, "n_groups") <- n_groups
+  attr(out, "group_labels") <- group_labels
+  attr(out, "paired") <- isTRUE(paired)
+  attr(out, "one_sample") <- is.null(y_values) && !isTRUE(paired)
+  attr(out, "weighted") <- TRUE
+  out
+}
+
+
+# methods ---------------------------------------------------------------------
+
+#' @export
+print.sj_htest_t <- function(x, ...)
{ + insight::check_if_installed("effectsize") + + # fetch attributes + group_labels <- attributes(x)$group_labels + means <- attributes(x)$means + n_groups <- attributes(x)$n_groups + weighted <- attributes(x)$weighted + paired <- isTRUE(attributes(x)$paired) + one_sample <- isTRUE(attributes(x)$one_sample) + + if (weighted) { + weight_string <- " (weighted)" + } else { + weight_string <- "" + } + + # same width + group_labels <- format(group_labels) + + # header + insight::print_color(sprintf("# %s%s\n\n", x$method, weight_string), "blue") + + # print for paired t-test + if (paired) { + # data + insight::print_color(sprintf( + " Data: %s (mean difference = %s)\n", + x$data, + insight::format_value(means[1], protect_integers = TRUE) + ), "cyan") + } else { + # data + insight::print_color(sprintf(" Data: %s\n", x$data), "cyan") + + # group-1-info + if (is.null(n_groups)) { + insight::print_color( + sprintf( + " Group 1: %s (mean = %s)\n", + group_labels[1], insight::format_value(means[1], protect_integers = TRUE) + ), "cyan" + ) + } else { + insight::print_color( + sprintf( + " Group 1: %s (n = %i, mean = %s)\n", + group_labels[1], n_groups[1], insight::format_value(means[1], protect_integers = TRUE) + ), "cyan" + ) + } + + # group-2-info + if (length(group_labels) > 1) { + if (is.null(n_groups)) { + insight::print_color( + sprintf( + " Group 2: %s (mean = %s)\n", + group_labels[2], insight::format_value(means[2], protect_integers = TRUE) + ), "cyan" + ) + } else { + insight::print_color( + sprintf( + " Group 2: %s (n = %i, mean = %s)\n", + group_labels[2], n_groups[2], insight::format_value(means[2], protect_integers = TRUE) + ), "cyan" + ) + } + } + } + + # alternative hypothesis + alt_string <- switch(x$alternative, + two.sided = "not equal to", + less = "less than", + greater = "greater than" + ) + if (one_sample) { + alt_string <- paste("true mean is", alt_string, x$mu) + } else if (paired) { + alt_string <- paste("true mean difference is", alt_string, x$mu) + } else { + alt_string <- paste("true difference in means is", alt_string, x$mu) + } + insight::print_color(sprintf(" Alternative hypothesis: %s\n", alt_string), "cyan") + + # string for effectsizes + if (x$effect_size_name == "Cohens_d") { + eff_string <- sprintf( + "Cohen's d = %.2f (%s effect)", + x$effect_size, + effectsize::interpret_cohens_d(x$effect_size) + ) + } else { + eff_string <- sprintf( + "Hedges' g = %.2f (%s effect)", + x$effect_size, + effectsize::interpret_hedges_g(x$effect_size) + ) + } + + cat(sprintf( + "\n t = %.2f, %s, df = %s, %s\n\n", + x$statistic, + eff_string, + insight::format_value(x$df, digits = 1, protect_integers = TRUE), + insight::format_p(x$p) + )) +} diff --git a/R/var_pop.R b/R/var_pop.R index bfe5367d..db53d6b8 100644 --- a/R/var_pop.R +++ b/R/var_pop.R @@ -22,21 +22,18 @@ #' sd(efc$c12hour, na.rm = TRUE) #' # population sd #' sd_pop(efc$c12hour) -#' -#' @importFrom stats na.omit var -#' @importFrom sjmisc is_num_fac -#' @importFrom sjlabelled as_numeric #' @export var_pop <- function(x) { + insight::check_if_installed("datawizard") # check for categorical if (is.factor(x)) { # only allow numeric factors - if (!sjmisc::is_num_fac(x)) { - warning("`x` must be numeric vector or a factor with numeric levels.", call. 
= F)
-      return(NA)
+
+    if (!.is_pseudo_numeric(x)) {
+      insight::format_error("`x` must be a numeric vector or a factor with numeric levels.")
     }
     # convert factor to numeric
-    x <- sjlabelled::as_numeric(x)
+    x <- datawizard::to_numeric(x, dummy_factors = FALSE)
   }
 
   # remove NA
@@ -49,7 +46,6 @@
 
 
 #' @rdname var_pop
-#' @importFrom stats na.omit var
 #' @export
 sd_pop <- function(x) {
   # get population variance
diff --git a/R/weight.R b/R/weight.R
index c36a3c55..11423688 100644
--- a/R/weight.R
+++ b/R/weight.R
@@ -38,8 +38,6 @@
 #' table(x)
 #' table(weight(x, w))
 #'
-#' @importFrom stats na.pass xtabs
-#' @importFrom sjlabelled as_numeric
 #' @export
 weight <- function(x, weights, digits = 0) {
   # remember if x is numeric
@@ -76,7 +74,9 @@ weight <- function(x, weights, digits = 0) {
 
   # if we have NA values, weighted var is coerced to character.
   # coerce back to numeric then here
-  if (!is.numeric(weightedvar) && x.is.num) weightedvar <- sjlabelled::as_numeric(weightedvar)
+  if (!is.numeric(weightedvar) && x.is.num) {
+    weightedvar <- datawizard::to_numeric(weightedvar, dummy_factors = FALSE)
+  }
 
   # return result
   weightedvar
diff --git a/R/wilcoxon_test.R b/R/wilcoxon_test.R
new file mode 100644
index 00000000..5ef4d925
--- /dev/null
+++ b/R/wilcoxon_test.R
@@ -0,0 +1,257 @@
+#' @title Wilcoxon signed rank test
+#' @name wilcoxon_test
+#' @description This function performs Wilcoxon signed rank tests for one sample
+#'   or for two _paired_ (dependent) samples. For _unpaired_ (independent)
+#'   samples, please use the `mann_whitney_test()` function.
+#'
+#'   A Wilcoxon signed rank test is a non-parametric test for the null hypothesis
+#'   that, for paired samples, the differences between the two samples are
+#'   symmetrically distributed around `mu` (or, for one sample, that the values
+#'   are symmetrically distributed around `mu`). For independent (unpaired)
+#'   samples, use `mann_whitney_test()`.
+#'
+#'   `wilcoxon_test()` can be used for ordinal scales or when the continuous
+#'   variables are not normally distributed. For large samples, or approximately
+#'   normally distributed variables, the `t_test()` function can be used (with
+#'   `paired = TRUE`).
+#'
+#' @inheritParams mann_whitney_test
+#' @inherit mann_whitney_test seealso
+#'
+#' @inheritSection mann_whitney_test Which test to use
+#'
+#' @return A data frame with test results. The function returns p and Z-values
+#'   as well as effect size r and group-rank-means.
+#'
+#' @references
+#' - Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests.
+#'   Dtsch Med Wochenschr 2007; 132: e24–e25
+#'
+#' - du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer
+#'   Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8
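+#'
+#' @details For paired samples, the effect size **r** is computed as
+#'   \eqn{r = \frac{|Z|}{\sqrt{n}}}, mirroring `mann_whitney_test()`, with the
+#'   usual rules of thumb: small effect >= 0.1, medium >= 0.3, large >= 0.5.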
+#'
+#' @examplesIf requireNamespace("coin")
+#' data(mtcars)
+#' # one-sample test
+#' wilcoxon_test(mtcars, "mpg")
+#' # base R equivalent, we set exact = FALSE to avoid a warning
+#' wilcox.test(mtcars$mpg ~ 1, exact = FALSE)
+#'
+#' # paired test
+#' wilcoxon_test(mtcars, c("mpg", "hp"))
+#' # base R equivalent, we set exact = FALSE to avoid a warning
+#' wilcox.test(mtcars$mpg, mtcars$hp, paired = TRUE, exact = FALSE)
+#'
+#' # when `by` is specified, each group must be of same length
+#' data(iris)
+#' d <- iris[iris$Species != "setosa", ]
+#' wilcoxon_test(d, "Sepal.Width", by = "Species")
+#' @export
+wilcoxon_test <- function(data,
+                          select = NULL,
+                          by = NULL,
+                          weights = NULL,
+                          mu = 0,
+                          alternative = "two.sided",
+                          ...)
+{
+  insight::check_if_installed("datawizard")
+  alternative <- match.arg(alternative, choices = c("two.sided", "less", "greater"))
+
+  # sanity checks
+  .sanitize_htest_input(data, select, by, weights, test = "wilcoxon_test")
+
+  # alternative only if weights are NULL
+  if (!is.null(weights) && alternative != "two.sided") {
+    insight::format_error("Argument `alternative` must be `two.sided` if `weights` are specified.")
+  }
+
+  # for paired two-sample, do groups all have same length?
+  if (!is.null(by)) {
+    group_len <- as.numeric(table(as.vector(data[[by]])))
+    if (!all(group_len == group_len[1])) {
+      insight::format_error("For paired two-sample Wilcoxon test, all groups specified in `by` must have the same length.") # nolint
+    }
+    # convert to wide format
+    out <- split(data[select], as.character(data[[by]]))
+    data <- stats::setNames(do.call(cbind, out), names(out))
+    select <- colnames(data)
+  }
+
+  # value labels
+  group_labels <- select
+
+  x <- data[[select[1]]]
+  if (length(select) > 1) {
+    y <- data[[select[2]]]
+  } else {
+    y <- NULL
+  }
+
+  if (is.null(weights)) {
+    .calculate_wilcox(x, y, alternative, mu, group_labels, ...)
+  } else {
+    .calculate_weighted_wilcox(x, y, data[[weights]], group_labels)
+  }
+}
+
+
+# Wilcoxon signed rank test (one-sample or paired) ------------------------------
+
+.calculate_wilcox <- function(x, y, alternative, mu, group_labels, ...) {
+  insight::check_if_installed("coin")
+  # for paired Wilcoxon test, we have effect sizes
+  if (!is.null(y)) {
+    # prepare data
+    wcdat <- data.frame(x, y)
+    # perform signed rank test via coin, to extract Z and effect size r
+    wt <- coin::wilcoxsign_test(x ~ y, data = wcdat)
+    # compute statistics
+    u <- as.numeric(coin::statistic(wt, type = "linear"))
+    z <- as.numeric(coin::statistic(wt, type = "standardized"))
+    r <- abs(z / sqrt(nrow(wcdat)))
+  } else {
+    wt <- u <- z <- r <- NULL
+  }
+
+  # prepare data
+  if (is.null(y)) {
+    dv <- x
+  } else {
+    dv <- x - y
+  }
+  htest <- suppressWarnings(stats::wilcox.test(
+    dv ~ 1,
+    alternative = alternative,
+    mu = mu,
+    ...
+  ))
+  v <- htest$statistic
+  p <- htest$p.value
+
+  out <- data.frame(
+    group1 = group_labels[1],
+    v = v,
+    p = as.numeric(p),
+    mu = mu,
+    alternative = alternative
+  )
+  # two groups?
+  if (length(group_labels) > 1) {
+    out$group2 <- group_labels[2]
+  }
+  # add effect sizes, when we have them
+  if (!is.null(wt)) {
+    out$u <- u
+    out$z <- z
+    out$r <- r
+  }
+  attr(out, "group_labels") <- group_labels
+  attr(out, "method") <- "wilcoxon"
+  attr(out, "weighted") <- FALSE
+  attr(out, "one_sample") <- length(group_labels) == 1
+  class(out) <- c("sj_htest_wilcox", "data.frame")
+
+  out
+}
+
+
+# Weighted Wilcoxon signed rank test --------------------------------------------
+
+.calculate_weighted_wilcox <- function(x, y, weights, group_labels) {
+  # check if pkg survey is available
+  insight::check_if_installed("survey")
+
+  # prepare data
+  if (is.null(y)) {
+    dv <- x
+  } else {
+    dv <- x - y
+  }
+
+  dat <- stats::na.omit(data.frame(dv, weights))
+  colnames(dat) <- c("y", "w")
+
+  design <- survey::svydesign(ids = ~0, data = dat, weights = ~w)
+  result <- survey::svyranktest(formula = y ~ 1, design, test = "wilcoxon")
+
+  # statistics and effect sizes
+  z <- result$statistic
+  r <- abs(z / sqrt(nrow(dat)))
+
+  out <- data_frame(
+    group1 = group_labels[1],
+    estimate = result$estimate,
+    z = z,
+    r = r,
+    p = as.numeric(result$p.value),
+    mu = 0,
+    alternative = "two.sided"
+  )
+  # two groups?
+  if (length(group_labels) > 1) {
+    out$group2 <- group_labels[2]
+  }
+
+  attr(out, "group_labels") <- group_labels
+  attr(out, "weighted") <- TRUE
+  attr(out, "one_sample") <- length(group_labels) == 1
+  attr(out, "method") <- "wilcoxon"
+  class(out) <- c("sj_htest_wilcox", "data.frame")
+
+  out
+}
+
+
+# methods ---------------------------------------------------------------------
+
+#' @export
+print.sj_htest_wilcox <- function(x, ...) {
+  # fetch attributes
+  group_labels <- attributes(x)$group_labels
+  weighted <- attributes(x)$weighted
+  one_sample <- attributes(x)$one_sample
+
+  if (weighted) {
+    weight_string <- " (weighted)"
+  } else {
+    weight_string <- ""
+  }
+
+  if (one_sample) {
+    onesample_string <- "One Sample"
+  } else {
+    onesample_string <- "Paired"
+  }
+
+  # same width
+  group_labels <- format(group_labels)
+
+  # header
+  insight::print_color(sprintf(
+    "# %s Wilcoxon signed rank test%s\n\n",
+    onesample_string,
+    weight_string
+  ), "blue")
+
+  # alternative hypothesis
+  if (!is.null(x$alternative) && !is.null(x$mu)) {
+    alt_string <- switch(x$alternative,
+      two.sided = "not equal to",
+      less = "less than",
+      greater = "greater than"
+    )
+    alt_string <- paste("true location shift is", alt_string, x$mu)
+    insight::print_color(sprintf("  Alternative hypothesis: %s\n", alt_string), "cyan")
+  }
+
+  if (!is.null(x[["v"]])) {
+    # cast to integer - sprintf()'s "%i" is not valid for doubles in R
+    v_stat <- sprintf("V = %i, ", as.integer(round(x$v)))
+  } else {
+    v_stat <- ""
+  }
+
+  if (!is.null(x[["r"]])) {
+    cat(sprintf("\n  %sr = %.2f, Z = %.2f, %s\n\n", v_stat, x$r, x$z, insight::format_p(x$p)))
+  } else {
+    cat(sprintf("\n  %s%s\n\n", v_stat, insight::format_p(x$p)))
+  }
+}
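In `.calculate_wilcox()`, the effect size is derived from the standardized test statistic as r = |Z| / sqrt(n). A minimal standalone sketch of that computation, reusing the same coin calls as in the new file above (assumes the coin package is installed; mtcars serves purely as example data):

library(coin)

# paired samples, as in the wilcoxon_test() examples above
wcdat <- data.frame(x = mtcars$mpg, y = mtcars$hp)
wt <- coin::wilcoxsign_test(x ~ y, data = wcdat)
z <- as.numeric(coin::statistic(wt, type = "standardized"))
r <- abs(z / sqrt(nrow(wcdat))) # effect size: r = |Z| / sqrt(n)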
diff --git a/R/wtd_cor.R b/R/wtd_cor.R
index bcb7e24f..19a1d7fb 100644
--- a/R/wtd_cor.R
+++ b/R/wtd_cor.R
@@ -1,15 +1,15 @@
-#' @rdname weighted_sd
+#' @rdname weighted_se
 #' @export
 weighted_correlation <- function(data, ...) {
   UseMethod("weighted_correlation")
 }


-#' @rdname weighted_sd
+#' @rdname weighted_se
 #' @export
-weighted_correlation.default <- function(data, x, y, weights, ci.lvl = .95, ...) {
-  if (!missing(ci.lvl) & (length(ci.lvl) != 1 || !is.finite(ci.lvl) || ci.lvl < 0 || ci.lvl > 1))
-    stop("'ci.lvl' must be a single number between 0 and 1")
+weighted_correlation.default <- function(data, x, y, weights, ci.lvl = 0.95, ...) {
+  if (!missing(ci.lvl) && (length(ci.lvl) != 1 || !is.finite(ci.lvl) || ci.lvl < 0 || ci.lvl > 1))
+    insight::format_error("'ci.lvl' must be a single number between 0 and 1.")

   x.name <- deparse(substitute(x))
   y.name <- deparse(substitute(y))
@@ -24,8 +24,8 @@ weighted_correlation.default <- function(data, x, y, weights, ci.lvl = .95, ...)
   vars <- c(x.name, y.name, w.name)

   # get data
-  dat <- suppressMessages(dplyr::select(data, !! vars))
-  dat <- na.omit(dat)
+  dat <- suppressMessages(data[vars])
+  dat <- stats::na.omit(dat)

   xv <- dat[[x.name]]
   yv <- dat[[y.name]]
@@ -35,12 +35,12 @@ weighted_correlation.default <- function(data, x, y, weights, ci.lvl = .95, ...)
 }


-#' @rdname weighted_sd
+#' @rdname weighted_se
 #' @export
-weighted_correlation.formula <- function(formula, data, ci.lvl = .95, ...) {
+weighted_correlation.formula <- function(formula, data, ci.lvl = 0.95, ...) {
-  if (!missing(ci.lvl) & (length(ci.lvl) != 1 || !is.finite(ci.lvl) || ci.lvl < 0 || ci.lvl > 1))
-    stop("'ci.lvl' must be a single number between 0 and 1")
+  if (!missing(ci.lvl) && (length(ci.lvl) != 1 || !is.finite(ci.lvl) || ci.lvl < 0 || ci.lvl > 1))
+    insight::format_error("'ci.lvl' must be a single number between 0 and 1.")

   vars <- all.vars(formula)

@@ -50,8 +50,8 @@ weighted_correlation.formula <- function(formula, data, ci.lvl = .95, ...) {
   }

   # get data
-  dat <- suppressMessages(dplyr::select(data, !! vars))
-  dat <- na.omit(dat)
+  dat <- suppressMessages(data[vars])
+  dat <- stats::na.omit(dat)

   xv <- dat[[vars[1]]]
   yv <- dat[[vars[2]]]
diff --git a/R/wtd_mean.R b/R/wtd_mean.R
deleted file mode 100644
index 2b5f369b..00000000
--- a/R/wtd_mean.R
+++ /dev/null
@@ -1,22 +0,0 @@
-#' @rdname weighted_sd
-#' @export
-weighted_mean <- function(x, weights = NULL) {
-  UseMethod("weighted_mean")
-}
-
-#' @importFrom stats weighted.mean
-#' @export
-weighted_mean.default <- function(x, weights = NULL) {
-  if (is.null(weights)) weights <- rep(1, length(x))
-  stats::weighted.mean(x, w = weights, na.rm = TRUE)
-}
-
-#' @importFrom stats weighted.mean
-#' @importFrom purrr map_dbl
-#' @importFrom dplyr select_if
-#' @export
-weighted_mean.data.frame <- function(x, weights = NULL) {
-  if (is.null(weights)) weights <- rep(1, length(x))
-  dplyr::select_if(x, is.numeric) %>%
-    purrr::map_dbl(~ weighted.mean(.x, w = weights))
-}
diff --git a/R/wtd_median.R b/R/wtd_median.R
deleted file mode 100644
index 0e19d98a..00000000
--- a/R/wtd_median.R
+++ /dev/null
@@ -1,40 +0,0 @@
-#' @rdname weighted_sd
-#' @export
-weighted_median <- function(x, weights = NULL) {
-  UseMethod("weighted_median")
-}
-
-#' @export
-weighted_median.default <- function(x, weights = NULL) {
-  weighted_md_helper(x, w = weights, p = 0.5)
-}
-
-#' @export
-weighted_median.data.frame <- function(x, weights = NULL) {
-  dplyr::select_if(x, is.numeric) %>%
-    purrr::map_dbl(~ weighted_md_helper(.x, w = weights, p = 0.5))
-}
-
-weighted_md_helper <- function(x, w, p = .5) {
-  if (is.null(w)) w <- rep(1, length(x))
-
-  x[is.na(w)] <- NA
-  w[is.na(x)] <- NA
-
-  w <- na.omit(w)
-  x <- na.omit(x)
-
-  order <- order(x)
-  x <- x[order]
-  w <- w[order]
-
-  rw <- cumsum(w) / sum(w)
-  md.values <- min(which(rw >= p))
-
-  if (rw[md.values] == p)
-    q <- mean(x[md.values:(md.values + 1)])
-  else
-    q <- x[md.values]
-
-  q
-}
diff --git a/R/wtd_mwu.R b/R/wtd_mwu.R
deleted file mode 100644
index d0e9fa3d..00000000
--- a/R/wtd_mwu.R
+++ /dev/null
@@ -1,74 +0,0 @@
-#' @rdname weighted_sd
-#' @export
-weighted_mannwhitney <- function(data, ...) {
-  UseMethod("weighted_mannwhitney")
-}
-
-
-#' @importFrom dplyr select
-#' @rdname weighted_sd
-#' @export
-weighted_mannwhitney.default <- function(data, x, grp, weights, ...) {
-  x.name <- deparse(substitute(x))
-  g.name <- deparse(substitute(grp))
-  w.name <- deparse(substitute(weights))
-
-  # create string with variable names
-  vars <- c(x.name, g.name, w.name)
-
-  # get data
-  dat <- suppressMessages(dplyr::select(data, !! vars))
-  dat <- na.omit(dat)
-
-  weighted_mannwhitney_helper(dat)
-}
-
-
-#' @importFrom dplyr select
-#' @rdname weighted_sd
-#' @export
-weighted_mannwhitney.formula <- function(formula, data, ...) {
-  vars <- all.vars(formula)
-
-  # get data
-  dat <- suppressMessages(dplyr::select(data, !! vars))
-  dat <- na.omit(dat)
-
-  weighted_mannwhitney_helper(dat)
-}
-
-weighted_mannwhitney_helper <- function(dat, vars) {
-  # check if pkg survey is available
-  if (!requireNamespace("survey", quietly = TRUE)) {
-    stop("Package `survey` needed to for this function to work. Please install it.", call. = FALSE)
-  }
-
-  x.name <- colnames(dat)[1]
-  group.name <- colnames(dat)[2]
-
-  colnames(dat) <- c("x", "g", "w")
-
-  if (dplyr::n_distinct(dat$g, na.rm = TRUE) > 2) {
-    m <- "Weighted Kruskal-Wallis test"
-    method <- "KruskalWallis"
-  } else {
-    m <- "Weighted Mann-Whitney-U test"
-    method <- "wilcoxon"
-  }
-
-  design <- survey::svydesign(ids = ~0, data = dat, weights = ~w)
-  mw <- survey::svyranktest(formula = x ~ g, design, test = method)
-
-  attr(mw, "x.name") <- x.name
-  attr(mw, "group.name") <- group.name
-  class(mw) <- c("sj_wmwu", "list")
-
-  mw$method <- m
-
-  mw
-}
-
-
-#' @rdname weighted_sd
-#' @export
-weighted_ranktest <- weighted_mannwhitney.formula
diff --git a/R/wtd_sd.R b/R/wtd_sd.R
deleted file mode 100644
index 95af674d..00000000
--- a/R/wtd_sd.R
+++ /dev/null
@@ -1,115 +0,0 @@
-#' @title Weighted statistics for tests and variables
-#' @name weighted_sd
-#' @description \strong{Weighted statistics for variables}
-#' \cr \cr
-#' \code{weighted_sd()}, \code{weighted_se()}, \code{weighted_mean()} and \code{weighted_median()}
-#' compute weighted standard deviation, standard error, mean or median for a
-#' variable or for all variables of a data frame. \code{survey_median()} computes the
-#' median for a variable in a survey-design (see \code{\link[survey]{svydesign}}).
-#' \code{weighted_correlation()} computes a weighted correlation for a two-sided alternative
-#' hypothesis.
-#' \cr \cr
-#' \strong{Weighted tests}
-#' \cr \cr
-#' \code{weighted_ttest()} computes a weighted t-test, while \code{weighted_mannwhitney()}
-#' computes a weighted Mann-Whitney-U test or a Kruskal-Wallis test
-#' (for more than two groups). `weighted_ranktest()` is an alias for `weighted_mannwhitney()`.
-#' \code{weighted_chisqtest()} computes a weighted Chi-squared test for contingency tables.
-#'
-#' @param x (Numeric) vector or a data frame. For \code{survey_median()}, \code{weighted_ttest()},
-#' \code{weighted_mannwhitney()} and \code{weighted_chisqtest()} the bare (unquoted) variable
-#' name, or a character vector with the variable name.
-#' @param weights Bare (unquoted) variable name, or a character vector with
-#' the variable name of the numeric vector of weights. If \code{weights = NULL},
-#' unweighted statistic is reported.
-#' @param data A data frame.
-#' @param formula A formula of the form \code{lhs ~ rhs1 + rhs2} where \code{lhs} is a
-#' numeric variable giving the data values and \code{rhs1} a factor with two
-#' levels giving the corresponding groups and \code{rhs2} a variable with weights.
-#' @param y Optional, bare (unquoted) variable name, or a character vector with
-#' the variable name.
-#' @param grp Bare (unquoted) name of the cross-classifying variable, where
-#' \code{x} is grouped into the categories represented by \code{grp},
-#' or a character vector with the variable name.
-#' @param mu A number indicating the true value of the mean (or difference in
-#' means if you are performing a two sample test).
-#' @param ci.lvl Confidence level of the interval.
-#' @param alternative A character string specifying the alternative hypothesis,
-#' must be one of \code{"two.sided"} (default), \code{"greater"} or
-#' \code{"less"}. You can specify just the initial letter.
-#' @param paired Logical, whether to compute a paired t-test.
-#' @param ... For \code{weighted_ttest()} and \code{weighted_mannwhitney()}, currently not used.
-#' For \code{weighted_chisqtest()}, further arguments passed down to
-#' \code{\link[stats]{chisq.test}}.
-#'
-#' @inheritParams svyglm.nb
-#'
-#' @return The weighted (test) statistic.
-#'
-#' @note \code{weighted_chisq()} is a convenient wrapper for \code{\link{crosstable_statistics}}.
-#' For a weighted one-way Anova, use \code{means_by_group()} with
-#' \code{weights}-argument.
-#' \cr \cr
-#' \code{weighted_ttest()} assumes unequal variance between the two groups.
-#'
-#' @examples
-#' # weighted sd and se ----
-#' weighted_sd(rnorm(n = 100, mean = 3), runif(n = 100))
-#'
-#' data(efc)
-#' weighted_sd(efc[, 1:3], runif(n = nrow(efc)))
-#' weighted_se(efc[, 1:3], runif(n = nrow(efc)))
-#'
-#' # survey_median ----
-#' # median for variables from weighted survey designs
-#' if (require("survey")) {
-#'   data(nhanes_sample)
-#'
-#'   des <- svydesign(
-#'     id = ~SDMVPSU,
-#'     strat = ~SDMVSTRA,
-#'     weights = ~WTINT2YR,
-#'     nest = TRUE,
-#'     data = nhanes_sample
-#'   )
-#'
-#'   survey_median(total, des)
-#'   survey_median("total", des)
-#' }
-#'
-#' # weighted t-test ----
-#' efc$weight <- abs(rnorm(nrow(efc), 1, .3))
-#' weighted_ttest(efc, e17age, weights = weight)
-#' weighted_ttest(efc, e17age, c160age, weights = weight)
-#' weighted_ttest(e17age ~ e16sex + weight, efc)
-#' @export
-weighted_sd <- function(x, weights = NULL) {
-  UseMethod("weighted_sd")
-}
-
-
-#' @rdname weighted_sd
-#' @export
-wtd_sd <- weighted_sd
-
-
-#' @export
-weighted_sd.data.frame <- function(x, weights = NULL) {
-  sd_result <- purrr::map_dbl(x, ~ sqrt(weighted_variance(.x, weights)))
-  names(sd_result) <- colnames(x)
-
-  sd_result
-}
-
-#' @export
-weighted_sd.matrix <- function(x, weights = NULL) {
-  sd_result <- purrr::map_dbl(x, ~ sqrt(weighted_variance(.x, weights)))
-  names(sd_result) <- colnames(x)
-
-  sd_result
-}
-
-#' @export
-weighted_sd.default <- function(x, weights = NULL) {
-  sqrt(weighted_variance(x, weights))
-}
diff --git a/R/wtd_se.R b/R/wtd_se.R
index 7fdefcbc..62c48c3e 100644
--- a/R/wtd_se.R
+++ b/R/wtd_se.R
@@ -1,4 +1,47 @@
-#' @rdname weighted_sd
+#' @title Weighted statistics for variables
+#' @name weighted_se
+#' @description
+#' `weighted_se()` computes weighted standard errors of a variable or for
+#' all variables of a data frame. `survey_median()` computes the median for
+#' a variable in a survey-design (see [`survey::svydesign()`]).
+#' `weighted_correlation()` computes a weighted correlation for a two-sided
+#' alternative hypothesis.
+#'
+#' @param x (Numeric) vector or a data frame. For `survey_median()`, the bare
+#' (unquoted) variable name, or a character vector with the variable name.
+#' @param weights Bare (unquoted) variable name, or a character vector with
+#' the variable name of the numeric vector of weights. If `weights = NULL`,
+#' the unweighted statistic is reported.
+#' @param data A data frame.
+#' @param formula A formula of the form `lhs ~ rhs1 + rhs2` where `lhs` is a
+#' numeric variable giving the data values and `rhs1` a factor with two
+#' levels giving the corresponding groups and `rhs2` a variable with weights.
+#' @param y Optional, bare (unquoted) variable name, or a character vector with
+#' the variable name.
+#' @param ci.lvl Confidence level of the interval.
+#' @param ... Currently not used.
+#'
+#' @inheritParams svyglm.nb
+#'
+#' @return The weighted (test) statistic.
+#'
+#' @examplesIf requireNamespace("survey")
+#' data(efc)
+#' weighted_se(efc$c12hour, abs(runif(n = nrow(efc))))
+#'
+#' # survey_median ----
+#' # median for variables from weighted survey designs
+#' data(nhanes_sample)
+#'
+#' des <- survey::svydesign(
+#'   id = ~SDMVPSU,
+#'   strat = ~SDMVSTRA,
+#'   weights = ~WTINT2YR,
+#'   nest = TRUE,
+#'   data = nhanes_sample
+#' )
+#' survey_median(total, des)
+#' survey_median("total", des)
 #' @export
 weighted_se <- function(x, weights = NULL) {
   UseMethod("weighted_se")
@@ -7,7 +50,7 @@ weighted_se <- function(x, weights = NULL) {

 #' @export
 weighted_se.data.frame <- function(x, weights = NULL) {
-  se_result <- purrr::map_dbl(x, ~ weighted_se_helper(.x, weights = weights))
+  se_result <- vapply(x, weighted_se_helper, numeric(1), weights = weights)
   names(se_result) <- colnames(x)

   se_result
@@ -15,7 +58,7 @@ weighted_se.data.frame <- function(x, weights = NULL) {

 #' @export
 weighted_se.matrix <- function(x, weights = NULL) {
-  se_result <- purrr::map_dbl(x, ~ weighted_se_helper(.x, weights = weights))
+  se_result <- vapply(x, weighted_se_helper, numeric(1), weights = weights)
   names(se_result) <- colnames(x)

   se_result
diff --git a/R/wtd_ttest.R b/R/wtd_ttest.R
deleted file mode 100644
index ed00245b..00000000
--- a/R/wtd_ttest.R
+++ /dev/null
@@ -1,182 +0,0 @@
-#' @rdname weighted_sd
-#' @export
-weighted_ttest <- function(data, ...) {
-  UseMethod("weighted_ttest")
-}
-
-#' @rdname weighted_sd
-#' @export
-weighted_ttest.default <- function(data, x, y = NULL, weights, mu = 0, paired = FALSE, ci.lvl = 0.95, alternative = c("two.sided", "less", "greater"), ...) {
-
-  if (!missing(ci.lvl) & (length(ci.lvl) != 1 || !is.finite(ci.lvl) || ci.lvl < 0 || ci.lvl > 1))
-    stop("'ci.lvl' must be a single number between 0 and 1")
-
-  alternative <- match.arg(alternative)
-
-  x.name <- deparse(substitute(x))
-  y.name <- deparse(substitute(y))
-  w.name <- deparse(substitute(weights))
-
-  if (y.name == "NULL") y.name <- NULL
-
-  if (w.name == "NULL") {
-    w.name <- "weights"
-    data$weights <- 1
-  }
-
-  # create string with variable names
-  vars <- c(x.name, y.name, w.name)
-
-  # get data
-  dat <- suppressMessages(dplyr::select(data, !! vars))
-  dat <- na.omit(dat)
-
-  if (sjmisc::is_empty(dat) || nrow(dat) == 1) {
-    warning("Too less data to compute t-test.")
-    return(NULL)
-  }
-
-  xv <- dat[[x.name]]
-  wx <- wy <- dat[[w.name]]
-
-  if (!is.null(y.name))
-    yv <- dat[[y.name]]
-  else
-    yv <- NULL
-
-  nx <- ny <- nrow(dat)
-
-  weighted_ttest_helper(xv, yv, wx, wy, nx, ny, mu, paired, alternative, ci.lvl, x.name, y.name, NULL)
-}
-
-
-#' @rdname weighted_sd
-#' @export
-weighted_ttest.formula <- function(formula, data, mu = 0, paired = FALSE, ci.lvl = 0.95, alternative = c("two.sided", "less", "greater"), ...) {
-
-  if (!missing(ci.lvl) & (length(ci.lvl) != 1 || !is.finite(ci.lvl) || ci.lvl < 0 || ci.lvl > 1))
-    stop("'ci.lvl' must be a single number between 0 and 1")
-
-  alternative <- match.arg(alternative)
-
-  vars <- all.vars(formula)
-
-  g <- data[[vars[2]]]
-
-  if (is.factor(g))
-    grps <- levels(g)
-  else
-    grps <- na.omit(sort(unique(g)))
-
-  if (length(grps) > 2)
-    stop("Grouping factor has more than two levels.")
-
-  if (length(vars) < 3) {
-    vars <- c(vars, "weights")
-    data$weights <- 1
-  }
-
-  x <- data[[vars[1]]]
-  y <- data[[vars[2]]]
-  w <- data[[vars[3]]]
-
-  xv <- x[y == grps[1]]
-  yv <- x[y == grps[2]]
-  wx <- w[y == grps[1]]
-  wy <- w[y == grps[2]]
-
-  mxv <- is.na(xv)
-  xv <- xv[!mxv]
-  wx <- wx[!mxv]
-
-  myv <- is.na(yv)
-  yv <- yv[!myv]
-  wy <- wy[!myv]
-
-  nx <- length(xv)
-  ny <- length(yv)
-
-  labs <- sjlabelled::get_labels(
-    data[[vars[2]]],
-    attr.only = FALSE,
-    values = "p",
-    drop.na = TRUE,
-    drop.unused = TRUE
-  )
-
-  weighted_ttest_helper(xv, yv, wx, wy, nx, ny, mu, paired, alternative, ci.lvl, vars[1], vars[2], labs)
-}
-
-
-weighted_ttest_helper <- function(xv, yv, wx, wy, nx, ny, mu, paired, alternative, ci.lvl, x.name, y.name, group.name) {
-  if (paired) {
-    xv <- xv - yv
-    yv <- NULL
-  }
-
-  mu.x.w <- stats::weighted.mean(xv, wx)
-  var.x.w <- weighted_sd(xv, wx)^2
-  se.x <- sqrt(var.x.w / nx)
-
-  if (!is.null(yv)) {
-    mu.y.w <- stats::weighted.mean(yv, wy)
-    var.y.w <- weighted_sd(yv, wy)^2
-    se.y <- sqrt(var.y.w / ny)
-
-    se <- sqrt(se.x^2 + se.y^2)
-    df <- se^4 / (se.x^4 / (nx - 1) + se.y^4 / (ny - 1))
-    tstat <- (mu.x.w - mu.y.w - mu) / se
-
-    estimate <- c(mu.x.w, mu.y.w)
-    names(estimate) <- c("mean of x", "mean of y")
-    method <- "Two-Sample t-test"
-  } else {
-    se <- se.x
-    df <- nx - 1
-    tstat <- (mu.x.w - mu) / se
-
-    estimate <- stats::setNames(mu.x.w, if (paired) "mean of the differences" else "mean of x")
-    method <- if (paired) "Paired t-test" else "One Sample t-test"
-  }
-
-
-
-  if (alternative == "less") {
-    pval <- stats::pt(tstat, df)
-    cint <- c(-Inf, tstat + stats::qt(ci.lvl, df))
-  } else if (alternative == "greater") {
-    pval <- stats::pt(tstat, df, lower.tail = FALSE)
-    cint <- c(tstat - stats::qt(ci.lvl, df), Inf)
-  } else {
-    pval <- 2 * stats::pt(-abs(tstat), df)
-    alpha <- 1 - ci.lvl
-    cint <- stats::qt(1 - alpha / 2, df)
-    cint <- tstat + c(-cint, cint)
-  }
-
-  cint <- mu + cint * se
-
-  names(tstat) <- "t"
-  names(df) <- "df"
-  names(mu) <- if (paired || !is.null(yv)) "difference in means" else "mean"
-
-
-  tt <- structure(
-    class = "sj_ttest",
-    list(
-      estimate = estimate,
-      statistic = tstat,
-      df = df,
-      p.value = pval,
-      ci = cint,
-      alternative = alternative,
-      method = method
-    )
-  )
-
-  attr(tt, "x.name") <- x.name
-  attr(tt, "y.name") <- y.name
-  attr(tt, "group.name") <- group.name
-
-  tt
-}
diff --git a/R/wtd_variance.R b/R/wtd_variance.R
index 12f77f28..0bde43a2 100644
--- a/R/wtd_variance.R
+++ b/R/wtd_variance.R
@@ -4,8 +4,8 @@ weighted_variance <- function(x, w) {
   x[is.na(w)] <- NA
   w[is.na(x)] <- NA

-  w <- na.omit(w)
-  x <- na.omit(x)
+  w <- stats::na.omit(w)
+  x <- stats::na.omit(x)

   xbar <- sum(w * x) / sum(w)
   sum(w * ((x - xbar)^2)) / (sum(w) - 1)
diff --git a/R/xtab_statistics.R b/R/xtab_statistics.R
index 21824194..19fc5abf 100644
--- a/R/xtab_statistics.R
+++ b/R/xtab_statistics.R
@@ -113,7 +113,20 @@ crosstable_statistics <- function(data, x1 = NULL, x2 = NULL, statistics = c("au
   stat.html <- NULL

   # check if data is a table
-  if (!is.table(data)) {
+  if (is.table(data)) {
+    # 'data' is a table - copy to table object
+    tab <- data
+    # check if statistics are possible to compute
+    if (statistics %in% c("spearman", "kendall", "pearson")) {
+      stop(
+        sprintf(
+          "Need arguments `data`, `x1` and `x2` to compute %s-statistics.",
+          statistics
+        ),
+        call. = FALSE
+      )
+    }
+  } else {
     # evaluate unquoted names
     x1 <- deparse(substitute(x1))
     x2 <- deparse(substitute(x2))
@@ -124,7 +137,7 @@ crosstable_statistics <- function(data, x1 = NULL, x2 = NULL, statistics = c("au
     x2 <- gsub("\"", "", x2, fixed = TRUE)
     weights <- gsub("\"", "", weights, fixed = TRUE)

-    if (sjmisc::is_empty(weights) || weights == "NULL")
+    if (insight::is_empty_object(weights) || weights == "NULL")
       weights <- NULL
     else
       weights <- data[[weights]]
@@ -145,19 +158,6 @@ crosstable_statistics <- function(data, x1 = NULL, x2 = NULL, statistics = c("au
     } else {
       tab <- table(data)
     }
-  } else {
-    # 'data' is a table - copy to table object
-    tab <- data
-    # check if statistics are possible to compute
-    if (statistics %in% c("spearman", "kendall", "pearson")) {
-      stop(
-        sprintf(
-          "Need arguments `data`, `x1` and `x2` to compute %s-statistics.",
-          statistics
-        ),
-        call. = FALSE
-      )
-    }
   }

   # get expected values
@@ -217,21 +217,21 @@ crosstable_statistics <- function(data, x1 = NULL, x2 = NULL, statistics = c("au
   }

   # compute method string
-  method <- dplyr::case_when(
-    statistics == "kendall" ~ "Kendall's tau",
-    statistics == "spearman" ~ "Spearman's rho",
-    statistics == "pearson" ~ "Pearson's r",
-    statistics == "cramer" ~ "Cramer's V",
-    statistics == "phi" ~ "Phi"
+  method <- ifelse(statistics == "kendall", "Kendall's tau",
+    ifelse(statistics == "spearman", "Spearman's rho", # nolint
+      ifelse(statistics == "pearson", "Pearson's r", # nolint
+        ifelse(statistics == "cramer", "Cramer's V", "Phi") # nolint
+      )
+    )
   )

   # compute method string
-  method.html <- dplyr::case_when(
-    statistics == "kendall" ~ "Kendall's τ",
-    statistics == "spearman" ~ "Spearman's ρ",
-    statistics == "pearson" ~ "Pearson's r",
-    statistics == "cramer" ~ "Cramer's V",
-    statistics == "phi" ~ "φ"
+  method.html <- ifelse(statistics == "kendall", "Kendall's τ",
+    ifelse(statistics == "spearman", "Spearman's ρ", # nolint
+      ifelse(statistics == "pearson", "Pearson's r", # nolint
+        ifelse(statistics == "cramer", "Cramer's V", "φ") # nolint
+      )
+    )
   )

   # return result
diff --git a/_pkgdown.yml b/_pkgdown.yml
index 2fb1d7b8..fa456107 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -25,7 +25,7 @@ reference:
   - title: "Weighted Estimates and Dispersion"
     contents:
       - weight
-      - weighted_ttest
+      - weighted_se

   - title: "Summary Statistics and Tests"
     contents:
@@ -33,6 +33,8 @@ reference:
       - chi_squared_test
       - kruskal_wallis_test
       - mann_whitney_test
+      - t_test
+      - wilcoxon_test
       - var_pop

   - title: "Tools for Regression Models"
diff --git a/man/anova_stats.Rd b/man/anova_stats.Rd
index 77f7e82f..a64263e9 100644
--- a/man/anova_stats.Rd
+++ b/man/anova_stats.Rd
@@ -22,6 +22,7 @@ epsilon-squared statistic or Cohen's F for all terms in an anovas.
and power for each term.
}
\examples{
+\dontshow{if (requireNamespace("car") && requireNamespace("pwr")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
# load sample data
data(efc)

# fit linear model
fit <- aov(
  c12hour ~ as.factor(e42dep) + as.factor(c172code) + c160age,
  data = efc
)
-\dontrun{
 anova_stats(car::Anova(fit, type = 2))
-}
+\dontshow{\}) # examplesIf}
}
\references{
Levine TR, Hullett CR (2002): Eta Squared, Partial Eta Squared, and Misreporting of Effect Size in Communication Research.
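The nested `ifelse()` chains above replace `dplyr::case_when()` as part of dropping the dplyr dependency, keeping the same fall-through semantics. If `statistics` is a single string at this point (e.g. after `match.arg()`), base R's `switch()` would be an equivalent, flatter alternative; a sketch under that assumption (the helper name `.method_string` is illustrative, not part of the package):

# map a statistics keyword to its display name; the last, unnamed
# element acts as the default, like the innermost ifelse() branch
.method_string <- function(statistics) {
  switch(statistics,
    kendall = "Kendall's tau",
    spearman = "Spearman's rho",
    pearson = "Pearson's r",
    cramer = "Cramer's V",
    "Phi"
  )
}
.method_string("spearman") # returns "Spearman's rho"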
diff --git a/man/auto_prior.Rd b/man/auto_prior.Rd
index f7701564..64047399 100644
--- a/man/auto_prior.Rd
+++ b/man/auto_prior.Rd
@@ -48,19 +48,16 @@ formula used in \code{brms::brm()} must be rewritten to something like
\code{y ~ 0 + intercept ...}, see \code{\link[brms]{set_prior}}.
}
\examples{
-library(sjmisc)
+\dontshow{if (requireNamespace("brms")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
data(efc)
efc$c172code <- as.factor(efc$c172code)
-efc$c161sex <- to_label(efc$c161sex)
+efc$c161sex <- as.factor(efc$c161sex)

mf <- formula(neg_c_7 ~ c161sex + c160age + c172code)
-
-if (requireNamespace("brms", quietly = TRUE))
-  auto_prior(mf, efc, TRUE)
+auto_prior(mf, efc, TRUE)

## compare to
-# library(rstanarm)
-# m <- stan_glm(mf, data = efc, chains = 2, iter = 200)
+# m <- rstanarm::stan_glm(mf, data = efc, chains = 2, iter = 200)
# ps <- prior_summary(m)
# ps$prior_intercept$adjusted_scale
# ps$prior$adjusted_scale
@@ -72,15 +69,12 @@ if (requireNamespace("brms", quietly = TRUE))

# add informative priors
mf <- formula(neg_c_7 ~ c161sex + c172code)
-if (requireNamespace("brms", quietly = TRUE)) {
-  auto_prior(mf, efc, TRUE) +
-    brms::prior(normal(.1554, 40), class = "b", coef = "c160age")
-}
+auto_prior(mf, efc, TRUE) +
+  brms::prior(normal(.1554, 40), class = "b", coef = "c160age")

# example with binary response
efc$neg_c_7d <- ifelse(efc$neg_c_7 < median(efc$neg_c_7, na.rm = TRUE), 0, 1)
mf <- formula(neg_c_7d ~ c161sex + c160age + c172code + e17age)
-
-if (requireNamespace("brms", quietly = TRUE))
-  auto_prior(mf, efc, FALSE)
+auto_prior(mf, efc, FALSE)
+\dontshow{\}) # examplesIf}
}
diff --git a/man/boot_ci.Rd b/man/boot_ci.Rd
index c437f137..893080e8 100644
--- a/man/boot_ci.Rd
+++ b/man/boot_ci.Rd
@@ -7,36 +7,33 @@
\alias{boot_est}
\title{Standard error and confidence intervals for bootstrapped estimates}
\usage{
-boot_ci(data, ..., method = c("dist", "quantile"), ci.lvl = 0.95)
+boot_ci(data, select = NULL, method = c("dist", "quantile"), ci.lvl = 0.95)

-boot_se(data, ...)
+boot_se(data, select = NULL)

-boot_p(data, ...)
+boot_p(data, select = NULL)

-boot_est(data, ...)
+boot_est(data, select = NULL)
}
\arguments{
\item{data}{A data frame that contains the vector with bootstrapped
estimates, or directly the vector (see 'Examples').}

-\item{...}{Optional, unquoted names of variables with bootstrapped estimates.
-Required, if either \code{data} is a data frame (and no vector),
-and only selected variables from \code{data} should be processed.
-You may also use functions like \code{:} or tidyselect's
-\code{select_helpers()}.}
+\item{select}{Optional, names of variables (as character vector) with
+bootstrapped estimates. Required if \code{data} is a data frame (and not a
+vector) and only selected variables from \code{data} should be processed.}

\item{method}{Character vector, indicating if confidence intervals should be
-based on bootstrap standard error, multiplied by the value of the
-quantile function of the t-distribution (default), or on sample
-quantiles of the bootstrapped values. See 'Details' in \code{boot_ci()}.
-May be abbreviated.}
+based on bootstrap standard error, multiplied by the value of the quantile
+function of the t-distribution (default), or on sample quantiles of the
+bootstrapped values. See 'Details' in \code{boot_ci()}. May be abbreviated.}

\item{ci.lvl}{Numeric, the level of the confidence intervals.}
}
\value{
-A data frame with either bootstrap estimate,
-standard error, the lower and upper confidence intervals or the
-p-value for all bootstrapped estimates.
+A data frame with either bootstrap estimate, standard error, the
+lower and upper confidence intervals or the p-value for all bootstrapped
+estimates.
}
\description{
Compute nonparametric bootstrap estimate, standard error,
@@ -44,45 +41,38 @@ confidence intervals and p-value for a vector of bootstrap replicate
estimates.
}
\details{
-The methods require one or more vectors of bootstrap replicate estimates
-as input.
+The methods require one or more vectors of bootstrap replicate
+estimates as input.
\itemize{
-\item{
-\code{boot_est()} returns the bootstrapped estimate, simply by
-computing the mean value of all bootstrap estimates.
-}
-\item{
-\code{boot_se()} computes the nonparametric bootstrap standard
-error by calculating the standard deviation of the input vector.
-}
-\item{
-The mean value of the input vector and its standard error is used
-by \code{boot_ci()} to calculate the lower and upper confidence
-interval, assuming a t-distribution of bootstrap estimate replicates
-(for \code{method = "dist"}, the default, which is
-\code{mean(x) +/- qt(.975, df = length(x) - 1) * sd(x)}); for
-\code{method = "quantile"}, 95\\% sample quantiles are used to compute
-the confidence intervals (\code{quantile(x, probs = c(.025, .975))}).
-Use \code{ci.lvl} to change the level for the confidence interval.
-}
-\item{
-P-values from \code{boot_p()} are also based on t-statistics,
-assuming normal distribution.
-}
+\item \code{boot_est()}: returns the bootstrapped estimate, simply by computing
+the mean value of all bootstrap estimates.
+\item \code{boot_se()}: computes the nonparametric bootstrap standard error by
+calculating the standard deviation of the input vector.
+\item The mean value of the input vector and its standard error is used by
+\code{boot_ci()} to calculate the lower and upper confidence interval,
+assuming a t-distribution of bootstrap estimate replicates (for
+\code{method = "dist"}, the default, which is
+\verb{mean(x) +/- qt(.975, df = length(x) - 1) * sd(x)}); for
+\code{method = "quantile"}, 95\\% sample quantiles are used to compute the
+confidence intervals (\code{quantile(x, probs = c(0.025, 0.975))}). Use
+\code{ci.lvl} to change the level for the confidence interval.
+\item P-values from \code{boot_p()} are also based on t-statistics, assuming normal
+distribution.
}
}
\examples{
-library(dplyr)
-library(purrr)
data(efc)
bs <- bootstrap(efc, 100)

# now run models for each bootstrapped sample
-bs$models <- map(bs$strap, ~lm(neg_c_7 ~ e42dep + c161sex, data = .x))
+bs$models <- lapply(
+  bs$strap,
+  function(.x) lm(neg_c_7 ~ e42dep + c161sex, data = .x)
+)

# extract coefficient "dependency" and "gender" from each model
-bs$dependency <- map_dbl(bs$models, ~coef(.x)[2])
-bs$gender <- map_dbl(bs$models, ~coef(.x)[3])
+bs$dependency <- vapply(bs$models, function(x) coef(x)[2], numeric(1))
+bs$gender <- vapply(bs$models, function(x) coef(x)[3], numeric(1))

# get bootstrapped confidence intervals
boot_ci(bs$dependency)
@@ -93,63 +83,19 @@ confint(fit)[2, ]

# alternative function calls.
boot_ci(bs$dependency)
-boot_ci(bs, dependency)
-boot_ci(bs, dependency, gender)
-boot_ci(bs, dependency, gender, method = "q")
+boot_ci(bs, "dependency")
+boot_ci(bs, c("dependency", "gender"))
+boot_ci(bs, c("dependency", "gender"), method = "q")

# compare coefficients
mean(bs$dependency)
boot_est(bs$dependency)
coef(fit)[2]
-
-
-# bootstrap() and boot_ci() work fine within pipe-chains
-efc \%>\%
-  bootstrap(100) \%>\%
-  mutate(
-    models = map(strap, ~lm(neg_c_7 ~ e42dep + c161sex, data = .x)),
-    dependency = map_dbl(models, ~coef(.x)[2])
-  ) \%>\%
-  boot_ci(dependency)
-
-# check p-value
-boot_p(bs$gender)
-summary(fit)$coefficients[3, ]
-
-\dontrun{
-# 'spread_coef()' from the 'sjmisc'-package makes it easy to generate
-# bootstrapped statistics like confidence intervals or p-values
-library(dplyr)
-library(sjmisc)
-efc \%>\%
-  # generate bootstrap replicates
-  bootstrap(100) \%>\%
-  # apply lm to all bootstrapped data sets
-  mutate(
-    models = map(strap, ~lm(neg_c_7 ~ e42dep + c161sex + c172code, data = .x))
-  ) \%>\%
-  # spread model coefficient for all 100 models
-  spread_coef(models) \%>\%
-  # compute the CI for all bootstrapped model coefficients
-  boot_ci(e42dep, c161sex, c172code)
-
-# or...
-efc \%>\%
-  # generate bootstrap replicates
-  bootstrap(100) \%>\%
-  # apply lm to all bootstrapped data sets
-  mutate(
-    models = map(strap, ~lm(neg_c_7 ~ e42dep + c161sex + c172code, data = .x))
-  ) \%>\%
-  # spread model coefficient for all 100 models
-  spread_coef(models, append = FALSE) \%>\%
-  # compute the CI for all bootstrapped model coefficients
-  boot_ci()}
}
\references{
Carpenter J, Bithell J. Bootstrap confidence intervals: when, which,
what? A practical guide for medical statisticians.
Statist. Med. 2000; 19:1141-1164
}
\seealso{
-\code{\link{bootstrap}} to generate nonparametric bootstrap samples.
+\code{\link[=bootstrap]{bootstrap()}} to generate nonparametric bootstrap samples.
}
diff --git a/man/bootstrap.Rd b/man/bootstrap.Rd
index ac2e2c63..e40b7845 100644
--- a/man/bootstrap.Rd
+++ b/man/bootstrap.Rd
@@ -69,17 +69,11 @@ bs$c12hour <- unlist(lapply(bs$strap, function(x) {
  mean(as.data.frame(x)$c12hour, na.rm = TRUE)
}))

-# or as tidyverse-approach
-if (require("dplyr") && require("purrr")) {
-  bs <- efc \%>\%
-    bootstrap(100) \%>\%
-    mutate(
-      c12hour = map_dbl(strap, ~mean(as.data.frame(.x)$c12hour, na.rm = TRUE))
-    )
+# bootstrapped standard error
+boot_se(bs, "c12hour")

-  # bootstrapped standard error
-  boot_se(bs, c12hour)
-}
+# bootstrapped CI
+boot_ci(bs, "c12hour")
}
\seealso{
\code{\link{boot_ci}} to calculate confidence intervals from
diff --git a/man/chi_squared_test.Rd b/man/chi_squared_test.Rd
index f6dd5d77..9b60c041 100644
--- a/man/chi_squared_test.Rd
+++ b/man/chi_squared_test.Rd
@@ -17,15 +17,25 @@ chi_squared_test(
\arguments{
\item{data}{A data frame.}

-\item{select}{Name of the dependent variable (as string) to be used for the
-test. \code{select} can also be a character vector, specifying the names of
-multiple continuous variables. In this case, \code{by} is ignored and variables
-specified in \code{select} are used to compute the test. This can be useful if
-the data is in wide-format and no grouping variable is available.}
+\item{select}{Name(s) of the continuous variable(s) (as character vector)
+to be used as samples for the test. \code{select} can be one of the following:
+\itemize{
+\item \code{select} can be used in combination with \code{by}, in which case \code{select} is
+the name of the continuous variable (and \code{by} indicates a grouping factor).
+\item \code{select} can also be a character vector of length two or more (more than
+two names only apply to \code{kruskal_wallis_test()}), in which case the two
+continuous variables are treated as samples to be compared. \code{by} must be
+\code{NULL} in this case.
+\item If \code{select} is of length \strong{two} and \code{paired = TRUE}, the two samples
+are considered as \emph{dependent} and a paired test is carried out.
+\item If \code{select} specifies \strong{one} variable and \code{by = NULL}, a one-sample test
+is carried out (only applicable for \code{t_test()} and \code{wilcoxon_test()}).
+}}

-\item{by}{Name of the grouping variable to be used for the test. If \code{by} is
-not a factor, it will be coerced to a factor. For \code{chi_squared_test()}, if
-\code{probabilities} is provided, \code{by} must be \code{NULL}.}
+\item{by}{Name of the variable indicating the groups. Required if \code{select}
+specifies only one variable that contains all samples to be compared in the
+test. If \code{by} is not a factor, it will be coerced to a factor. For
+\code{chi_squared_test()}, if \code{probabilities} is provided, \code{by} must be \code{NULL}.}

\item{probabilities}{A numeric vector of probabilities for each cell in the
contingency table. The length of the vector must match the number of cells
@@ -48,7 +58,7 @@ for 2x2 tables, and \ifelse{latex}{\eqn{Fei}}{פ (Fei)} for tests against
given probabilities.
}
\description{
-This function performs a \eqn{chi}^2 test for contingency
+This function performs a \eqn{\chi^2} test for contingency
tables or tests for given probabilities. The returned effect sizes are
Cramer's V for tables with more than two rows and columns, Phi (\eqn{\phi})
for 2x2 tables, and \ifelse{latex}{\eqn{Fei}}{פ (Fei)} for tests against
@@ -65,8 +75,47 @@ a McNemar test (see \code{\link[=mcnemar.test]{mcnemar.test()}}) is conducted.

The weighted version of the chi-squared test is based on a weighted
table, using \code{\link[=xtabs]{xtabs()}} as input for \code{chisq.test()}.
+
+Interpretation of effect sizes is based on rules described in
+\code{\link[effectsize:interpret_r]{effectsize::interpret_phi()}}, \code{\link[effectsize:interpret_r]{effectsize::interpret_cramers_v()}},
+and \code{\link[effectsize:interpret_r]{effectsize::interpret_fei()}}.
+}
+\section{Which test to use}{
+
+The following table provides an overview of which test to use for different
+types of data. The choice of test depends on the scale of the outcome
+variable and the number of samples to compare.\tabular{lll}{
+   \strong{Samples} \tab \strong{Scale of Outcome} \tab \strong{Significance Test} \cr
+   1 \tab binary / nominal \tab \code{chi_squared_test()} \cr
+   1 \tab continuous, not normal \tab \code{wilcoxon_test()} \cr
+   1 \tab continuous, normal \tab \code{t_test()} \cr
+   2, independent \tab binary / nominal \tab \code{chi_squared_test()} \cr
+   2, independent \tab continuous, not normal \tab \code{mann_whitney_test()} \cr
+   2, independent \tab continuous, normal \tab \code{t_test()} \cr
+   2, dependent \tab binary (only 2x2) \tab \code{chi_squared_test(paired=TRUE)} \cr
+   2, dependent \tab continuous, not normal \tab \code{wilcoxon_test()} \cr
+   2, dependent \tab continuous, normal \tab \code{t_test(paired=TRUE)} \cr
+   >2, independent \tab continuous, not normal \tab \code{kruskal_wallis_test()} \cr
+   >2, independent \tab continuous, normal \tab \code{datawizard::means_by_group()} \cr
+   >2, dependent \tab continuous, not normal \tab \emph{not yet implemented} (1) \cr
+   >2, dependent \tab continuous, normal \tab \emph{not yet implemented} (2) \cr
+}
+
+
+(1) More than two dependent samples are considered as \emph{repeated measurements}.
+For ordinal or not-normally distributed outcomes, these samples are
+usually tested using a \code{\link[=friedman.test]{friedman.test()}}, which requires the samples
+in one variable, the groups to compare in another variable, and a third
+variable indicating the repeated measurements (subject IDs).
+
+(2) More than two dependent samples are considered as \emph{repeated measurements}.
+For normally distributed outcomes, these samples are usually tested using
+an ANOVA for repeated measurements. A more sophisticated approach would
+be to use a linear mixed model.
+}
+
\examples{
+\dontshow{if (requireNamespace("effectsize")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
data(efc)
efc$weight <- abs(rnorm(nrow(efc), 1, 0.3))
@@ -78,10 +127,26 @@ chi_squared_test(efc, "c161sex", by = "e16sex", weights = "weight")

# Chi-squared test for given probabilities
chi_squared_test(efc, "c161sex", probabilities = c(0.3, 0.7))
+\dontshow{\}) # examplesIf}
}
\references{
-Ben-Shachar, M.S., Patil, I., Thériault, R., Wiernik, B.M.,
+\itemize{
+\item Ben-Shachar, M.S., Patil, I., Thériault, R., Wiernik, B.M.,
Lüdecke, D. (2023). Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data
That Use the Chi‑Squared Statistic. Mathematics, 11, 1982. \doi{10.3390/math11091982}
+\item Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests.
+Dtsch Med Wochenschr 2007; 132: e24–e25
+\item du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer
+Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8
+}
+}
+\seealso{
+\itemize{
+\item \code{\link[=mann_whitney_test]{mann_whitney_test()}} for unpaired (independent) samples.
+\item \code{\link[=t_test]{t_test()}} for parametric t-tests.
+\item \code{\link[=kruskal_wallis_test]{kruskal_wallis_test()}} for non-parametric ANOVA (i.e. more than two samples).
+\item \code{\link[=wilcoxon_test]{wilcoxon_test()}} for Wilcoxon signed-rank tests for paired (dependent) samples.
+\item \code{\link[=chi_squared_test]{chi_squared_test()}} for chi-squared tests (two categorical variables).
+}
}
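The decision table above maps directly onto function calls in this package. A short sketch using the `efc` example data (variable choices are illustrative: `c161sex` and `e16sex` are binary, `c12hour` is right-skewed, `e42dep` has more than two categories):

data(efc)

# two independent samples, binary/nominal outcome: chi-squared test
chi_squared_test(efc, "c161sex", by = "e16sex")

# two independent samples, continuous but skewed outcome: Mann-Whitney test
mann_whitney_test(efc, "c12hour", by = "e16sex")

# more than two independent samples, skewed outcome: Kruskal-Wallis test
kruskal_wallis_test(efc, "c12hour", by = "e42dep")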
diff --git a/man/crosstable_statistics.Rd b/man/crosstable_statistics.Rd
index cbea412f..7ae56212 100644
--- a/man/crosstable_statistics.Rd
+++ b/man/crosstable_statistics.Rd
@@ -65,10 +65,9 @@ frame including lower and upper confidence intervals.}
\item{n}{Number of bootstraps to be generated.}

\item{method}{Character vector, indicating if confidence intervals should be
-based on bootstrap standard error, multiplied by the value of the
-quantile function of the t-distribution (default), or on sample
-quantiles of the bootstrapped values. See 'Details' in \code{boot_ci()}.
-May be abbreviated.}
+based on bootstrap standard error, multiplied by the value of the quantile
+function of the t-distribution (default), or on sample quantiles of the
+bootstrapped values. See 'Details' in \code{boot_ci()}. May be abbreviated.}

\item{x1}{Name of first variable that should be used to compute the
contingency table. If \code{data} is a table object, this argument will be
diff --git a/man/gmd.Rd b/man/gmd.Rd
index 03823df5..7c549f8a 100644
--- a/man/gmd.Rd
+++ b/man/gmd.Rd
@@ -4,15 +4,14 @@
\alias{gmd}
\title{Gini's Mean Difference}
\usage{
-gmd(x, ...)
+gmd(x, select = NULL)
}
\arguments{
\item{x}{A vector or data frame.}

-\item{...}{Optional, unquoted names of variables that should be selected for
-further processing. Required, if \code{x} is a data frame (and no vector)
-and only selected variables from \code{x} should be processed. You may also
-use functions like \code{:} or tidyselect's \code{select_helpers()}.}
+\item{select}{Optional, names of variables (as character vector) that should
+be selected for further processing. Required if \code{x} is a data frame (and
+not a vector) and only selected variables from \code{x} should be processed.}
}
\value{
For numeric vectors, Gini's mean difference. For non-numeric vectors
@@ -24,13 +23,13 @@ or for all numeric vectors in a data frame.
}
\note{
Gini's mean difference is defined as the mean absolute difference between
-any two distinct elements of a vector. Missing values from \code{x} are
-silently removed.
+any two distinct elements of a vector. Missing values from \code{x} are silently
+removed.
}
\examples{
data(efc)
gmd(efc$e17age)
-gmd(efc, e17age, c160age, c12hour)
+gmd(efc, c("e17age", "c160age", "c12hour"))
}
\references{
diff --git a/man/inequ_trend.Rd b/man/inequ_trend.Rd
index 5a2bbc9f..25a188ed 100644
--- a/man/inequ_trend.Rd
+++ b/man/inequ_trend.Rd
@@ -36,6 +36,7 @@ in changes in rate differences and rate ratios. The function
implements the algorithm proposed by \emph{Mackenbach et al. 2015}.
}
\examples{
+\dontshow{if (requireNamespace("ggplot2")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}

# This example reproduces Fig. 1 of Mackenbach et al. 2015, p.5

# 40 simulated time points, with an initial rate ratio of 2 and
@@ -56,14 +57,14 @@ for (i in 2:n) hi[i] <- hi[i - 1] * .97
prev.data <- data.frame(lo, hi)

# print values
-inequ_trend(prev.data, lo, hi)
+inequ_trend(prev.data, "lo", "hi")

# plot trends - here we see that the relative inequalities
# are increasing over time, while the absolute inequalities
# are first increasing as well, but later are decreasing
# (while rel. inequ. are still increasing)
-plot(inequ_trend(prev.data, lo, hi))
-
+plot(inequ_trend(prev.data, "lo", "hi"))
+\dontshow{\}) # examplesIf}
}
\references{
Mackenbach JP, Martikainen P, Menvielle G, de Gelder R. 2015.
The Arithmetic of Reducing Relative and Absolute Inequalities in Health:
A Theoretical Analysis Illustrated with European Mortality Data.
Journal of Epidemiology and Community Health 70(7): 730-36.
\doi{10.1136/jech-2015-207018}
diff --git a/man/kruskal_wallis_test.Rd b/man/kruskal_wallis_test.Rd
index d3033c42..8ca0bc9d 100644
--- a/man/kruskal_wallis_test.Rd
+++ b/man/kruskal_wallis_test.Rd
@@ -9,15 +9,25 @@ kruskal_wallis_test(data, select = NULL, by = NULL, weights = NULL)
\arguments{
\item{data}{A data frame.}

-\item{select}{Name of the dependent variable (as string) to be used for the
-test. \code{select} can also be a character vector, specifying the names of
-multiple continuous variables. In this case, \code{by} is ignored and variables
-specified in \code{select} are used to compute the test. This can be useful if
-the data is in wide-format and no grouping variable is available.}
+\item{select}{Name(s) of the continuous variable(s) (as character vector)
+to be used as samples for the test. \code{select} can be one of the following:
+\itemize{
+\item \code{select} can be used in combination with \code{by}, in which case \code{select} is
+the name of the continuous variable (and \code{by} indicates a grouping factor).
+\item \code{select} can also be a character vector of length two or more (more than
+two names only apply to \code{kruskal_wallis_test()}), in which case the two
+continuous variables are treated as samples to be compared. \code{by} must be
+\code{NULL} in this case.
+\item If \code{select} is of length \strong{two} and \code{paired = TRUE}, the two samples
+are considered as \emph{dependent} and a paired test is carried out.
+\item If \code{select} specifies \strong{one} variable and \code{by = NULL}, a one-sample test
+is carried out (only applicable for \code{t_test()} and \code{wilcoxon_test()}).
+}}

-\item{by}{Name of the grouping variable to be used for the test. If \code{by} is
-not a factor, it will be coerced to a factor. For \code{chi_squared_test()}, if
-\code{probabilities} is provided, \code{by} must be \code{NULL}.}
+\item{by}{Name of the variable indicating the groups. Required if \code{select}
+specifies only one variable that contains all samples to be compared in the
+test. If \code{by} is not a factor, it will be coerced to a factor. For
+\code{chi_squared_test()}, if \code{probabilities} is provided, \code{by} must be \code{NULL}.}

\item{weights}{Name of an (optional) weighting variable to be used for the test.}
}
@@ -27,13 +37,49 @@ A data frame with test results.
\description{
This function performs a Kruskal-Wallis rank sum test, to test the null
hypothesis that the population medians of all of the groups are
-equal. The alternative is that they differ in at least one.
+equal. The alternative is that they differ in at least one. Unlike the
+underlying base R function \code{kruskal.test()}, this function allows for
+weighted tests.
}
\details{
The function is simply a wrapper around \code{\link[=kruskal.test]{kruskal.test()}}. The weighted
version of the Kruskal-Wallis test is based on the \strong{survey} package, using
\code{\link[survey:svyranktest]{survey::svyranktest()}}.
}
+\section{Which test to use}{
+
+The following table provides an overview of which test to use for different
+types of data. The choice of test depends on the scale of the outcome
+variable and the number of samples to compare.\tabular{lll}{
+   \strong{Samples} \tab \strong{Scale of Outcome} \tab \strong{Significance Test} \cr
+   1 \tab binary / nominal \tab \code{chi_squared_test()} \cr
+   1 \tab continuous, not normal \tab \code{wilcoxon_test()} \cr
+   1 \tab continuous, normal \tab \code{t_test()} \cr
+   2, independent \tab binary / nominal \tab \code{chi_squared_test()} \cr
+   2, independent \tab continuous, not normal \tab \code{mann_whitney_test()} \cr
+   2, independent \tab continuous, normal \tab \code{t_test()} \cr
+   2, dependent \tab binary (only 2x2) \tab \code{chi_squared_test(paired=TRUE)} \cr
+   2, dependent \tab continuous, not normal \tab \code{wilcoxon_test()} \cr
+   2, dependent \tab continuous, normal \tab \code{t_test(paired=TRUE)} \cr
+   >2, independent \tab continuous, not normal \tab \code{kruskal_wallis_test()} \cr
+   >2, independent \tab continuous, normal \tab \code{datawizard::means_by_group()} \cr
+   >2, dependent \tab continuous, not normal \tab \emph{not yet implemented} (1) \cr
+   >2, dependent \tab continuous, normal \tab \emph{not yet implemented} (2) \cr
+}
+
+
+(1) More than two dependent samples are considered as \emph{repeated measurements}.
+For ordinal or not-normally distributed outcomes, these samples are
+usually tested using a \code{\link[=friedman.test]{friedman.test()}}, which requires the samples
+in one variable, the groups to compare in another variable, and a third
+variable indicating the repeated measurements (subject IDs).
+
+(2) More than two dependent samples are considered as \emph{repeated measurements}.
+For normally distributed outcomes, these samples are usually tested using
+an ANOVA for repeated measurements. A more sophisticated approach would
+be to use a linear mixed model.
+}
+
\examples{
data(efc)
# Kruskal-Wallis test for elder's age by education
@@ -55,4 +101,23 @@ long_data <- data.frame(
  groups = rep(c("A", "B", "C"), each = 20)
)
kruskal_wallis_test(long_data, select = "scales", by = "groups")
+# base R equivalent
+kruskal.test(scales ~ groups, data = long_data)
+}
+\references{
+\itemize{
+\item Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests.
+Dtsch Med Wochenschr 2007; 132: e24–e25
+\item du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer
+Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8
+}
+}
+\seealso{
+\itemize{
+\item \code{\link[=mann_whitney_test]{mann_whitney_test()}} for unpaired (independent) samples.
+\item \code{\link[=t_test]{t_test()}} for parametric t-tests.
+\item \code{\link[=kruskal_wallis_test]{kruskal_wallis_test()}} for non-parametric ANOVA (i.e. more than two samples).
+\item \code{\link[=wilcoxon_test]{wilcoxon_test()}} for Wilcoxon signed-rank tests for paired (dependent) samples.
+\item \code{\link[=chi_squared_test]{chi_squared_test()}} for chi-squared tests (two categorical variables).
+}
}
diff --git a/man/mann_whitney_test.Rd b/man/mann_whitney_test.Rd
index ab376af7..2e3787be 100644
--- a/man/mann_whitney_test.Rd
+++ b/man/mann_whitney_test.Rd
@@ -2,46 +2,69 @@
% Please edit documentation in R/mann_whitney_test.R
\name{mann_whitney_test}
\alias{mann_whitney_test}
-\title{Mann-Whitney-Test}
+\title{Mann-Whitney test}
\usage{
mann_whitney_test(
  data,
  select = NULL,
  by = NULL,
  weights = NULL,
-  distribution = "asymptotic"
+  mu = 0,
+  alternative = "two.sided",
+  ...
)
}
\arguments{
\item{data}{A data frame.}

-\item{select}{Name of the dependent variable (as string) to be used for the
-test. \code{select} can also be a character vector, specifying the names of
-multiple continuous variables. In this case, \code{by} is ignored and variables
-specified in \code{select} are used to compute the test. This can be useful if
-the data is in wide-format and no grouping variable is available.}
+\item{select}{Name(s) of the continuous variable(s) (as character vector)
+to be used as samples for the test. \code{select} can be one of the following:
+\itemize{
+\item \code{select} can be used in combination with \code{by}, in which case \code{select} is
+the name of the continuous variable (and \code{by} indicates a grouping factor).
+\item \code{select} can also be a character vector of length two or more (more than
+two names only apply to \code{kruskal_wallis_test()}), in which case the two
+continuous variables are treated as samples to be compared. \code{by} must be
+\code{NULL} in this case.
+\item If \code{select} is of length \strong{two} and \code{paired = TRUE}, the two samples
+are considered as \emph{dependent} and a paired test is carried out.
+\item If \code{select} specifies \strong{one} variable and \code{by = NULL}, a one-sample test
+is carried out (only applicable for \code{t_test()} and \code{wilcoxon_test()}).
+}}

-\item{by}{Name of the grouping variable to be used for the test. If \code{by} is
-not a factor, it will be coerced to a factor. For \code{chi_squared_test()}, if
-\code{probabilities} is provided, \code{by} must be \code{NULL}.}
+\item{by}{Name of the variable indicating the groups. Required if \code{select}
+specifies only one variable that contains all samples to be compared in the
+test. If \code{by} is not a factor, it will be coerced to a factor. For
+\code{chi_squared_test()}, if \code{probabilities} is provided, \code{by} must be \code{NULL}.}

\item{weights}{Name of an (optional) weighting variable to be used for the test.}

-\item{distribution}{Indicates how the null distribution of the test statistic
-should be computed. May be one of \code{"exact"}, \code{"approximate"} or \code{"asymptotic"}
-(default). See \code{\link[coin:LocationTests]{coin::wilcox_test()}} for details.}
+\item{mu}{The hypothesized difference in means (for \code{t_test()}) or location
+shift (for \code{mann_whitney_test()}). The default is 0.}
+
+\item{alternative}{A character string specifying the alternative hypothesis,
+must be one of \code{"two.sided"} (default), \code{"greater"} or \code{"less"}. See \code{?t.test}
+and \code{?wilcox.test}.}
+
+\item{...}{Additional arguments passed to \code{wilcox.test()} (for unweighted
+tests, i.e. when \code{weights = NULL}).}
}
\value{
A data frame with test results. The function returns p and Z-values as
well as effect size r and group-rank-means.
}
\description{
-This function performs a Mann-Whitney-Test (or Wilcoxon rank
-sum test for \emph{unpaired} samples.
+This function performs a Mann-Whitney test (or Wilcoxon rank
+sum test for \emph{unpaired} samples). Unlike the underlying base R function
+\code{wilcox.test()}, this function allows for weighted tests and automatically
+calculates effect sizes. For \emph{paired} (dependent) samples, or for one-sample
+tests, please use the \code{wilcoxon_test()} function.

-A Mann-Whitney-Test is a non-parametric test for the null hypothesis that two
-independent samples have identical continuous distributions. It can be used
-when the two continuous variables are not normally distributed.
+A Mann-Whitney test is a non-parametric test for the null hypothesis that two
+\emph{independent} samples have identical continuous distributions. It can be used
+for ordinal scales or when the two continuous variables are not normally
+distributed. For large samples, or approximately normally distributed variables,
+the \code{t_test()} function can be used.
}
\details{
This function is based on \code{\link[=wilcox.test]{wilcox.test()}} and \code{\link[coin:LocationTests]{coin::wilcox_test()}}
@@ -55,26 +78,85 @@ Interpretation of the effect size \strong{r}, as a rule-of-thumb:
\item large effect >= 0.5
}

-\strong{r} is calcuated as:
+\strong{r} is calculated as \eqn{r = \frac{|Z|}{\sqrt{n1 + n2}}}.
+}
+\section{Which test to use}{
+
+The following table provides an overview of which test to use for different
+types of data. The choice of test depends on the scale of the outcome
+variable and the number of samples to compare.\tabular{lll}{
+   \strong{Samples} \tab \strong{Scale of Outcome} \tab \strong{Significance Test} \cr
+   1 \tab binary / nominal \tab \code{chi_squared_test()} \cr
+   1 \tab continuous, not normal \tab \code{wilcoxon_test()} \cr
+   1 \tab continuous, normal \tab \code{t_test()} \cr
+   2, independent \tab binary / nominal \tab \code{chi_squared_test()} \cr
+   2, independent \tab continuous, not normal \tab \code{mann_whitney_test()} \cr
+   2, independent \tab continuous, normal \tab \code{t_test()} \cr
+   2, dependent \tab binary (only 2x2) \tab \code{chi_squared_test(paired=TRUE)} \cr
+   2, dependent \tab continuous, not normal \tab \code{wilcoxon_test()} \cr
+   2, dependent \tab continuous, normal \tab \code{t_test(paired=TRUE)} \cr
+   >2, independent \tab continuous, not normal \tab \code{kruskal_wallis_test()} \cr
+   >2, independent \tab continuous, normal \tab \code{datawizard::means_by_group()} \cr
+   >2, dependent \tab continuous, not normal \tab \emph{not yet implemented} (1) \cr
+   >2, dependent \tab continuous, normal \tab \emph{not yet implemented} (2) \cr
+}
+
+
+(1) More than two dependent samples are considered as \emph{repeated measurements}.
+For ordinal or not-normally distributed outcomes, these samples are
+usually tested using a \code{\link[=friedman.test]{friedman.test()}}, which requires the samples
+in one variable, the groups to compare in another variable, and a third
+variable indicating the repeated measurements (subject IDs).
-\if{html}{\out{