diff --git a/R/chi_squared_test.R b/R/chi_squared_test.R
index 1a299632..474ecadd 100644
--- a/R/chi_squared_test.R
+++ b/R/chi_squared_test.R
@@ -17,6 +17,8 @@
 #' @param ... Additional arguments passed down to [`chisq.test()`].
 #' @inheritParams mann_whitney_test
 #'
+#' @inheritSection mann_whitney_test Which test to use
+#'
 #' @inherit mann_whitney_test seealso
 #'
 #' @return A data frame with test results. The returned effects sizes are
@@ -39,10 +41,17 @@
 #' [`effectsize::interpret_phi()`], [`effectsize::interpret_cramers_v()`],
 #' and [`effectsize::interpret_fei()`].
 #'
-#' @references Ben-Shachar, M.S., Patil, I., Thériault, R., Wiernik, B.M.,
-#' Lüdecke, D. (2023). Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data
-#' That Use the Chi‑Squared Statistic. Mathematics, 11, 1982.
-#' \doi{10.3390/math11091982}
+#' @references
+#' - Ben-Shachar, M.S., Patil, I., Thériault, R., Wiernik, B.M.,
+#'   Lüdecke, D. (2023). Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data
+#'   That Use the Chi‑Squared Statistic. Mathematics, 11, 1982.
+#'   \doi{10.3390/math11091982}
+#'
+#' - Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests.
+#'   Dtsch Med Wochenschr 2007; 132: e24–e25
+#'
+#' - du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer
+#'   Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8
 #'
 #' @examplesIf requireNamespace("effectsize")
 #' data(efc)
@@ -64,6 +73,12 @@ chi_squared_test <- function(data,
                              weights = NULL,
                              paired = FALSE,
                              ...) {
+  # sanity check - if we only have one variable in "select", and both "by"
+  # and "probabilities" are NULL, set (equal) probabilities
+  if (is.null(probabilities) && !is.null(select) && is.null(by) && length(select) == 1) {
+    probabilities <- rep(1 / length(data[[select]]), length(data[[select]]))
+  }
+
   if (is.null(probabilities)) {
     .calculate_chisq(data, select, by, weights, paired, ...)
   } else {
diff --git a/R/kruskal_wallis_test.R b/R/kruskal_wallis_test.R
index d60bcc81..24bc2cea 100644
--- a/R/kruskal_wallis_test.R
+++ b/R/kruskal_wallis_test.R
@@ -11,6 +11,15 @@
 #'
 #' @return A data frame with test results.
 #'
+#' @inheritSection mann_whitney_test Which test to use
+#'
+#' @references
+#' - Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests.
+#'   Dtsch Med Wochenschr 2007; 132: e24–e25
+#'
+#' - du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer
+#'   Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8
+#' 
 #' @details The function simply is a wrapper around [`kruskal.test()`]. The
 #' weighted version of the Kruskal-Wallis test is based on the **survey** package,
 #' using [`survey::svyranktest()`].
diff --git a/R/mann_whitney_test.R b/R/mann_whitney_test.R
index b858918b..b29b1e64 100644
--- a/R/mann_whitney_test.R
+++ b/R/mann_whitney_test.R
@@ -39,6 +39,37 @@
 #' @param ... Additional arguments passed to `wilcox.test()` (for unweighted
 #' tests, i.e. when `weights = NULL`).
 #'
+#' @section Which test to use:
+#' The following table provides an overview of which test to use for different
+#' types of data. The choice of test depends on the scale of the outcome
+#' variable and the number of samples to compare.
+#'
+#' | Samples         | Scale of Outcome       | Significance Test               |
+#' |-----------------|------------------------|---------------------------------|
+#' | 1               | binary / nominal       | `chi_squared_test()`            |
+#' | 1               | continuous, not normal | `wilcoxon_test()`               |
+#' | 1               | continuous, normal     | `t_test()`                      |
+#' | 2, independent  | binary / nominal       | `chi_squared_test()`            |
+#' | 2, independent  | continuous, not normal | `mann_whitney_test()`           |
+#' | 2, independent  | continuous, normal     | `t_test()`                      |
+#' | 2, dependent    | binary (only 2x2)      | `chi_squared_test(paired=TRUE)` |
+#' | 2, dependent    | continuous, not normal | `wilcoxon_test()`               |
+#' | 2, dependent    | continuous, normal     | `t_test(paired=TRUE)`           |
+#' | >2, independent | continuous, not normal | `kruskal_wallis_test()`         |
+#' | >2, independent | continuous,     normal | `datawizard::means_by_group()`  |
+#' | >2, dependent   | continuous, not normal | _not yet implemented_ (1)       |
+#' | >2, dependent   | continuous,     normal | _not yet implemented_ (2)       |
+#'
+#' (1) More than two dependent samples are considered as _repeated measurements_.
+#'     These samples are usually tested using a [`friedman.test()`], which
+#'     requires the samples in one variable, the groups to compare in another
+#'     variable, and a third variable indicating the repeated measurements
+#'     (subject IDs).
+#'
+#' (2) More than two dependent samples are considered as _repeated measurements_.
+#'     These samples are usually tested using an ANOVA for repeated measurements.
+#'     A more sophisticated approach would be using a linear mixed model.
+#'
 #' @seealso
 #' - [`mann_whitney_test()`] for unpaired (independent) samples.
 #' - [`t_test()`] for parametric t-tests.
@@ -49,6 +80,18 @@
 #' @return A data frame with test results. The function returns p and Z-values
 #' as well as effect size r and group-rank-means.
 #'
+#' @references
+#' - Ben-Shachar, M.S., Patil, I., Thériault, R., Wiernik, B.M.,
+#'   Lüdecke, D. (2023). Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data
+#'   That Use the Chi‑Squared Statistic. Mathematics, 11, 1982.
+#'   \doi{10.3390/math11091982}
+#'
+#' - Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests.
+#'   Dtsch Med Wochenschr 2007; 132: e24–e25
+#'
+#' - du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer
+#'   Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8
+#' 
 #' @details This function is based on [`wilcox.test()`] and [`coin::wilcox_test()`]
 #' (the latter to extract effect sizes). The weighted version of the test is
 #' based on [`survey::svyranktest()`].
diff --git a/R/t_test.R b/R/t_test.R
index ac33ecbf..4f7e9bbf 100644
--- a/R/t_test.R
+++ b/R/t_test.R
@@ -10,11 +10,20 @@
 #' samples.
 #' @inherit mann_whitney_test seealso
 #'
+#' @inheritSection mann_whitney_test Which test to use
+#'
 #' @details Interpretation of effect sizes are based on rules described in
 #' [`effectsize::interpret_cohens_d()`] and [`effectsize::interpret_hedges_g()`].
 #'
 #' @return A data frame with test results.
 #'
+#' @references
+#' - Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests.
+#'   Dtsch Med Wochenschr 2007; 132: e24–e25
+#'
+#' - du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer
+#'   Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8
+#' 
 #' @examplesIf requireNamespace("effectsize")
 #' data(sleep)
 #' # one-sample t-test
diff --git a/R/wilcoxon_test.R b/R/wilcoxon_test.R
index e44a9049..5ef4d925 100644
--- a/R/wilcoxon_test.R
+++ b/R/wilcoxon_test.R
@@ -17,9 +17,18 @@
 #' @inheritParams mann_whitney_test
 #' @inherit mann_whitney_test seealso
 #'
+#' @inheritSection mann_whitney_test Which test to use
+#'
 #' @return A data frame with test results. The function returns p and Z-values
 #' as well as effect size r and group-rank-means.
 #'
+#' @references
+#' - Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests.
+#'   Dtsch Med Wochenschr 2007; 132: e24–e25
+#'
+#' - du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer
+#'   Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8
+#'
 #' @examplesIf requireNamespace("coin")
 #' data(mtcars)
 #' # one-sample test
diff --git a/man/chi_squared_test.Rd b/man/chi_squared_test.Rd
index cc11aa83..a63b8b7f 100644
--- a/man/chi_squared_test.Rd
+++ b/man/chi_squared_test.Rd
@@ -80,6 +80,39 @@ Interpretation of effect sizes are based on rules described in
 \code{\link[effectsize:interpret_r]{effectsize::interpret_phi()}}, \code{\link[effectsize:interpret_r]{effectsize::interpret_cramers_v()}},
 and \code{\link[effectsize:interpret_r]{effectsize::interpret_fei()}}.
 }
+\section{Which test to use}{
+
+The following table provides an overview of which test to use for different
+types of data. The choice of test depends on the scale of the outcome
+variable and the number of samples to compare.\tabular{lll}{
+   Samples \tab Scale of Outcome \tab Significance Test \cr
+   1 \tab binary / nominal \tab \code{chi_squared_test()} \cr
+   1 \tab continuous, not normal \tab \code{wilcoxon_test()} \cr
+   1 \tab continuous, normal \tab \code{t_test()} \cr
+   2, independent \tab binary / nominal \tab \code{chi_squared_test()} \cr
+   2, independent \tab continuous, not normal \tab \code{mann_whitney_test()} \cr
+   2, independent \tab continuous, normal \tab \code{t_test()} \cr
+   2, dependent \tab binary (only 2x2) \tab \code{chi_squared_test(paired=TRUE)} \cr
+   2, dependent \tab continuous, not normal \tab \code{wilcoxon_test()} \cr
+   2, dependent \tab continuous, normal \tab \code{t_test(paired=TRUE)} \cr
+   >2, independent \tab continuous, not normal \tab \code{kruskal_wallis_test()} \cr
+   >2, independent \tab continuous,     normal \tab \code{datawizard::means_by_group()} \cr
+   >2, dependent \tab continuous, not normal \tab \emph{not yet implemented} (1) \cr
+   >2, dependent \tab continuous,     normal \tab \emph{not yet implemented} (2) \cr
+}
+
+
+(1) More than two dependent samples are considered as \emph{repeated measurements}.
+These samples are usually tested using a \code{\link[=friedman.test]{friedman.test()}}, which
+requires the samples in one variable, the groups to compare in another
+variable, and a third variable indicating the repeated measurements
+(subject IDs).
+
+(2) More than two dependent samples are considered as \emph{repeated measurements}.
+These samples are usually tested using an ANOVA for repeated measurements.
+A more sophisticated approach would be using a linear mixed model.
+}
+
 \examples{
 \dontshow{if (requireNamespace("effectsize")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
 data(efc)
@@ -96,10 +129,16 @@ chi_squared_test(efc, "c161sex", probabilities = c(0.3, 0.7))
 \dontshow{\}) # examplesIf}
 }
 \references{
-Ben-Shachar, M.S., Patil, I., Thériault, R., Wiernik, B.M.,
+\itemize{
+\item Ben-Shachar, M.S., Patil, I., Thériault, R., Wiernik, B.M.,
 Lüdecke, D. (2023). Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data
 That Use the Chi‑Squared Statistic. Mathematics, 11, 1982.
 \doi{10.3390/math11091982}
+\item Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests.
+Dtsch Med Wochenschr 2007; 132: e24–e25
+\item du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer
+Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8
+}
 }
 \seealso{
 \itemize{
diff --git a/man/kruskal_wallis_test.Rd b/man/kruskal_wallis_test.Rd
index 2f7b6790..de197f59 100644
--- a/man/kruskal_wallis_test.Rd
+++ b/man/kruskal_wallis_test.Rd
@@ -46,6 +46,39 @@ The function simply is a wrapper around \code{\link[=kruskal.test]{kruskal.test(
 weighted version of the Kruskal-Wallis test is based on the \strong{survey} package,
 using \code{\link[survey:svyranktest]{survey::svyranktest()}}.
 }
+\section{Which test to use}{
+
+The following table provides an overview of which test to use for different
+types of data. The choice of test depends on the scale of the outcome
+variable and the number of samples to compare.\tabular{lll}{
+   Samples \tab Scale of Outcome \tab Significance Test \cr
+   1 \tab binary / nominal \tab \code{chi_squared_test()} \cr
+   1 \tab continuous, not normal \tab \code{wilcoxon_test()} \cr
+   1 \tab continuous, normal \tab \code{t_test()} \cr
+   2, independent \tab binary / nominal \tab \code{chi_squared_test()} \cr
+   2, independent \tab continuous, not normal \tab \code{mann_whitney_test()} \cr
+   2, independent \tab continuous, normal \tab \code{t_test()} \cr
+   2, dependent \tab binary (only 2x2) \tab \code{chi_squared_test(paired=TRUE)} \cr
+   2, dependent \tab continuous, not normal \tab \code{wilcoxon_test()} \cr
+   2, dependent \tab continuous, normal \tab \code{t_test(paired=TRUE)} \cr
+   >2, independent \tab continuous, not normal \tab \code{kruskal_wallis_test()} \cr
+   >2, independent \tab continuous,     normal \tab \code{datawizard::means_by_group()} \cr
+   >2, dependent \tab continuous, not normal \tab \emph{not yet implemented} (1) \cr
+   >2, dependent \tab continuous,     normal \tab \emph{not yet implemented} (2) \cr
+}
+
+
+(1) More than two dependent samples are considered as \emph{repeated measurements}.
+These samples are usually tested using a \code{\link[=friedman.test]{friedman.test()}}, which
+requires the samples in one variable, the groups to compare in another
+variable, and a third variable indicating the repeated measurements
+(subject IDs).
+
+(2) More than two dependent samples are considered as \emph{repeated measurements}.
+These samples are usually tested using an ANOVA for repeated measurements.
+A more sophisticated approach would be using a linear mixed model.
+}
+
 \examples{
 data(efc)
 # Kruskal-Wallis test for elder's age by education
@@ -70,6 +103,14 @@ kruskal_wallis_test(long_data, select = "scales", by = "groups")
 # base R equivalent
 kruskal.test(scales ~ groups, data = long_data)
 }
+\references{
+\itemize{
+\item Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests.
+Dtsch Med Wochenschr 2007; 132: e24–e25
+\item du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer
+Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8
+}
+}
 \seealso{
 \itemize{
 \item \code{\link[=mann_whitney_test]{mann_whitney_test()}} for unpaired (independent) samples.
diff --git a/man/mann_whitney_test.Rd b/man/mann_whitney_test.Rd
index be90ae7d..5eaf58c8 100644
--- a/man/mann_whitney_test.Rd
+++ b/man/mann_whitney_test.Rd
@@ -80,6 +80,39 @@ Interpretation of the effect size \strong{r}, as a rule-of-thumb:
 
 \strong{r} is calcuated as \eqn{r = \frac{|Z|}{\sqrt{n1 + n2}}}.
 }
+\section{Which test to use}{
+
+The following table provides an overview of which test to use for different
+types of data. The choice of test depends on the scale of the outcome
+variable and the number of samples to compare.\tabular{lll}{
+   Samples \tab Scale of Outcome \tab Significance Test \cr
+   1 \tab binary / nominal \tab \code{chi_squared_test()} \cr
+   1 \tab continuous, not normal \tab \code{wilcoxon_test()} \cr
+   1 \tab continuous, normal \tab \code{t_test()} \cr
+   2, independent \tab binary / nominal \tab \code{chi_squared_test()} \cr
+   2, independent \tab continuous, not normal \tab \code{mann_whitney_test()} \cr
+   2, independent \tab continuous, normal \tab \code{t_test()} \cr
+   2, dependent \tab binary (only 2x2) \tab \code{chi_squared_test(paired=TRUE)} \cr
+   2, dependent \tab continuous, not normal \tab \code{wilcoxon_test()} \cr
+   2, dependent \tab continuous, normal \tab \code{t_test(paired=TRUE)} \cr
+   >2, independent \tab continuous, not normal \tab \code{kruskal_wallis_test()} \cr
+   >2, independent \tab continuous,     normal \tab \code{datawizard::means_by_group()} \cr
+   >2, dependent \tab continuous, not normal \tab \emph{not yet implemented} (1) \cr
+   >2, dependent \tab continuous,     normal \tab \emph{not yet implemented} (2) \cr
+}
+
+
+(1) More than two dependent samples are considered as \emph{repeated measurements}.
+These samples are usually tested using a \code{\link[=friedman.test]{friedman.test()}}, which
+requires the samples in one variable, the groups to compare in another
+variable, and a third variable indicating the repeated measurements
+(subject IDs).
+
+(2) More than two dependent samples are considered as \emph{repeated measurements}.
+These samples are usually tested using an ANOVA for repeated measurements.
+A more sophisticated approach would be using a linear mixed model.
+}
+
 \examples{
 \dontshow{if (requireNamespace("coin") && requireNamespace("survey")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
 data(efc)
@@ -105,6 +138,18 @@ mann_whitney_test(long_data, select = "scales", by = "groups")
 wilcox.test(scales ~ groups, long_data)
 \dontshow{\}) # examplesIf}
 }
+\references{
+\itemize{
+\item Ben-Shachar, M.S., Patil, I., Thériault, R., Wiernik, B.M.,
+Lüdecke, D. (2023). Phi, Fei, Fo, Fum: Effect Sizes for Categorical Data
+That Use the Chi‑Squared Statistic. Mathematics, 11, 1982.
+\doi{10.3390/math11091982}
+\item Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests.
+Dtsch Med Wochenschr 2007; 132: e24–e25
+\item du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer
+Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8
+}
+}
 \seealso{
 \itemize{
 \item \code{\link[=mann_whitney_test]{mann_whitney_test()}} for unpaired (independent) samples.
diff --git a/man/t_test.Rd b/man/t_test.Rd
index 085896fe..ea89da9c 100644
--- a/man/t_test.Rd
+++ b/man/t_test.Rd
@@ -62,6 +62,39 @@ automatically calculates effect sizes.
 Interpretation of effect sizes are based on rules described in
 \code{\link[effectsize:interpret_cohens_d]{effectsize::interpret_cohens_d()}} and \code{\link[effectsize:interpret_cohens_d]{effectsize::interpret_hedges_g()}}.
 }
+\section{Which test to use}{
+
+The following table provides an overview of which test to use for different
+types of data. The choice of test depends on the scale of the outcome
+variable and the number of samples to compare.\tabular{lll}{
+   Samples \tab Scale of Outcome \tab Significance Test \cr
+   1 \tab binary / nominal \tab \code{chi_squared_test()} \cr
+   1 \tab continuous, not normal \tab \code{wilcoxon_test()} \cr
+   1 \tab continuous, normal \tab \code{t_test()} \cr
+   2, independent \tab binary / nominal \tab \code{chi_squared_test()} \cr
+   2, independent \tab continuous, not normal \tab \code{mann_whitney_test()} \cr
+   2, independent \tab continuous, normal \tab \code{t_test()} \cr
+   2, dependent \tab binary (only 2x2) \tab \code{chi_squared_test(paired=TRUE)} \cr
+   2, dependent \tab continuous, not normal \tab \code{wilcoxon_test()} \cr
+   2, dependent \tab continuous, normal \tab \code{t_test(paired=TRUE)} \cr
+   >2, independent \tab continuous, not normal \tab \code{kruskal_wallis_test()} \cr
+   >2, independent \tab continuous,     normal \tab \code{datawizard::means_by_group()} \cr
+   >2, dependent \tab continuous, not normal \tab \emph{not yet implemented} (1) \cr
+   >2, dependent \tab continuous,     normal \tab \emph{not yet implemented} (2) \cr
+}
+
+
+(1) More than two dependent samples are considered as \emph{repeated measurements}.
+These samples are usually tested using a \code{\link[=friedman.test]{friedman.test()}}, which
+requires the samples in one variable, the groups to compare in another
+variable, and a third variable indicating the repeated measurements
+(subject IDs).
+
+(2) More than two dependent samples are considered as \emph{repeated measurements}.
+These samples are usually tested using an ANOVA for repeated measurements.
+A more sophisticated approach would be using a linear mixed model.
+}
+
 \examples{
 \dontshow{if (requireNamespace("effectsize")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
 data(sleep)
@@ -81,6 +114,14 @@ t_test(mtcars, c("mpg", "hp"), paired = TRUE)
 t.test(mtcars$mpg, mtcars$hp, data = mtcars, paired = TRUE)
 \dontshow{\}) # examplesIf}
 }
+\references{
+\itemize{
+\item Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests.
+Dtsch Med Wochenschr 2007; 132: e24–e25
+\item du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer
+Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8
+}
+}
 \seealso{
 \itemize{
 \item \code{\link[=mann_whitney_test]{mann_whitney_test()}} for unpaired (independent) samples.
diff --git a/man/wilcoxon_test.Rd b/man/wilcoxon_test.Rd
index 7c9cc925..7b2ef6e4 100644
--- a/man/wilcoxon_test.Rd
+++ b/man/wilcoxon_test.Rd
@@ -68,6 +68,39 @@ variables are not normally distributed. For large samples, or approximately
 normally distributed variables, the \code{t_test()} function can be used (with
 \code{paired = TRUE}).
 }
+\section{Which test to use}{
+
+The following table provides an overview of which test to use for different
+types of data. The choice of test depends on the scale of the outcome
+variable and the number of samples to compare.\tabular{lll}{
+   Samples \tab Scale of Outcome \tab Significance Test \cr
+   1 \tab binary / nominal \tab \code{chi_squared_test()} \cr
+   1 \tab continuous, not normal \tab \code{wilcoxon_test()} \cr
+   1 \tab continuous, normal \tab \code{t_test()} \cr
+   2, independent \tab binary / nominal \tab \code{chi_squared_test()} \cr
+   2, independent \tab continuous, not normal \tab \code{mann_whitney_test()} \cr
+   2, independent \tab continuous, normal \tab \code{t_test()} \cr
+   2, dependent \tab binary (only 2x2) \tab \code{chi_squared_test(paired=TRUE)} \cr
+   2, dependent \tab continuous, not normal \tab \code{wilcoxon_test()} \cr
+   2, dependent \tab continuous, normal \tab \code{t_test(paired=TRUE)} \cr
+   >2, independent \tab continuous, not normal \tab \code{kruskal_wallis_test()} \cr
+   >2, independent \tab continuous,     normal \tab \code{datawizard::means_by_group()} \cr
+   >2, dependent \tab continuous, not normal \tab \emph{not yet implemented} (1) \cr
+   >2, dependent \tab continuous,     normal \tab \emph{not yet implemented} (2) \cr
+}
+
+
+(1) More than two dependent samples are considered as \emph{repeated measurements}.
+These samples are usually tested using a \code{\link[=friedman.test]{friedman.test()}}, which
+requires the samples in one variable, the groups to compare in another
+variable, and a third variable indicating the repeated measurements
+(subject IDs).
+
+(2) More than two dependent samples are considered as \emph{repeated measurements}.
+These samples are usually tested using an ANOVA for repeated measurements.
+A more sophisticated approach would be using a linear mixed model.
+}
+
 \examples{
 \dontshow{if (requireNamespace("coin")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
 data(mtcars)
@@ -87,6 +120,14 @@ d <- iris[iris$Species != "setosa", ]
 wilcoxon_test(d, "Sepal.Width", by = "Species")
 \dontshow{\}) # examplesIf}
 }
+\references{
+\itemize{
+\item Bender, R., Lange, S., Ziegler, A. Wichtige Signifikanztests.
+Dtsch Med Wochenschr 2007; 132: e24–e25
+\item du Prel, J.B., Röhrig, B., Hommel, G., Blettner, M. Auswahl statistischer
+Testverfahren. Dtsch Arztebl Int 2010; 107(19): 343–8
+}
+}
 \seealso{
 \itemize{
 \item \code{\link[=mann_whitney_test]{mann_whitney_test()}} for unpaired (independent) samples.