Updates to internals (#53)

* Improved error messages when users provide private names * Renamed columns in 'tt' data * Adjusted names in tests * Improved internals to deal with same column in true and elsewhere * Updated NEWS * Updates to testing code * Testing platforms * Updated {roxygen2} * Improved error message when private names are used
ellessenne · Feb 29, 2024 · ef59fe9 · ef59fe9
1 parent 070526a
commit ef59fe9
Show file tree

Hide file tree

Showing 30 changed files with 207 additions and 171 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -43,7 +43,7 @@ URL: https://ellessenne.github.io/rsimsum/
 BugReports: https://github.com/ellessenne/rsimsum/issues
 VignetteBuilder: knitr
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 LazyData: true
 ByteCompile: true
 Encoding: UTF-8

diff --git a/Makefile b/Makefile
@@ -9,8 +9,6 @@ pre_submission_test:
 	R -e "devtools::check_win_oldrelease(quiet = TRUE)"
 	R -e "devtools::check_mac_release(quiet = TRUE)"
 	R -e "rhub::check_for_cran()"
-	R -e "rhub::check(platforms = 'macos-highsierra-release-cran')"
-	R -e "rhub::check(platforms = 'macos-highsierra-release')"
 	make style
 
 docs:

diff --git a/NEWS.md b/NEWS.md
@@ -1,6 +1,12 @@
 # rsimsum (development version)
 
-* Fixed issues with nested loop plot when the simulation design is not fully-factorial (#47, thanks @mikesweeting).
+* Fixed issues with nested loop plot when the simulation design is not fully-factorial (#47, thanks @mikesweeting);
+
+* Fixed wrong calculations when the same column was used in `true` and in `by` or elsewhere (#48, thanks @mikesweeting);
+
+* Updated columns names for confidence intervals in the `tt` dataset;
+
+* Updated documentation regarding column names that are not allowed when calling `simsum()` or `multisimsum()`.
 
 # rsimsum 0.12.0
 

diff --git a/R/data.R b/R/data.R
@@ -96,7 +96,7 @@
 #' @format A data frame with 4,000 rows and 8 variables:
 #' * `diff` The difference in mean between groups estimated by the t-test;
 #' * `se` Standard error of the estimated difference;
-#' * `lower`, `upper` Confidence interval for the difference in mean as reported by the t-test;
+#' * `conf.low`, `conf.high` Confidence interval for the difference in mean as reported by the t-test;
 #' * `df` The number of degrees of freedom assumed by the t-test;
 #' * `repno` Identifies each replication, between 1 and 500;
 #' * `dgm` Identifies each data-generating mechanism: 1 corresponds to normal data with equal variance between the groups, 2 is normal data with unequal variance, 3 and 4 are skewed data (simulated from a Gamma distribution) with equal and unequal variance between groups, respectively;

diff --git a/R/performance.R b/R/performance.R
@@ -1,5 +1,8 @@
 #' @keywords internal
 .performance <- function(data, estvarname, se, true, empse_ref = NULL, rho = NULL, ci.limits, df, control) {
+  ### Adjust 'true' to use internal column if not NULL
+  if (!is.null(true)) true <- ":true"
+
   ### Make object to return
   obj <- list()
 
@@ -22,29 +25,17 @@
   }
   # Bias
   if (!is.null(true)) {
-    if (is.character(true)) {
-      bias <- 1 / nsim * sum(data[[estvarname]] - data[[true]], na.rm = control$na.rm)
-    } else {
-      bias <- 1 / nsim * sum(data[[estvarname]] - true, na.rm = control$na.rm)
-    }
+    bias <- 1 / nsim * sum(data[[estvarname]] - data[[":true"]], na.rm = control$na.rm)
   }
   # Relative bias
   if (!is.null(true)) {
-    if (is.character(true)) {
-      rbias <- 1 / nsim * sum((data[[estvarname]] - data[[true]]) / data[[true]], na.rm = control$na.rm)
-    } else {
-      rbias <- 1 / nsim * sum((data[[estvarname]] - true) / true, na.rm = control$na.rm)
-    }
+    rbias <- 1 / nsim * sum((data[[estvarname]] - data[[":true"]]) / data[[":true"]], na.rm = control$na.rm)
   }
   # Empirical standard error
   empse <- sqrt(1 / (nsim - 1) * sum((data[[estvarname]] - mean(data[[estvarname]], na.rm = control$na.rm))^2, na.rm = control$na.rm))
   # Mean squared error
   if (!is.null(true)) {
-    if (is.character(true)) {
-      mse <- 1 / nsim * sum((data[[estvarname]] - data[[true]])^2, na.rm = control$na.rm)
-    } else {
-      mse <- 1 / nsim * sum((data[[estvarname]] - true)^2, na.rm = control$na.rm)
-    }
+    mse <- 1 / nsim * sum((data[[estvarname]] - data[[":true"]])^2, na.rm = control$na.rm)
   }
   # Relative change in precision
   if (!is.null(empse_ref) & !is.null(rho)) {
@@ -68,26 +59,14 @@
   # Coverage of a nominal (1 - level)% confidence interval
   if (!is.null(true) & !is.null(se)) {
     if (is.null(ci.limits)) {
-      if (is.character(true)) {
-        cover <- 1 / nsim * sum(data[[true]] >= data[[estvarname]] - crit * data[[se]] & data[[true]] <= data[[estvarname]] + crit * data[[se]], na.rm = control$na.rm)
-      } else {
-        cover <- 1 / nsim * sum(true >= data[[estvarname]] - crit * data[[se]] & true <= data[[estvarname]] + crit * data[[se]], na.rm = control$na.rm)
-      }
+      cover <- 1 / nsim * sum(data[[":true"]] >= data[[estvarname]] - crit * data[[se]] & data[[":true"]] <= data[[estvarname]] + crit * data[[se]], na.rm = control$na.rm)
     } else {
       if (is.character(ci.limits)) {
-        if (is.character(true)) {
-          cover <- 1 / nsim * sum(data[[true]] >= data[[ci.limits[1]]] & data[[true]] <= data[[ci.limits[2]]], na.rm = control$na.rm)
-        } else {
-          cover <- 1 / nsim * sum(true >= data[[ci.limits[1]]] & true <= data[[ci.limits[2]]], na.rm = control$na.rm)
-        }
+        cover <- 1 / nsim * sum(data[[":true"]] >= data[[ci.limits[1]]] & data[[":true"]] <= data[[ci.limits[2]]], na.rm = control$na.rm)
       } else if (is.numeric(ci.limits)) {
         data[["lower"]] <- ci.limits[1]
         data[["upper"]] <- ci.limits[2]
-        if (is.character(true)) {
-          cover <- 1 / nsim * sum(data[[true]] >= data[["lower"]] & data[[true]] <= data[["upper"]], na.rm = control$na.rm)
-        } else {
-          cover <- 1 / nsim * sum(true >= data[["lower"]] & true <= data[["upper"]], na.rm = control$na.rm)
-        }
+        cover <- 1 / nsim * sum(data[[":true"]] >= data[["lower"]] & data[[":true"]] <= data[["upper"]], na.rm = control$na.rm)
       }
     }
   }
@@ -112,20 +91,12 @@
   if (control$mcse) {
     if (!is.null(true)) {
       bias_mcse <- sqrt(1 / (nsim * (nsim - 1)) * sum((data[[estvarname]] - mean(data[[estvarname]], na.rm = control$na.rm))^2, na.rm = control$na.rm))
-      if (is.character(true)) {
-        rbias_i <- (data[[estvarname]] - data[[true]]) / data[[true]]
-      } else {
-        rbias_i <- (data[[estvarname]] - true) / true
-      }
+      rbias_i <- (data[[estvarname]] - data[[":true"]]) / data[[":true"]]
       rbias_mcse <- sd(rbias_i) / sqrt(nsim)
     }
     empse_mcse <- empse / sqrt(2 * (nsim - 1))
     if (!is.null(true)) {
-      if (is.character(true)) {
-        mse_mcse <- sqrt(sum(((data[[estvarname]] - data[[true]])^2 - mse)^2, na.rm = control$na.rm) / (nsim * (nsim - 1)))
-      } else {
-        mse_mcse <- sqrt(sum(((data[[estvarname]] - true)^2 - mse)^2, na.rm = control$na.rm) / (nsim * (nsim - 1)))
-      }
+      mse_mcse <- sqrt(sum(((data[[estvarname]] - data[[":true"]])^2 - mse)^2, na.rm = control$na.rm) / (nsim * (nsim - 1)))
     }
     if (!is.null(empse_ref) & !is.null(rho)) {
       relprec_mcse <- 200 * (empse_ref / empse)^2 * sqrt((1 - rho^2) / (nsim - 1))

diff --git a/R/simsum.R b/R/simsum.R
@@ -6,25 +6,32 @@
 #' @param data A `data.frame` in which variable names are interpreted.
 #' It has to be in tidy format, e.g. each variable forms a column and each observation forms a row.
 #' @param estvarname The name of the variable containing the point estimates.
+#' Note that some column names are forbidden: these are listed below in the _Details_ section.
 #' @param se The name of the variable containing the standard errors of the point estimates.
+#' Note that some column names are forbidden: these are listed below in the _Details_ section.
 #' @param true The true value of the parameter; this is used in calculations of bias, relative bias, coverage, and mean squared error and is required whenever these performance measures are requested.
 #' `true` can be a numeric value or a string that identifies a column in `data`.
 #' In the former setting, `simsum` will assume the same value for all replications; conversely, each replication will use a distinct value for `true` as identified by each row of `data`.
 #' See `vignette("E-custom-inputs", package = "rsimsum")` for more details.
+#' Note that some column names are forbidden: these are listed below in the _Details_ section.
 #' @param methodvar The name of the variable containing the methods to compare.
 #' For instance, methods could be the models compared within a simulation study.
 #' Can be `NULL`.
 #' If a vector of column names is passed to `simsum()`, those columns will be combined into a single column named `:methodvar` using the [base::interaction()] function before computing all performance measures.
+#' Note that some column names are forbidden: these are listed below in the _Details_ section.
 #' @param ref Specifies the reference method against which relative precision will be calculated.
 #' Only useful if `methodvar` is specified.
 #' @param by A vector of variable names to compute performance measures by a list of factors. Factors listed here are the (potentially several) data-generating mechanisms used to simulate data under different scenarios (e.g. sample size, true distribution of a variable, etc.).
 #' Can be `NULL`.
+#' Note that some column names are forbidden: these are listed below in the _Details_ section.
 #' @param ci.limits Can be used to specify the limits (lower and upper) of confidence intervals used to calculate coverage and bias-eliminated coverage.
 #' Useful for non-Wald type estimators (e.g. bootstrap).
 #' Defaults to `NULL`, where Wald-type confidence intervals based on the provided SEs are calculated for coverage; otherwise, it can be a numeric vector (for fixed confidence intervals) or a vector of strings that identify columns in `data` with replication-specific lower and upper limits.
 #' See `vignette("E-custom-inputs", package = "rsimsum")` for more details.
+#' Note that some column names are forbidden: these are listed below in the _Details_ section.
 #' @param df Can be used to specify that a column containing the replication-specific number of degrees of freedom that will be used to calculate confidence intervals for coverage (and bias-eliminated coverage) assuming t-distributed critical values (rather than normal theory intervals).
 #' See `vignette("E-custom-inputs", package = "rsimsum")` for more details.
+#' Note that some column names are forbidden: these are listed below in the _Details_ section.
 #' @param dropbig Specifies that point estimates or standard errors beyond the maximum acceptable values should be dropped. Defaults to `FALSE`.
 #' @param x Set to `TRUE` to include the `data` argument used to calculate summary statistics (i.e. after pre-processing the input dataset e.g. removing values deemed too large via the `dropbig` argument) as a slot. Calling `simsum` with `x = TRUE` is required to produce zipper plots. The downside is that the size of the returned object increases considerably, therefore it is set to `FALSE` by default.
 #' @param control A list of parameters that control the behaviour of `simsum`.
@@ -43,7 +50,7 @@
 #' @references Gasparini, A. 2018. rsimsum: Summarise results from Monte Carlo simulation studies. Journal of Open Source Software 3(26):739, \doi{10.21105/joss.00739}
 #' @export
 #' @details
-#' The following names are not allowed for `estvarname`, `se`, `methodvar`, `by`: `stat`, `est`, `mcse`, `lower`, `upper`, `:methodvar`.
+#' The following names are not allowed for any column in `data` that is passed to [simsum()]: `stat`, `est`, `mcse`, `lower`, `upper`, `:methodvar`, `:true`.
 #'
 #' @examples
 #' data("MIsim", package = "rsimsum")
@@ -62,22 +69,6 @@ simsum <- function(data,
                    dropbig = FALSE,
                    x = FALSE,
                    control = list()) {
-  # data("nlp", package = "rsimsum")
-  # nlp.subset <- nlp %>%
-  #   dplyr::filter(!(ss == 100 & esigma == 2))
-  # data <- nlp.subset
-  # estvarname <- "b"
-  # true <- 0
-  # se <- "se"
-  # methodvar <- "model"
-  # by <- c("baseline", "ss", "esigma")
-  # ci.limits <- NULL
-  # df <- NULL
-  # dropbig <- FALSE
-  # x <- FALSE
-  # control <- list()
-  # ref <- NULL
-
   ### Check arguments
   arg_checks <- checkmate::makeAssertCollection()
   # 'data' must be a data.frame
@@ -109,13 +100,13 @@ simsum <- function(data,
   checkmate::assert_subset(x = by, choices = names(data), add = arg_checks)
   checkmate::assert_subset(x = methodvar, choices = names(data), add = arg_checks)
   checkmate::assert_subset(x = df, choices = names(data), add = arg_checks)
-  # 'estvarname', 'se', 'methodvar', 'by' , 'df' must not be any in ('stat', 'est', 'mcse', 'lower', 'upper', ':methodvar')
-  .private_names <- c("stat", "est", "mcse", "lower", "upper", ":methodvar")
-  checkmate::assert_false(x = (estvarname %in% .private_names), add = arg_checks)
-  if (!is.null(se)) checkmate::assert_false(x = (se %in% .private_names), add = arg_checks)
-  if (!is.null(methodvar)) checkmate::assert_false(x = any(methodvar %in% .private_names), add = arg_checks)
-  if (!is.null(by)) checkmate::assert_false(x = any(by %in% .private_names), add = arg_checks)
-  if (!is.null(df)) checkmate::assert_false(x = any(df %in% .private_names), add = arg_checks)
+  # 'estvarname', 'se', 'methodvar', 'by' , 'df' must not be any in ('stat', 'est', 'mcse', 'lower', 'upper', ':methodvar', ':true')
+  .private_names <- c("stat", "est", "mcse", "lower", "upper", ":methodvar", ":true")
+  .check_private(var = estvarname, label = "estvarname", private_names = .private_names)
+  .check_private(var = se, label = "se", private_names = .private_names)
+  .check_private(var = methodvar, label = "methodvar", private_names = .private_names)
+  .check_private(var = by, label = "by", private_names = .private_names)
+  .check_private(var = df, label = "df", private_names = .private_names)
   # Process vector of 'methodvar' if a vector
   user_methodvar <- NULL
   if (length(methodvar) > 1) {
@@ -134,6 +125,7 @@ simsum <- function(data,
     if (is.character(ci.limits)) {
       checkmate::assert_character(x = ci.limits, len = 2, add = arg_checks)
       checkmate::assert_true(x = all(ci.limits %in% names(data)), add = arg_checks)
+      lapply(X = ci.limits, FUN = function(x) .check_private(var = x, label = "ci.limits", private_names = .private_names))
     }
     if (is.numeric(ci.limits)) {
       checkmate::assert_numeric(x = ci.limits, len = 2, add = arg_checks)
@@ -164,6 +156,15 @@ simsum <- function(data,
   ), recursive = FALSE)
   control <- control.tmp
 
+  ### Add hidden column with true values
+  if (!is.null(true)) {
+    if (is.character(true)) {
+      data[[":true"]] <- data[[true]]
+    } else {
+      data[[":true"]] <- true
+    }
+  }
+
   ### Factorise 'methodvar', 'by'
   data <- .factorise(data = data, cols = c(methodvar, by))
 
@@ -256,6 +257,7 @@ simsum <- function(data,
   obj$control <- control
   if (x) {
     obj$x <- .br(lapply(data, .br))
+    if (!is.null(true)) obj$x[[":true"]] <- NULL
     rownames(obj$x) <- NULL
   }
 

diff --git a/R/utils.R b/R/utils.R
@@ -201,3 +201,13 @@
   data <- data[nrs > 0]
   return(data)
 }
+
+### Check private names
+.check_private <- function(var, label, private_names) {
+  if (!is.null(var)) {
+    if (any(var %in% private_names)) {
+      this <- which(var %in% private_names)
+      stop(paste0("'", var[this], "' is not an allowed name for '", label, "'; see help('simsum') for more details."), call. = FALSE)
+    }
+  }
+}
diff --git a/data-raw/tt-data.R b/data-raw/tt-data.R
@@ -142,6 +142,8 @@ for (i in seq(B)) {
   tt.df[[i]] <- out
 }
 tt <- do.call(rbind.data.frame, tt.df)
+library(dplyr)
+tt <- rename(tt, conf.low = lower, conf.high = upper)
 
 ### Export for use in the package
 usethis::use_data(tt, overwrite = TRUE)
diff --git a/data/tt.rda b/data/tt.rda
diff --git a/docs/articles/A-introduction.html b/docs/articles/A-introduction.html
diff --git a/docs/articles/B-relhaz.html b/docs/articles/B-relhaz.html
diff --git a/docs/articles/C-plotting.html b/docs/articles/C-plotting.html
diff --git a/docs/articles/D-nlp.html b/docs/articles/D-nlp.html