openpharma · clarkliming · Jan 4, 2024 · Dec 22, 2023 · Dec 25, 2023 · Dec 25, 2023
@@ -3,7 +3,7 @@
 ### New Features
 
 - `Anova` is implemented for `mmrm` models and available upon loading the `car` package. It supports type II and III hypothesis testing.
-- The argument `start` for `mmrm_control()` is updated to allow string/function/numeric input for better
+- The argument `start` for `mmrm_control()` is updated to allow function/numeric input for better
 choices of initial values.
 
 ### Miscellaneous

@@ -243,9 +243,8 @@ refit_multiple_optimizers <- function(fit,
 #' - The argument `start` is used to facilitate the choice of initial values for fitting the model.
 #'   If `function` is provided, make sure its parameter is a valid element of `mmrm_tmb_data`
 #'   or `mmrm_tmb_formula_parts` and it returns a numeric vector.
-#'   If `NULL` is provided, `std_start` will be used.
-#'   By default, `std_start` will be used. Other implemented methods include `std_start` and
-#'   `emp_start`.
+#'   By default or if `NULL` is provided, `std_start` will be used.
+#'   Other implemented methods include `emp_start`.
 #'
 #' @return List of class `mmrm_control` with the control parameters.
 #' @export
@@ -268,11 +267,11 @@ mmrm_control <- function(n_cores = 1L,
   if (is.null(start)) {
     start <- std_start
   }
-  if (test_function(start)) {
-    assert_function(start, args = "...")
-  } else {
-    assert_numeric(start, null.ok = TRUE)
-  }
+  assert(
+    check_function(start, args = "..."),
+    check_numeric(start, null.ok = FALSE),
+    combine = "or"
+  )
   assert_flag(accept_singular)
   assert_flag(drop_visit_levels)
   assert_list(optimizers, names = "unique", types = c("function", "partial"))

@@ -260,15 +260,15 @@ h_mmrm_tmb_parameters <- function(formula_parts,
   m <- tmb_data$n_visits
   start_value0 <- std_start(formula_parts$cov_type, m, n_groups)
   theta_dim <- length(start_value0)
-  if (is.null(start)) {
-    start_value <- start_value0
+  start_values <- if (is.null(start)) {
+    start_value0
   } else if (test_function(start)) {
-    start_value <- do.call(start, utils::modifyList(formula_parts, tmb_data))
+    do.call(start, utils::modifyList(formula_parts, tmb_data))
   } else {
-    start_value <- start
+    start
   }
-  assert_numeric(start_value, len = theta_dim, any.missing = FALSE, finite = TRUE)
-  list(theta = start_value)
+  assert_numeric(start_values, len = theta_dim, any.missing = FALSE, finite = TRUE)
+  list(theta = start_values)
 }
 
 #' Asserting Sane Start Values for `TMB` Fit

@@ -357,6 +357,12 @@ h_valid_formula <- function(formula) {
 #' @param n_groups (`int`)\cr number of groups.
 #' @param ... not used.
 #'
+#' @details
+#' `std_start` will try to provide variance parameter from identity matrix.
+#' However, for `ar1` and `ar1h` the corresponding values are not ideal because the
+#' \eqn{\rho} is usually a positive number thus using 0 as starting value can lead to
+#' incorrect optimization result, and we use 0.5 as the initial value of \eqn{\rho}.
+#'
 #' @return A numeric vector of starting values.
 #'
 #' @export
@@ -386,29 +392,35 @@ std_start <- function(cov_type, n_visits, n_groups, ...) {
 #'
 #' @param full_frame (`data.frame`)\cr data used for model fitting.
 #' @param model_formula (`formula`)\cr the formula in mmrm model without covariance structure part.
-#' @param group_var (`string`)\cr group variable.
 #' @param visit_var (`string`)\cr visit variable.
 #' @param subject_var (`string`)\cr subject id variable.
 #' @param subject_groups (`factor`)\cr subject group assignment.
 #' @param ... not used.
 #'
-#' @details This `emp_start` only works for unstructured covariance structure.
+#' @details
+#' This `emp_start` only works for unstructured covariance structure.
+#' It uses linear regression to first obtain the coefficients and use the residuals
+#' to obtain the empirical variance-covariance, and it is then used to obtain the
+#' starting values.
 #'
 #' @return A numeric vector of starting values.
 #'
 #' @export
-emp_start <- function(full_frame, model_formula, visit_var, subject_var, n_visits, n_subjects, subject_groups, ...) {
+emp_start <- function(full_frame, model_formula, visit_var, subject_var, subject_groups, ...) {
   assert_formula(model_formula)
   assert_data_frame(full_frame)
   assert_subset(all.vars(model_formula), colnames(full_frame))
   assert_string(visit_var)
   assert_string(subject_var)
   assert_factor(full_frame[[visit_var]])
+  n_visits <- length(levels(full_frame[[visit_var]]))
   assert_factor(full_frame[[subject_var]])
+  subjects <- droplevels(full_frame[[subject_var]])
+  n_subjects <- length(levels(subjects))
   fit <- lm(formula = model_formula, data = full_frame)
   res <- rep(NA, n_subjects * n_visits)
   res[
-    n_visits * as.integer(droplevels(full_frame[[subject_var]])) - n_visits + as.integer(full_frame[[visit_var]])
+    n_visits * as.integer(subjects) - n_visits + as.integer(full_frame[[visit_var]])
   ] <- residuals(fit)
   res_mat <- matrix(res, ncol = n_visits, nrow = n_subjects, byrow = TRUE)
   emp_covs <- lapply(

diff --git a/tests/testthat/test-between-within.R b/tests/testthat/test-between-within.R
@@ -80,7 +80,7 @@ test_that("h_df_1d_bw works as expected for a model with only intercept", {
   expect_equal(result$est, 42.8338, tolerance = 1e-4)
   expect_equal(result$se, 0.3509, tolerance = 1e-4)
   expect_identical(result$df, 340L)
-  expect_equal(result$t_stat, 122.07, tolerance = 1e-3)
+  expect_equal(result$t_stat, 122.07, tolerance = 1e-4)
   expect_true(result$p_val < 0.0001)
 })
 

@@ -450,7 +450,7 @@ test_that("mmrm works for start = NULL", {
 
 ## start values ----
 
-test_that("start can be emp_start or std_start and they works", {
+test_that("start can be emp_start or std_start and they work", {
   expect_silent(
     mmrm(
       formula = FEV1 ~ ARMCD + us(AVISIT | USUBJID),
@@ -467,7 +467,7 @@ test_that("start can be emp_start or std_start and they works", {
   )
 })
 
-test_that("mmrm fails if start do not provide correct parameter", {
+test_that("mmrm fails if start function does not provide correct values", {
   wrong_start <- function(...) {
     c(NA_real_, 0L)
   }
@@ -476,14 +476,16 @@ test_that("mmrm fails if start do not provide correct parameter", {
       formula = FEV1 ~ ARMCD + us(AVISIT | USUBJID),
       data = fev_data,
       start = wrong_start
-    )
+    ),
+    "Assertion on 'start_values' failed: Must have length 10, but has length 2."
   )
   expect_error(
     mmrm(
       formula = FEV1 ~ ARMCD + ar1(AVISIT | USUBJID),
       data = fev_data,
       start = wrong_start
-    )
+    ),
+    "Assertion on 'start_values' failed: Contains missing values \\(element 1\\)"
   )
 })
 

diff --git a/tests/testthat/test-satterthwaite.R b/tests/testthat/test-satterthwaite.R
@@ -68,7 +68,7 @@ test_that("h_df_1d_sat works as expected", {
   expect_equal(result$est, 42.8338, tolerance = 1e-4)
   expect_equal(result$se, 0.3509, tolerance = 1e-4)
   expect_identical(round(result$df), 171)
-  expect_equal(result$t_stat, 122.07, tolerance = 1e-3)
+  expect_equal(result$t_stat, 122.07, tolerance = 1e-4)
   expect_true(result$p_val < 0.0001)
 })
 

diff --git a/tests/testthat/test-tmb-methods.R b/tests/testthat/test-tmb-methods.R
@@ -749,7 +749,7 @@ test_that("residuals works as expected with grouped covariance structure", {
 
   result_norm <- expect_silent(residuals(object, type = "normalized"))
   expect_double(result_norm, len = length(object$tmb_data$y_vector))
-  expect_equal(head(result_norm, 5), c(-0.73835, -1.88475, -1.53172, -1.02026, 0.54335), tolerance = 1e-3)
+  expect_equal(head(result_norm, 5), c(-0.73835, -1.88475, -1.53172, -1.02026, 0.54335), tolerance = 1e-4)
 })
 
 test_that("residuals works as expected with weighted model fit", {

@@ -35,14 +35,71 @@ We will see this below.
 The starting values will affect the optimization result. A better starting value
 usually can make the optimization more efficient.
 In `mmrm` we provide two starting value functions, one is `std_start` and the other is `emp_start`.
-`std_start` will try to use identity matrix as the covariance, however there are convergence problems
-for `ar1` and `ar1h` if identity matrix is provided, thus for these two covariance structure
+`std_start` will try to use the identity matrix as the covariance, however there are convergence problems
+for `ar1` and `ar1h` if the identity matrix is provided, thus for these two covariance structures
 we use $\rho=0.5$ instead.
-`emp_start` will try to use empirical covariance as the starting value for unstructured covariance structure.
+`emp_start` will try to use the empirical covariance matrix of the residuals of the ordinary least squares model as the starting value for unstructured covariance structure.
 If some timepoints are missing from data, identity matrix will be used for that submatrix.
 The correlation between existing and non-existing timepoints are set to 0.
 
 As the starting values will affect the result, please be cautious on choosing the starting values.
+#### Example of Default Starting Value Fails
+
+Here we provide an example where the `std_start` fails.
+In the following code chunk, we will create a dummy dataset for mmrm analysis.
+
+```{r data_generation}
+gen_data <- function(
+    n = 100,
+    mu = -100 / 52,
+    delta = 50 / 52,
+    mua = 2000,
+    sigmaa = 300,
+    sigmab = 60,
+    corab = 0.2,
+    sigma = 10,
+    times = c(0, 2, 6, 12, 24, 36, 52, 70, 88, 104)) {
+  nt <- length(times)
+  out <- data.frame(
+    pts = rep(seq_len(n * 2), each = nt),
+    trt = rep(c("Treatment", "Placebo"), rep(n * nt, 2)),
+    time = rep(times, n * 2)
+  )
+
+  covab <- corab * sigmaa * sigmab # cov between a and b
+  cov <- matrix(c(sigmaa^2, covab, covab, sigmab^2), ncol = 2) # Cov matrix for the slope and intercept
+  si <- rbind(
+    MASS::mvrnorm(n, mu = c(mua, mu + delta), Sigma = cov),
+    MASS::mvrnorm(n, mu = c(mua, mu + delta), Sigma = cov)
+  )
+  idx <- rep(seq_len(n * 2), each = nt)
+  out$fev1 <- si[idx, 1] + si[idx, 2] * times + rnorm(n * nt * 2, sd = sigma)
+  out$trt <- factor(out$trt)
+  out$time <- factor(out$time)
+  out$pts <- factor(out$pts)
+  return(out)
+}
+set.seed(123)
+out <- gen_data()
+```
+
+In the generated data, the variance is not in the same scale across visits.
+
+```{r show_variance}
+vapply(split(out$fev1, out$time), sd, FUN.VALUE = 1)
+```
+
+Using `emp_start` as the starting value, `mmrm` will converge fast.
+```{r mmrm_using_emp_start}
+mmrm(fev1 ~ trt * time + us(time | pts), data = out, start = emp_start)
+```
+
+However, if we use `std_start`, this will be convergence problems.
+
+```{r mmrm_using_std_start, eval = FALSE}
+mmrm(fev1 ~ trt * time + us(time | pts), data = out, start = std_start)
+```
+
 
 ## REML or ML