Fixes #1086 remove group_by to optimize computation and suggest parallel

- If parallel is not installed, the regular code is used. - Regular code tries not to run group_by that slow down the computation by a lot - The number of cores is identified from the SimulationSettings already set up for the PK parameter calculation task
Open-Systems-Pharmacology · Oct 23, 2023 · f63093e · f63093e
1 parent 1c1fd1b
commit f63093e
Show file tree

Hide file tree

Showing 4 changed files with 161 additions and 52 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -49,7 +49,8 @@ Suggests:
     styler,
     rClr,
     crayon,
-    webshot
+    webshot,
+    parallel
 Encoding: UTF-8
 LazyData: true
 RoxygenNote: 7.2.3

diff --git a/R/utilities-ratio-comparison.R b/R/utilities-ratio-comparison.R
@@ -152,8 +152,7 @@ getPKRatioSummaryStatistics <- function(pkData, referencePKData, simulationSetNa
   ) %>% mutate(Ratio = Value / ValueRef)
 
   # Data summary is applied to subgroups of QuantityPath and Parameter
-  ratioSummary <- ratioData %>%
-    group_by(QuantityPath, Parameter) %>%
+  ratioSummary <- ratioData[, c("QuantityPath", "Parameter", "Ratio")] %>%
     summarise(
       N = n(),
       Perc5 = quantile(Ratio, probs = 0.05, na.rm = TRUE),
@@ -165,7 +164,7 @@ getPKRatioSummaryStatistics <- function(pkData, referencePKData, simulationSetNa
       SD = sd(Ratio, na.rm = TRUE),
       GeoMean = exp(mean(log(Ratio), na.rm = TRUE)),
       GeoSD = exp(sd(log(Ratio), na.rm = TRUE)),
-      .groups = "drop_last"
+      .by = c("QuantityPath", "Parameter")
     ) %>%
     mutate(SimulationSetName = simulationSetName, .before = everything())
 
@@ -211,22 +210,20 @@ mcSampling <- function(dataSize, sampleSize, n = getDefaultMCRepetitions(), seed
 #' @import dplyr
 getPKRatioSummaryFromAnalyticalSolution <- function(pkData, referencePKData, simulationSetName) {
   pkSummary <- pkData %>%
-    group_by(QuantityPath, Parameter) %>%
     summarise(
       Mean = mean(Value, na.rm = TRUE),
       SD = sd(Value, na.rm = TRUE),
       MeanLog = mean(log(Value), na.rm = TRUE),
       SDLog = sd(log(Value), na.rm = TRUE),
-      .groups = "drop_last"
+      .by = c("QuantityPath", "Parameter")
     )
   referencePKSummary <- referencePKData %>%
-    group_by(QuantityPath, Parameter) %>%
     summarise(
       Mean = mean(Value, na.rm = TRUE),
       SD = sd(Value, na.rm = TRUE),
       MeanLog = mean(log(Value), na.rm = TRUE),
       SDLog = sd(log(Value), na.rm = TRUE),
-      .groups = "drop_last"
+      .by = c("QuantityPath", "Parameter")
     )
   # Merge data summaries
   ratioSummary <- left_join(
@@ -248,92 +245,181 @@ getPKRatioSummaryFromAnalyticalSolution <- function(pkData, referencePKData, sim
 }
 
 
-#' @title getPKRatioSummaryFromMCSampling
+#' @title getPKRatioSummaryFromSettings
 #' @description Get PK Parameter Ratio Measure From Monte Carlo Sampling
 #' @param pkData A data.frame of PK Parameter values for Population to compare
 #' @param referencePKData A data.frame of PK Parameter values for reference Population
-#' @param simulationSetName Name of simulation set
 #' @param settings A list of task settings
 #' @return A data.frame of the PK Parameter ratios summary statistics
 #' @keywords internal
-getPKRatioSummaryFromMCSampling <- function(pkData, referencePKData, simulationSetName, settings = NULL) {
+getPKRatioSummaryFromSettings <- function(pkData, referencePKData, settings) {
   # Sample from largest population if size is different
   # Create a list of Sampled PK Parameters for each MC repetition and calculate their Ratio
   pkSize <- length(unique(pkData$IndividualId))
   referenceSize <- length(unique(referencePKData$IndividualId))
   mcRepetitions <- settings$mcRepetitions %||% getDefaultMCRepetitions()
   mcRandomSeed <- settings$mcRandomSeed %||% getDefaultMCRandomSeed()
 
-  t0 <- tic()
-  logInfo(messages$monteCarlo(simulationSetName, mcRepetitions, mcRandomSeed))
+  widerPopulation <- ifelse(pkSize < referenceSize, "reference", "comparison")
+
+  selectedIds <- mcSampling(
+    dataSize = switch(widerPopulation,
+      "reference" = referenceSize,
+      "comparison" = pkSize
+    ),
+    sampleSize = switch(widerPopulation,
+      "reference" = pkSize,
+      "comparison" = referenceSize
+    ),
+    n = mcRepetitions,
+    seed = mcRandomSeed
+  )
+  useParallel <- all(
+    requireNamespace("parallel", quietly = TRUE),
+    settings$numberOfCores > 1
+  )
+  if (useParallel) {
+    cl <- parallel::makeCluster(settings$numberOfCores)
+    on.exit(parallel::stopCluster(cl))
+
+    # Parallel code requires that some objects are exported and to use dplyr package
+    # Re-import and rename function in this specific environment to be exported on clusters
+    getParallelSummaryStatistics <- getPKRatioSummaryStatistics
+    invisible(parallel::clusterEvalQ(cl, library(dplyr)))
+    invisible(parallel::clusterExport(cl, c(
+      "selectedIds",
+      "pkSize",
+      "referenceSize",
+      "pkData",
+      "referencePKData",
+      "getParallelSummaryStatistics"
+    ),
+    envir = environment()
+    ))
+    listOfMCRatioData <- switch(widerPopulation,
+      "reference" = parallel::parLapply(
+        cl = cl,
+        seq_along(selectedIds),
+        function(repetition) {
+          getParallelSummaryStatistics(
+            pkData = pkData %>%
+              mutate(IndividualId = ave(
+                1:pkSize,
+                pkData$QuantityPath,
+                pkData$Parameter,
+                FUN = identity
+              )),
+            referencePKData = referencePKData %>%
+              slice(selectedIds[[repetition]], .by = c("QuantityPath", "Parameter")) %>%
+              mutate(IndividualId = ave(1:pkSize, QuantityPath, Parameter, FUN = identity)),
+            simulationSetName = repetition
+          )
+        }
+      ),
+      "comparison" = parallel::parLapply(
+        cl = cl,
+        seq_along(selectedIds),
+        function(repetition) {
+          getParallelSummaryStatistics(
+            pkData %>%
+              slice(selectedIds[[repetition]], .by = c("QuantityPath", "Parameter")) %>%
+              mutate(IndividualId = ave(1:referenceSize, QuantityPath, Parameter, FUN = identity)),
+            referencePKData = referencePKData %>%
+              mutate(IndividualId = ave(
+                1:referenceSize,
+                referencePKData$QuantityPath,
+                referencePKData$Parameter,
+                FUN = identity
+              )),
+            simulationSetName = repetition
+          )
+        }
+      )
+    )
+    mcRatioData <- do.call("rbind", listOfMCRatioData)
+    return(mcRatioData)
+  }
+
   if (settings$showProgress) {
     loadingProgress <- txtProgressBar(max = mcRepetitions, style = 3)
+    on.exit(close(loadingProgress))
   }
 
-  if (pkSize < referenceSize) {
-    selectedIds <- mcSampling(
-      dataSize = referenceSize,
-      sampleSize = pkSize,
-      n = mcRepetitions,
-      seed = mcRandomSeed
-    )
-
-    listOfMCRatioData <- lapply(
+  listOfMCRatioData <- switch(widerPopulation,
+    "reference" = lapply(
       seq_along(selectedIds),
       function(repetition) {
         if (settings$showProgress) {
           setTxtProgressBar(loadingProgress, value = repetition)
         }
         # data.frame is arranged by increasing values of IndividualId
+        # since group_by is slow, ave is used instead
         getPKRatioSummaryStatistics(
           pkData = pkData %>%
-            group_by(QuantityPath, Parameter) %>%
-            mutate(IndividualId = 1:pkSize),
+            mutate(IndividualId = ave(
+              1:pkSize,
+              pkData$QuantityPath,
+              pkData$Parameter,
+              FUN = identity
+            )),
           referencePKData = referencePKData %>%
-            group_by(QuantityPath, Parameter) %>%
-            slice(selectedIds[[repetition]]) %>%
-            mutate(IndividualId = 1:pkSize),
+            slice(selectedIds[[repetition]], .by = c("QuantityPath", "Parameter")) %>%
+            mutate(IndividualId = ave(1:pkSize, QuantityPath, Parameter, FUN = identity)),
           simulationSetName = repetition
         )
       }
-    )
-  } else {
-    selectedIds <- mcSampling(
-      dataSize = pkSize,
-      sampleSize = referenceSize,
-      n = mcRepetitions,
-      seed = mcRandomSeed
-    )
-    listOfMCRatioData <- lapply(
+    ),
+    "comparison" = lapply(
       seq_along(selectedIds),
       function(repetition) {
         if (settings$showProgress) {
           setTxtProgressBar(loadingProgress, value = repetition)
         }
         # data.frame is arranged by increasing values of IndividualId
         getPKRatioSummaryStatistics(
-          pkData = pkData %>%
-            group_by(QuantityPath, Parameter) %>%
-            slice(selectedIds[[repetition]]) %>%
-            mutate(IndividualId = 1:referenceSize),
+          pkData %>%
+            slice(selectedIds[[repetition]], .by = c("QuantityPath", "Parameter")) %>%
+            mutate(IndividualId = ave(1:referenceSize, QuantityPath, Parameter, FUN = identity)),
           referencePKData = referencePKData %>%
-            group_by(QuantityPath, Parameter) %>%
-            mutate(IndividualId = 1:referenceSize),
+            mutate(IndividualId = ave(
+              1:referenceSize,
+              referencePKData$QuantityPath,
+              referencePKData$Parameter,
+              FUN = identity
+            )),
           simulationSetName = repetition
         )
       }
     )
-  }
-  if (settings$showProgress) {
-    close(loadingProgress)
-  }
-  logInfo(messages$runCompleted(getElapsedTime(t0), "Monte Carlo Sampling"))
-  # Transform list to table
+  )
   mcRatioData <- do.call("rbind", listOfMCRatioData)
+  return(mcRatioData)
+}
+
+#' @title getPKRatioSummaryFromMCSampling
+#' @description Get PK Parameter Ratio Measure From Monte Carlo Sampling
+#' @param pkData A data.frame of PK Parameter values for Population to compare
+#' @param referencePKData A data.frame of PK Parameter values for reference Population
+#' @param simulationSetName Name of simulation set
+#' @param settings A list of task settings
+#' @return A data.frame of the PK Parameter ratios summary statistics
+#' @keywords internal
+getPKRatioSummaryFromMCSampling <- function(pkData, referencePKData, simulationSetName, settings = NULL) {
+  t0 <- tic()
+  mcRepetitions <- settings$mcRepetitions %||% getDefaultMCRepetitions()
+  mcRandomSeed <- settings$mcRandomSeed %||% getDefaultMCRandomSeed()
+  logInfo(messages$monteCarlo(simulationSetName, mcRepetitions, mcRandomSeed))
+  mcRatioData <- getPKRatioSummaryFromSettings(
+    pkData = pkData,
+    referencePKData = referencePKData,
+    settings = settings
+  )
+  logInfo(messages$runCompleted(getElapsedTime(t0), "Monte Carlo Sampling"))
+
   # Get median statistics over all MC repetitions as a data.frame
   mcRatioSummary <- mcRatioData %>%
     group_by(QuantityPath, Parameter) %>%
-    summarise_all(median, .groups = "drop_last") %>%
+    summarise_all(median, .group = "drop_last") %>%
     mutate(SimulationSetName = simulationSetName, .before = everything())
 
   return(mcRatioSummary)

diff --git a/man/getPKRatioSummaryFromSettings.Rd b/man/getPKRatioSummaryFromSettings.Rd
diff --git a/tests/testthat/test-monte-carlo-sampling.R b/tests/testthat/test-monte-carlo-sampling.R
@@ -63,7 +63,7 @@ test_that("Monte Carlo Solution is close to known solution", {
     referencePKData = referenceData, 
     simulationSetName = "test",
     # With 10000 repetitions the method runs longer
-    settings = list(showProgress = FALSE, mcRepetitions = 200)
+    settings = list(showProgress = FALSE, numberOfCores = 1, mcRepetitions = 200)
   )
 
   expect_equal(
@@ -84,13 +84,13 @@ test_that("Monte Carlo Solution is repeatable", {
     pkData = comparisonData, 
     referencePKData = referenceData, 
     simulationSetName = "test",
-    settings = list(showProgress = FALSE, mcRepetitions = 100, mcRandomSeed = 3333)
+    settings = list(showProgress = FALSE, numberOfCores = 1, mcRepetitions = 100, mcRandomSeed = 3333)
   )
   mcSolution2 <- ospsuite.reportingengine:::getPKRatioSummaryFromMCSampling(
     pkData = comparisonData, 
     referencePKData = referenceData, 
     simulationSetName = "test",
-    settings = list(showProgress = FALSE, mcRepetitions = 100, mcRandomSeed = 3333)
+    settings = list(showProgress = FALSE, numberOfCores = 1, mcRepetitions = 100, mcRandomSeed = 3333)
   )
 
   expect_equal(mcSolution1, mcSolution2)