Fixes #1086 ratios data.frame is pivoted into matrix

Open-Systems-Pharmacology · Oct 25, 2023 · 9e9ff2b · 9e9ff2b
1 parent d540956
commit 9e9ff2b
Show file tree

Hide file tree

Showing 6 changed files with 194 additions and 96 deletions.
diff --git a/R/utilities-ratio-comparison.R b/R/utilities-ratio-comparison.R
@@ -74,9 +74,13 @@ getPKRatioSummaryForDifferentPopulations <- function(structureSet, referenceSet,
     filePath = referenceSet$pkAnalysisResultsFileNames,
     simulation = ospsuite::loadSimulation(referenceSet$simulationSet$simulationFile)
   )
-  # Use arrange to ensure Ids are consistent between PK parameters and output paths
-  pkData <- ospsuite::pkAnalysesToTibble(pkAnalyses) %>% arrange(IndividualId)
-  referencePKData <- ospsuite::pkAnalysesToTibble(referencePKAnalyses) %>% arrange(IndividualId)
+  # Use arrange to ensure Ids, QuantityPath and Parameter are consistent between PK parameters and output paths
+  pkData <- ospsuite::pkAnalysesToTibble(pkAnalyses) %>% 
+    select(IndividualId, QuantityPath, Parameter, Value) %>% 
+    arrange(IndividualId, QuantityPath, Parameter)
+  referencePKData <- ospsuite::pkAnalysesToTibble(referencePKAnalyses) %>% 
+    select(IndividualId, QuantityPath, Parameter, Value) %>% 
+    arrange(IndividualId, QuantityPath, Parameter)
 
   # Check that both PK data to be compared are included in reference PK data
   validateIsIncluded(unique(pkData$QuantityPath), unique(referencePKData$QuantityPath))
@@ -117,58 +121,120 @@ getPKRatioSummaryForSamePopulation <- function(structureSet, referenceSet) {
     filePath = referenceSet$pkAnalysisResultsFileNames,
     simulation = ospsuite::loadSimulation(referenceSet$simulationSet$simulationFile)
   )
-  pkData <- ospsuite::pkAnalysesToTibble(pkAnalyses)
-  referencePKData <- ospsuite::pkAnalysesToTibble(referencePKAnalyses)
+  pkData <- ospsuite::pkAnalysesToTibble(pkAnalyses) %>% 
+    select(IndividualId, QuantityPath, Parameter, Value) %>% 
+    arrange(IndividualId, QuantityPath, Parameter)
+  referencePKData <- ospsuite::pkAnalysesToTibble(referencePKAnalyses) %>% 
+    select(IndividualId, QuantityPath, Parameter, Value) %>% 
+    arrange(IndividualId, QuantityPath, Parameter)
   # Check that both PK data to be compared are included in reference PK data
   validateIsIncluded(unique(pkData$QuantityPath), unique(referencePKData$QuantityPath))
   validateIsIncluded(unique(pkData$Parameter), unique(referencePKData$Parameter))
-
+
+  # Pivot table to get fast computation of ratios and their statistics
+  quantityPaths <- unique(pkData$QuantityPath)
+  parameters <- unique(pkData$Parameter)
+
+  pkRatioSummaryGroups <- data.frame(
+    SimulationSetName = structureSet$simulationSet$simulationSetName,
+    QuantityPath = rep(quantityPaths, each = length(parameters)),
+    Parameter = rep(parameters, length(quantityPaths))
+  )
+
+  pkData <- pkData %>% 
+    tidyr::pivot_wider(
+      names_from = c(QuantityPath, Parameter), 
+      values_from = Value
+    )
+  referencePKData <- referencePKData %>% 
+    tidyr::pivot_wider(
+      names_from = c(QuantityPath, Parameter), 
+      values_from = Value
+    )
+
   pkRatioSummary <- getPKRatioSummaryStatistics(
-    pkData = pkData,
-    referencePKData = referencePKData,
-    simulationSetName = structureSet$simulationSet$simulationSetName
+    pkData = as.matrix(pkData),
+    # In case reference data had more paths and parameters, restrict to those in pkData 
+    referencePKData = as.matrix(referencePKData[,names(referencePKData) %in% names(pkData)])
   )
+  # Combine stats to their parameters, paths and simulation set name
+  pkRatioSummary <- bind_cols(
+    pkRatioSummaryGroups, 
+    summaryStatisticsToDataFrame(pkRatioSummary)
+    )
 
   return(pkRatioSummary)
 }
 
 #' @title getPKRatioSummaryStatistics
 #' @description
 #' Calculate and save summary statistics of ratios of PK parameters
-#' @param pkData A data.frame of PK Parameter values for Population to compare
-#' @param referencePKData A data.frame of PK Parameter values for reference Population
-#' @param simulationSetName Name of simulation set
-#' @return A data.frame of the PK Parameter ratios summary statistics
+#' Note that this function computes on matrix objects to be faster 
+#' than on data.frame when Monte Carlo simulation is performed
+#' @param pkData A matrix of PK Parameter values for Population to compare
+#' @param referencePKData A matrix of PK Parameter values for reference Population
+#' @return A matrix of the PK Parameter ratios summary statistics
 #' @keywords internal
-#' @import dplyr
-getPKRatioSummaryStatistics <- function(pkData, referencePKData, simulationSetName) {
-  # Use left join to include only comparable data
-  # in case reference includes additional parameters or paths
-  ratioData <- left_join(
-    pkData,
-    referencePKData,
-    by = c("IndividualId", "QuantityPath", "Parameter"),
-    suffix = c("", "Ref")
-  ) %>% mutate(Ratio = Value / ValueRef)
+getPKRatioSummaryStatistics <- function(pkData, referencePKData) {
+  # Since data.frame objects were pivoted to get matrix objects
+  # Rows contained unique individuals while
+  # columns contained all combinations of QuantityPath and Parameter
+  # Since the object is a matrix of numeric values, calculation is vectorized
+  ratioData <- pkData / referencePKData
+  ratioSummary <- rbind(
+    # Apply with MARGIN = 2 means that calculation is performed by columns
+    N = apply(ratioData, 2, FUN = function(x){sum(!is.na(x))}),
+    apply(ratioData, 2, FUN = function(x){quantile(x, probs = c(0.05,0.25,0.5,0.75,0.95), na.rm = TRUE)}),
+    Mean = apply(ratioData, 2, FUN = function(x){mean(x, na.rm = TRUE)}),
+    SD = apply(ratioData, 2, FUN = function(x){sd(x, na.rm = TRUE)}),
+    GeoMean = apply(ratioData, 2, FUN = function(x){exp(mean(log(x), na.rm = TRUE))}),
+    GeoSD = apply(ratioData, 2, FUN = function(x){exp(sd(log(x), na.rm = TRUE))})
+  )
+  return(ratioSummary)
+}
 
-  # Data summary is applied to subgroups of QuantityPath and Parameter
-  ratioSummary <- ratioData[, c("QuantityPath", "Parameter", "Ratio")] %>%
-    summarise(
-      N = n(),
-      Perc5 = quantile(Ratio, probs = 0.05, na.rm = TRUE),
-      Perc25 = quantile(Ratio, probs = 0.25, na.rm = TRUE),
-      Perc50 = quantile(Ratio, probs = 0.5, na.rm = TRUE),
-      Perc75 = quantile(Ratio, probs = 0.75, na.rm = TRUE),
-      Perc95 = quantile(Ratio, probs = 0.95, na.rm = TRUE),
-      Mean = mean(Ratio, na.rm = TRUE),
-      SD = sd(Ratio, na.rm = TRUE),
-      GeoMean = exp(mean(log(Ratio), na.rm = TRUE)),
-      GeoSD = exp(sd(log(Ratio), na.rm = TRUE)),
-      .by = c("QuantityPath", "Parameter")
-    ) %>%
-    mutate(SimulationSetName = simulationSetName, .before = everything())
+#' @title summaryStatisticsToDataFrame
+#' @description
+#' Translate matrix of summary statistics to data.frame with appropriate names
+#' @param data A matrix of summary statistics
+#' @return A data.frame displaying summary statistics by column
+#' @keywords internal
+summaryStatisticsToDataFrame <- function(data, monteCarlo = FALSE){
+  # For same population, summary statistics is by row and needs to be transposed
+  # For different populations, summary statistics is directly by column (because of sapply usage)
+  if(!monteCarlo){
+    data <- t(data)
+    }
+  data <- as.data.frame(data)
+  names(data) <- c("N", paste0("Perc", c(5,25,50,75,95)), "Mean", "SD", "GeoMean", "GeoSD")
+  return(data)
+}
 
-  return(as.data.frame(ratioSummary))
+#' @title getMonteCarloMedians
+#' @description
+#' Get all median values from a list of Monte Carlo simulations
+#' @param listOfData A list of summary statistics matrices
+#' @return A data.frame displaying summary statistics by column
+#' @keywords internal
+getMonteCarloMedians <- function(listOfData){
+  # Translate list of matrix objects into a single matrix object
+  # The matrix object has 10 rows of summary statistics, 
+  # repeated as many times as the number of Monte Carlo simulations
+  data <- do.call("rbind", listOfData)
+  medianData <- sapply(
+    1:10,
+    function(statsIndex){
+      selectedRows <- statsIndex + seq(0, nrow(data)-10, 10)
+      # Get median of each column per selected stats
+      apply(data[selectedRows,], 2, median)
+    }
+  )
+  medianData <- summaryStatisticsToDataFrame(
+    # First row is summary stat for individual ids and needs to be removed
+    tail(medianData, -1),
+    monteCarlo = TRUE
+    )
+  return(medianData)
 }
 
 #' @title mcSampling
@@ -247,16 +313,16 @@ getPKRatioSummaryFromAnalyticalSolution <- function(pkData, referencePKData, sim
 
 #' @title getPKRatioSummaryFromSettings
 #' @description Get PK Parameter Ratio Measure From Monte Carlo Sampling
-#' @param pkData A data.frame of PK Parameter values for Population to compare
-#' @param referencePKData A data.frame of PK Parameter values for reference Population
+#' @param pkData A matrix of PK Parameter values for Population to compare
+#' @param referencePKData A matrix of PK Parameter values for reference Population
 #' @param settings A list of task settings
 #' @return A data.frame of the PK Parameter ratios summary statistics
 #' @keywords internal
 getPKRatioSummaryFromSettings <- function(pkData, referencePKData, settings) {
   # Sample from largest population if size is different
   # Create a list of Sampled PK Parameters for each MC repetition and calculate their Ratio
-  pkSize <- length(unique(pkData$IndividualId))
-  referenceSize <- length(unique(referencePKData$IndividualId))
+  pkSize <- nrow(pkData)
+  referenceSize <- nrow(referencePKData)
   mcRepetitions <- settings$mcRepetitions %||% getDefaultMCRepetitions()
   mcRandomSeed <- settings$mcRandomSeed %||% getDefaultMCRandomSeed()
 
@@ -285,11 +351,8 @@ getPKRatioSummaryFromSettings <- function(pkData, referencePKData, settings) {
     # Parallel code requires that some objects are exported and to use dplyr package
     # Re-import and rename function in this specific environment to be exported on clusters
     getParallelSummaryStatistics <- getPKRatioSummaryStatistics
-    invisible(parallel::clusterEvalQ(cl, library(dplyr)))
     invisible(parallel::clusterExport(cl, c(
       "selectedIds",
-      "pkSize",
-      "referenceSize",
       "pkData",
       "referencePKData",
       "getParallelSummaryStatistics"
@@ -302,12 +365,8 @@ getPKRatioSummaryFromSettings <- function(pkData, referencePKData, settings) {
         seq_along(selectedIds),
         function(repetition) {
           getParallelSummaryStatistics(
-            pkData = pkData %>%
-              mutate(IndividualId = 1:pkSize, .by = c("QuantityPath", "Parameter")),
-            referencePKData = referencePKData %>%
-              slice(selectedIds[[repetition]], .by = c("QuantityPath", "Parameter")) %>%
-              mutate(IndividualId = 1:pkSize, .by = c("QuantityPath", "Parameter")),
-            simulationSetName = repetition
+            pkData = pkData,
+            referencePKData = referencePKData[selectedIds[[repetition]], ]
           )
         }
       ),
@@ -316,18 +375,13 @@ getPKRatioSummaryFromSettings <- function(pkData, referencePKData, settings) {
         seq_along(selectedIds),
         function(repetition) {
           getParallelSummaryStatistics(
-            pkData %>%
-              slice(selectedIds[[repetition]], .by = c("QuantityPath", "Parameter")) %>%
-              mutate(IndividualId = 1:referenceSize, .by = c("QuantityPath", "Parameter")),
-            referencePKData = referencePKData %>%
-              mutate(IndividualId = 1:referenceSize, .by = c("QuantityPath", "Parameter")),
-            simulationSetName = repetition
+            pkData = pkData[selectedIds[[repetition]], ],
+            referencePKData = referencePKData
           )
         }
       )
     )
-    mcRatioData <- do.call("rbind", listOfMCRatioData)
-    return(mcRatioData)
+    return(getMonteCarloMedians(listOfMCRatioData))
   }
 
   if (settings$showProgress) {
@@ -342,15 +396,11 @@ getPKRatioSummaryFromSettings <- function(pkData, referencePKData, settings) {
         if (settings$showProgress) {
           setTxtProgressBar(loadingProgress, value = repetition)
         }
-        # data.frame is arranged by increasing values of IndividualId
-        # since group_by is slow, .by is used instead
+        # With pivoted matrix objects, there is no need to group_by rows correspond to unique individual ids
+        # thus, smaller population can be used as is and wider only needs to be subset by simulated Monte Carlo rows
         getPKRatioSummaryStatistics(
-          pkData = pkData %>%
-            mutate(IndividualId = 1:pkSize, .by = c("QuantityPath", "Parameter")),
-          referencePKData = referencePKData %>%
-            slice(selectedIds[[repetition]], .by = c("QuantityPath", "Parameter")) %>%
-            mutate(IndividualId = 1:pkSize, .by = c("QuantityPath", "Parameter")),
-          simulationSetName = repetition
+          pkData = pkData,
+          referencePKData = referencePKData[selectedIds[[repetition]], ]
         )
       }
     ),
@@ -362,18 +412,13 @@ getPKRatioSummaryFromSettings <- function(pkData, referencePKData, settings) {
         }
         # data.frame is arranged by increasing values of IndividualId
         getPKRatioSummaryStatistics(
-          pkData %>%
-            slice(selectedIds[[repetition]], .by = c("QuantityPath", "Parameter")) %>%
-            mutate(IndividualId = 1:referenceSize, .by = c("QuantityPath", "Parameter")),
-          referencePKData = referencePKData %>%
-            mutate(IndividualId = 1:referenceSize, .by = c("QuantityPath", "Parameter")),
-          simulationSetName = repetition
+          pkData = pkData[selectedIds[[repetition]], ],
+          referencePKData = referencePKData
         )
       }
     )
   )
-  mcRatioData <- do.call("rbind", listOfMCRatioData)
-  return(mcRatioData)
+  return(getMonteCarloMedians(listOfMCRatioData))
 }
 
 #' @title getPKRatioSummaryFromMCSampling
@@ -389,20 +434,37 @@ getPKRatioSummaryFromMCSampling <- function(pkData, referencePKData, simulationS
   mcRepetitions <- settings$mcRepetitions %||% getDefaultMCRepetitions()
   mcRandomSeed <- settings$mcRandomSeed %||% getDefaultMCRandomSeed()
   logInfo(messages$monteCarlo(simulationSetName, mcRepetitions, mcRandomSeed))
-  mcRatioData <- getPKRatioSummaryFromSettings(
-    pkData = pkData,
-    referencePKData = referencePKData,
+
+  # Pivot table to get fast computation of ratios and their statistics
+  quantityPaths <- unique(pkData$QuantityPath)
+  parameters <- unique(pkData$Parameter)
+
+  pkRatioSummaryGroups <- data.frame(
+    SimulationSetName = simulationSetName,
+    QuantityPath = rep(quantityPaths, each = length(parameters)),
+    Parameter = rep(parameters, length(quantityPaths))
+  )
+
+  pkData <- pkData %>% 
+    tidyr::pivot_wider(
+      names_from = c(QuantityPath, Parameter), 
+      values_from = Value
+    )
+  referencePKData <- referencePKData %>% 
+    tidyr::pivot_wider(
+      names_from = c(QuantityPath, Parameter), 
+      values_from = Value
+    )
+
+  pkRatioSummary <- getPKRatioSummaryFromSettings(
+    pkData = as.matrix(pkData),
+    referencePKData = as.matrix(referencePKData[,names(referencePKData) %in% names(pkData)]),
     settings = settings
   )
+  pkRatioSummary <- bind_cols(pkRatioSummaryGroups, pkRatioSummary)
   logInfo(messages$runCompleted(getElapsedTime(t0), "Monte Carlo Sampling"))
 
-  # Get median statistics over all MC repetitions as a data.frame
-  mcRatioSummary <- mcRatioData %>%
-    group_by(QuantityPath, Parameter) %>%
-    summarise_all(median, .group = "drop_last") %>%
-    mutate(SimulationSetName = simulationSetName, .before = everything())
-
-  return(mcRatioSummary)
+  return(pkRatioSummary)
 }
 
 #' @title ratioBoxplot

diff --git a/man/getMonteCarloMedians.Rd b/man/getMonteCarloMedians.Rd
diff --git a/man/getPKRatioSummaryFromSettings.Rd b/man/getPKRatioSummaryFromSettings.Rd
diff --git a/man/getPKRatioSummaryStatistics.Rd b/man/getPKRatioSummaryStatistics.Rd