From 9e9ff2b6bd7c39f4107007b2bad5b1917d6c0ff1 Mon Sep 17 00:00:00 2001 From: pchelle Date: Wed, 25 Oct 2023 04:10:12 -0400 Subject: [PATCH] Fixes #1086 ratios data.frame is pivoted into matrix --- R/utilities-ratio-comparison.R | 236 +++++++++++++++--------- man/getMonteCarloMedians.Rd | 18 ++ man/getPKRatioSummaryFromSettings.Rd | 4 +- man/getPKRatioSummaryStatistics.Rd | 12 +- man/summaryStatisticsToDataFrame.Rd | 18 ++ tests/testthat/test-pop-pk-parameters.R | 2 +- 6 files changed, 194 insertions(+), 96 deletions(-) create mode 100644 man/getMonteCarloMedians.Rd create mode 100644 man/summaryStatisticsToDataFrame.Rd diff --git a/R/utilities-ratio-comparison.R b/R/utilities-ratio-comparison.R index 795ab1c7..f608d8ab 100644 --- a/R/utilities-ratio-comparison.R +++ b/R/utilities-ratio-comparison.R @@ -74,9 +74,13 @@ getPKRatioSummaryForDifferentPopulations <- function(structureSet, referenceSet, filePath = referenceSet$pkAnalysisResultsFileNames, simulation = ospsuite::loadSimulation(referenceSet$simulationSet$simulationFile) ) - # Use arrange to ensure Ids are consistent between PK parameters and output paths - pkData <- ospsuite::pkAnalysesToTibble(pkAnalyses) %>% arrange(IndividualId) - referencePKData <- ospsuite::pkAnalysesToTibble(referencePKAnalyses) %>% arrange(IndividualId) + # Use arrange to ensure Ids, QuantityPath and Parameter are consistent between PK parameters and output paths + pkData <- ospsuite::pkAnalysesToTibble(pkAnalyses) %>% + select(IndividualId, QuantityPath, Parameter, Value) %>% + arrange(IndividualId, QuantityPath, Parameter) + referencePKData <- ospsuite::pkAnalysesToTibble(referencePKAnalyses) %>% + select(IndividualId, QuantityPath, Parameter, Value) %>% + arrange(IndividualId, QuantityPath, Parameter) # Check that both PK data to be compared are included in reference PK data validateIsIncluded(unique(pkData$QuantityPath), unique(referencePKData$QuantityPath)) @@ -117,17 +121,47 @@ getPKRatioSummaryForSamePopulation <- function(structureSet, referenceSet) { filePath = referenceSet$pkAnalysisResultsFileNames, simulation = ospsuite::loadSimulation(referenceSet$simulationSet$simulationFile) ) - pkData <- ospsuite::pkAnalysesToTibble(pkAnalyses) - referencePKData <- ospsuite::pkAnalysesToTibble(referencePKAnalyses) + pkData <- ospsuite::pkAnalysesToTibble(pkAnalyses) %>% + select(IndividualId, QuantityPath, Parameter, Value) %>% + arrange(IndividualId, QuantityPath, Parameter) + referencePKData <- ospsuite::pkAnalysesToTibble(referencePKAnalyses) %>% + select(IndividualId, QuantityPath, Parameter, Value) %>% + arrange(IndividualId, QuantityPath, Parameter) # Check that both PK data to be compared are included in reference PK data validateIsIncluded(unique(pkData$QuantityPath), unique(referencePKData$QuantityPath)) validateIsIncluded(unique(pkData$Parameter), unique(referencePKData$Parameter)) - + + # Pivot table to get fast computation of ratios and their statistics + quantityPaths <- unique(pkData$QuantityPath) + parameters <- unique(pkData$Parameter) + + pkRatioSummaryGroups <- data.frame( + SimulationSetName = structureSet$simulationSet$simulationSetName, + QuantityPath = rep(quantityPaths, each = length(parameters)), + Parameter = rep(parameters, length(quantityPaths)) + ) + + pkData <- pkData %>% + tidyr::pivot_wider( + names_from = c(QuantityPath, Parameter), + values_from = Value + ) + referencePKData <- referencePKData %>% + tidyr::pivot_wider( + names_from = c(QuantityPath, Parameter), + values_from = Value + ) + pkRatioSummary <- getPKRatioSummaryStatistics( - pkData = pkData, - referencePKData = referencePKData, - simulationSetName = structureSet$simulationSet$simulationSetName + pkData = as.matrix(pkData), + # In case reference data had more paths and parameters, restrict to those in pkData + referencePKData = as.matrix(referencePKData[,names(referencePKData) %in% names(pkData)]) ) + # Combine stats to their parameters, paths and simulation set name + pkRatioSummary <- bind_cols( + pkRatioSummaryGroups, + summaryStatisticsToDataFrame(pkRatioSummary) + ) return(pkRatioSummary) } @@ -135,40 +169,72 @@ getPKRatioSummaryForSamePopulation <- function(structureSet, referenceSet) { #' @title getPKRatioSummaryStatistics #' @description #' Calculate and save summary statistics of ratios of PK parameters -#' @param pkData A data.frame of PK Parameter values for Population to compare -#' @param referencePKData A data.frame of PK Parameter values for reference Population -#' @param simulationSetName Name of simulation set -#' @return A data.frame of the PK Parameter ratios summary statistics +#' Note that this function computes on matrix objects to be faster +#' than on data.frame when Monte Carlo simulation is performed +#' @param pkData A matrix of PK Parameter values for Population to compare +#' @param referencePKData A matrix of PK Parameter values for reference Population +#' @return A matrix of the PK Parameter ratios summary statistics #' @keywords internal -#' @import dplyr -getPKRatioSummaryStatistics <- function(pkData, referencePKData, simulationSetName) { - # Use left join to include only comparable data - # in case reference includes additional parameters or paths - ratioData <- left_join( - pkData, - referencePKData, - by = c("IndividualId", "QuantityPath", "Parameter"), - suffix = c("", "Ref") - ) %>% mutate(Ratio = Value / ValueRef) +getPKRatioSummaryStatistics <- function(pkData, referencePKData) { + # Since data.frame objects were pivoted to get matrix objects + # Rows contained unique individuals while + # columns contained all combinations of QuantityPath and Parameter + # Since the object is a matrix of numeric values, calculation is vectorized + ratioData <- pkData / referencePKData + ratioSummary <- rbind( + # Apply with MARGIN = 2 means that calculation is performed by columns + N = apply(ratioData, 2, FUN = function(x){sum(!is.na(x))}), + apply(ratioData, 2, FUN = function(x){quantile(x, probs = c(0.05,0.25,0.5,0.75,0.95), na.rm = TRUE)}), + Mean = apply(ratioData, 2, FUN = function(x){mean(x, na.rm = TRUE)}), + SD = apply(ratioData, 2, FUN = function(x){sd(x, na.rm = TRUE)}), + GeoMean = apply(ratioData, 2, FUN = function(x){exp(mean(log(x), na.rm = TRUE))}), + GeoSD = apply(ratioData, 2, FUN = function(x){exp(sd(log(x), na.rm = TRUE))}) + ) + return(ratioSummary) +} - # Data summary is applied to subgroups of QuantityPath and Parameter - ratioSummary <- ratioData[, c("QuantityPath", "Parameter", "Ratio")] %>% - summarise( - N = n(), - Perc5 = quantile(Ratio, probs = 0.05, na.rm = TRUE), - Perc25 = quantile(Ratio, probs = 0.25, na.rm = TRUE), - Perc50 = quantile(Ratio, probs = 0.5, na.rm = TRUE), - Perc75 = quantile(Ratio, probs = 0.75, na.rm = TRUE), - Perc95 = quantile(Ratio, probs = 0.95, na.rm = TRUE), - Mean = mean(Ratio, na.rm = TRUE), - SD = sd(Ratio, na.rm = TRUE), - GeoMean = exp(mean(log(Ratio), na.rm = TRUE)), - GeoSD = exp(sd(log(Ratio), na.rm = TRUE)), - .by = c("QuantityPath", "Parameter") - ) %>% - mutate(SimulationSetName = simulationSetName, .before = everything()) +#' @title summaryStatisticsToDataFrame +#' @description +#' Translate matrix of summary statistics to data.frame with appropriate names +#' @param data A matrix of summary statistics +#' @return A data.frame displaying summary statistics by column +#' @keywords internal +summaryStatisticsToDataFrame <- function(data, monteCarlo = FALSE){ + # For same population, summary statistics is by row and needs to be transposed + # For different populations, summary statistics is directly by column (because of sapply usage) + if(!monteCarlo){ + data <- t(data) + } + data <- as.data.frame(data) + names(data) <- c("N", paste0("Perc", c(5,25,50,75,95)), "Mean", "SD", "GeoMean", "GeoSD") + return(data) +} - return(as.data.frame(ratioSummary)) +#' @title getMonteCarloMedians +#' @description +#' Get all median values from a list of Monte Carlo simulations +#' @param listOfData A list of summary statistics matrices +#' @return A data.frame displaying summary statistics by column +#' @keywords internal +getMonteCarloMedians <- function(listOfData){ + # Translate list of matrix objects into a single matrix object + # The matrix object has 10 rows of summary statistics, + # repeated as many times as the number of Monte Carlo simulations + data <- do.call("rbind", listOfData) + medianData <- sapply( + 1:10, + function(statsIndex){ + selectedRows <- statsIndex + seq(0, nrow(data)-10, 10) + # Get median of each column per selected stats + apply(data[selectedRows,], 2, median) + } + ) + medianData <- summaryStatisticsToDataFrame( + # First row is summary stat for individual ids and needs to be removed + tail(medianData, -1), + monteCarlo = TRUE + ) + return(medianData) } #' @title mcSampling @@ -247,16 +313,16 @@ getPKRatioSummaryFromAnalyticalSolution <- function(pkData, referencePKData, sim #' @title getPKRatioSummaryFromSettings #' @description Get PK Parameter Ratio Measure From Monte Carlo Sampling -#' @param pkData A data.frame of PK Parameter values for Population to compare -#' @param referencePKData A data.frame of PK Parameter values for reference Population +#' @param pkData A matrix of PK Parameter values for Population to compare +#' @param referencePKData A matrix of PK Parameter values for reference Population #' @param settings A list of task settings #' @return A data.frame of the PK Parameter ratios summary statistics #' @keywords internal getPKRatioSummaryFromSettings <- function(pkData, referencePKData, settings) { # Sample from largest population if size is different # Create a list of Sampled PK Parameters for each MC repetition and calculate their Ratio - pkSize <- length(unique(pkData$IndividualId)) - referenceSize <- length(unique(referencePKData$IndividualId)) + pkSize <- nrow(pkData) + referenceSize <- nrow(referencePKData) mcRepetitions <- settings$mcRepetitions %||% getDefaultMCRepetitions() mcRandomSeed <- settings$mcRandomSeed %||% getDefaultMCRandomSeed() @@ -285,11 +351,8 @@ getPKRatioSummaryFromSettings <- function(pkData, referencePKData, settings) { # Parallel code requires that some objects are exported and to use dplyr package # Re-import and rename function in this specific environment to be exported on clusters getParallelSummaryStatistics <- getPKRatioSummaryStatistics - invisible(parallel::clusterEvalQ(cl, library(dplyr))) invisible(parallel::clusterExport(cl, c( "selectedIds", - "pkSize", - "referenceSize", "pkData", "referencePKData", "getParallelSummaryStatistics" @@ -302,12 +365,8 @@ getPKRatioSummaryFromSettings <- function(pkData, referencePKData, settings) { seq_along(selectedIds), function(repetition) { getParallelSummaryStatistics( - pkData = pkData %>% - mutate(IndividualId = 1:pkSize, .by = c("QuantityPath", "Parameter")), - referencePKData = referencePKData %>% - slice(selectedIds[[repetition]], .by = c("QuantityPath", "Parameter")) %>% - mutate(IndividualId = 1:pkSize, .by = c("QuantityPath", "Parameter")), - simulationSetName = repetition + pkData = pkData, + referencePKData = referencePKData[selectedIds[[repetition]], ] ) } ), @@ -316,18 +375,13 @@ getPKRatioSummaryFromSettings <- function(pkData, referencePKData, settings) { seq_along(selectedIds), function(repetition) { getParallelSummaryStatistics( - pkData %>% - slice(selectedIds[[repetition]], .by = c("QuantityPath", "Parameter")) %>% - mutate(IndividualId = 1:referenceSize, .by = c("QuantityPath", "Parameter")), - referencePKData = referencePKData %>% - mutate(IndividualId = 1:referenceSize, .by = c("QuantityPath", "Parameter")), - simulationSetName = repetition + pkData = pkData[selectedIds[[repetition]], ], + referencePKData = referencePKData ) } ) ) - mcRatioData <- do.call("rbind", listOfMCRatioData) - return(mcRatioData) + return(getMonteCarloMedians(listOfMCRatioData)) } if (settings$showProgress) { @@ -342,15 +396,11 @@ getPKRatioSummaryFromSettings <- function(pkData, referencePKData, settings) { if (settings$showProgress) { setTxtProgressBar(loadingProgress, value = repetition) } - # data.frame is arranged by increasing values of IndividualId - # since group_by is slow, .by is used instead + # With pivoted matrix objects, there is no need to group_by rows correspond to unique individual ids + # thus, smaller population can be used as is and wider only needs to be subset by simulated Monte Carlo rows getPKRatioSummaryStatistics( - pkData = pkData %>% - mutate(IndividualId = 1:pkSize, .by = c("QuantityPath", "Parameter")), - referencePKData = referencePKData %>% - slice(selectedIds[[repetition]], .by = c("QuantityPath", "Parameter")) %>% - mutate(IndividualId = 1:pkSize, .by = c("QuantityPath", "Parameter")), - simulationSetName = repetition + pkData = pkData, + referencePKData = referencePKData[selectedIds[[repetition]], ] ) } ), @@ -362,18 +412,13 @@ getPKRatioSummaryFromSettings <- function(pkData, referencePKData, settings) { } # data.frame is arranged by increasing values of IndividualId getPKRatioSummaryStatistics( - pkData %>% - slice(selectedIds[[repetition]], .by = c("QuantityPath", "Parameter")) %>% - mutate(IndividualId = 1:referenceSize, .by = c("QuantityPath", "Parameter")), - referencePKData = referencePKData %>% - mutate(IndividualId = 1:referenceSize, .by = c("QuantityPath", "Parameter")), - simulationSetName = repetition + pkData = pkData[selectedIds[[repetition]], ], + referencePKData = referencePKData ) } ) ) - mcRatioData <- do.call("rbind", listOfMCRatioData) - return(mcRatioData) + return(getMonteCarloMedians(listOfMCRatioData)) } #' @title getPKRatioSummaryFromMCSampling @@ -389,20 +434,37 @@ getPKRatioSummaryFromMCSampling <- function(pkData, referencePKData, simulationS mcRepetitions <- settings$mcRepetitions %||% getDefaultMCRepetitions() mcRandomSeed <- settings$mcRandomSeed %||% getDefaultMCRandomSeed() logInfo(messages$monteCarlo(simulationSetName, mcRepetitions, mcRandomSeed)) - mcRatioData <- getPKRatioSummaryFromSettings( - pkData = pkData, - referencePKData = referencePKData, + + # Pivot table to get fast computation of ratios and their statistics + quantityPaths <- unique(pkData$QuantityPath) + parameters <- unique(pkData$Parameter) + + pkRatioSummaryGroups <- data.frame( + SimulationSetName = simulationSetName, + QuantityPath = rep(quantityPaths, each = length(parameters)), + Parameter = rep(parameters, length(quantityPaths)) + ) + + pkData <- pkData %>% + tidyr::pivot_wider( + names_from = c(QuantityPath, Parameter), + values_from = Value + ) + referencePKData <- referencePKData %>% + tidyr::pivot_wider( + names_from = c(QuantityPath, Parameter), + values_from = Value + ) + + pkRatioSummary <- getPKRatioSummaryFromSettings( + pkData = as.matrix(pkData), + referencePKData = as.matrix(referencePKData[,names(referencePKData) %in% names(pkData)]), settings = settings ) + pkRatioSummary <- bind_cols(pkRatioSummaryGroups, pkRatioSummary) logInfo(messages$runCompleted(getElapsedTime(t0), "Monte Carlo Sampling")) - # Get median statistics over all MC repetitions as a data.frame - mcRatioSummary <- mcRatioData %>% - group_by(QuantityPath, Parameter) %>% - summarise_all(median, .group = "drop_last") %>% - mutate(SimulationSetName = simulationSetName, .before = everything()) - - return(mcRatioSummary) + return(pkRatioSummary) } #' @title ratioBoxplot diff --git a/man/getMonteCarloMedians.Rd b/man/getMonteCarloMedians.Rd new file mode 100644 index 00000000..3c8fe6c2 --- /dev/null +++ b/man/getMonteCarloMedians.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities-ratio-comparison.R +\name{getMonteCarloMedians} +\alias{getMonteCarloMedians} +\title{getMonteCarloMedians} +\usage{ +getMonteCarloMedians(listOfData) +} +\arguments{ +\item{listOfData}{A list of summary statistics matrices} +} +\value{ +A data.frame displaying summary statistics by column +} +\description{ +Get all median values from a list of Monte Carlo simulations +} +\keyword{internal} diff --git a/man/getPKRatioSummaryFromSettings.Rd b/man/getPKRatioSummaryFromSettings.Rd index 5c7c69b2..ff82d465 100644 --- a/man/getPKRatioSummaryFromSettings.Rd +++ b/man/getPKRatioSummaryFromSettings.Rd @@ -7,9 +7,9 @@ getPKRatioSummaryFromSettings(pkData, referencePKData, settings) } \arguments{ -\item{pkData}{A data.frame of PK Parameter values for Population to compare} +\item{pkData}{A matrix of PK Parameter values for Population to compare} -\item{referencePKData}{A data.frame of PK Parameter values for reference Population} +\item{referencePKData}{A matrix of PK Parameter values for reference Population} \item{settings}{A list of task settings} } diff --git a/man/getPKRatioSummaryStatistics.Rd b/man/getPKRatioSummaryStatistics.Rd index 31d23b36..0a44454b 100644 --- a/man/getPKRatioSummaryStatistics.Rd +++ b/man/getPKRatioSummaryStatistics.Rd @@ -4,19 +4,19 @@ \alias{getPKRatioSummaryStatistics} \title{getPKRatioSummaryStatistics} \usage{ -getPKRatioSummaryStatistics(pkData, referencePKData, simulationSetName) +getPKRatioSummaryStatistics(pkData, referencePKData) } \arguments{ -\item{pkData}{A data.frame of PK Parameter values for Population to compare} +\item{pkData}{A matrix of PK Parameter values for Population to compare} -\item{referencePKData}{A data.frame of PK Parameter values for reference Population} - -\item{simulationSetName}{Name of simulation set} +\item{referencePKData}{A matrix of PK Parameter values for reference Population} } \value{ -A data.frame of the PK Parameter ratios summary statistics +A matrix of the PK Parameter ratios summary statistics } \description{ Calculate and save summary statistics of ratios of PK parameters +Note that this function computes on matrix objects to be faster +than on data.frame when Monte Carlo simulation is performed } \keyword{internal} diff --git a/man/summaryStatisticsToDataFrame.Rd b/man/summaryStatisticsToDataFrame.Rd new file mode 100644 index 00000000..a987fa7b --- /dev/null +++ b/man/summaryStatisticsToDataFrame.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities-ratio-comparison.R +\name{summaryStatisticsToDataFrame} +\alias{summaryStatisticsToDataFrame} +\title{summaryStatisticsToDataFrame} +\usage{ +summaryStatisticsToDataFrame(data, monteCarlo = FALSE) +} +\arguments{ +\item{data}{A matrix of summary statistics} +} +\value{ +A data.frame displaying summary statistics by column +} +\description{ +Translate matrix of summary statistics to data.frame with appropriate names +} +\keyword{internal} diff --git a/tests/testthat/test-pop-pk-parameters.R b/tests/testthat/test-pop-pk-parameters.R index a76a424d..f49ed50a 100644 --- a/tests/testthat/test-pop-pk-parameters.R +++ b/tests/testthat/test-pop-pk-parameters.R @@ -150,4 +150,4 @@ updatePKParameter("AUC_tEnd", displayName = "AUC_tEnd", displayUnit = "µmol*min # Clear test workflow folders unlink(workflowPediatric$workflowFolder, recursive = TRUE) unlink(workflowParallel$workflowFolder, recursive = TRUE) -#unlink(workflowRatio$workflowFolder, recursive = TRUE) +unlink(workflowRatio$workflowFolder, recursive = TRUE)