From 0168a6db562df5b6bf634b9c1a8e22f497425207 Mon Sep 17 00:00:00 2001 From: Emily de la Rua Date: Tue, 18 Feb 2025 19:15:13 -0500 Subject: [PATCH] Fix filtering function, add parameter --- R/tbl_filter.R | 85 +++++++++++----------------- R/tbl_sort.R | 87 ++++++++++++++++------------- man/tbl_filter.Rd | 21 ++++--- tests/testthat/_snaps/tbl_filter.md | 22 ++++++++ 4 files changed, 117 insertions(+), 98 deletions(-) diff --git a/R/tbl_filter.R b/R/tbl_filter.R index 781ed1ec8..5095fe30a 100644 --- a/R/tbl_filter.R +++ b/R/tbl_filter.R @@ -2,10 +2,13 @@ #' #' @description `r lifecycle::badge('experimental')`\cr #' -#' This function is used to filter hierarchical table rows. +#' This function is used to filter hierarchical table rows. Filters are not applied to summary or overall rows. #' #' @param x (`tbl_hierarchical`, `tbl_hierarchical_count`)\cr #' A hierarchical gtsummary table of class `'tbl_hierarchical'` or `'tbl_hierarchical_count'`. +#' @param keep_empty_summary (scalar `logical`)\cr +#' Logical argument indicating whether to retain summary rows corresponding to table hierarchy sections that have had +#' all rows filtered out. Default is `TRUE`. #' @inheritParams cards::ard_filter #' @inheritParams rlang::args_dots_empty #' @@ -13,15 +16,15 @@ #' The `filter` argument can be used to filter out rows of a table which do not meet the criteria provided as an #' expression. Rows can be filtered on the values of any of the possible statistics (`n`, `p`, and `N`) provided they #' are included at least once in the table, as well as the values of any `by` variables. Filtering is only applied to -#' rows that correspond to the innermost variable in the hierarchy - all outer variable (summary) rows preceding at -#' least one inner row that meets the filtering criteria are kept regardless of whether they meet the filtering criteria -#' themselves. In addition to filtering on individual statistic values, filters can be applied across the row (i.e. -#' across all `by` variable values) by using aggregate functions such as `sum()` and `mean()`. +#' rows that correspond to the innermost variable in the hierarchy - all outer variable (summary) rows are kept +#' regardless of whether they meet the filtering criteria themselves. In addition to filtering on individual statistic +#' values, filters can be applied across the row (i.e. across all `by` variable values) by using aggregate functions +#' such as `sum()` and `mean()`. #' #' Some examples of possible filters: #' - `filter = n > 5` #' - `filter = n == 2 & p < 0.05` -#' - `filter = sum(n) > 4` +#' - `filter = sum(n) >= 4` #' - `filter = mean(n) > 4 | n > 3` #' - `filter = any(n > 2 & TRTA == "Xanomeline High Dose")` #' @@ -61,7 +64,7 @@ tbl_filter <- function(x, ...) { #' @export #' @rdname tbl_filter -tbl_filter.tbl_hierarchical <- function(x, filter, ...) { +tbl_filter.tbl_hierarchical <- function(x, filter, keep_empty_summary = TRUE, ...) { set_cli_abort_call() ard_args <- attributes(x$cards$tbl_hierarchical)$args @@ -69,38 +72,7 @@ tbl_filter.tbl_hierarchical <- function(x, filter, ...) { x_ard <- x$cards$tbl_hierarchical # add dummy rows for variables not in include so their label rows are filtered correctly - not_incl <- setdiff(ard_args$variables, ard_args$include) - if (length(not_incl) > 0) { - cli::cli_inform( - "Not all hierarchy variables present in the table were included in the {.arg include} argument. - These variables ({not_incl}) do not have event rate data available so the total sum of the event rates - for this hierarchy section will be used instead. To use true event rates for all sections of the table, - set {.code include = everything()} when creating your table via {.fun tbl_hierarchical}." - ) - - for (v in not_incl) { - i <- length(ard_args$by) + which(ard_args$variables == v) - x_sum_rows <- x_ard |> - dplyr::group_by(across(all_of(cards::all_ard_group_n((length(ard_args$by) + 1):i)))) |> - dplyr::group_map(function(.df, .g) { - g_cur <- .g[[ncol(.g) - 1]] - if (!is.na(g_cur) && g_cur == v) { - # dummy summary row to add in - .df[1, ] |> mutate( - variable = g_cur, - variable_level = .g[[ncol(.g)]], - stat_name = "no_stat", - stat = list(0), - tmp = TRUE - ) - } else { - NULL - } - }, .keep = TRUE) - - x_ard <- x_ard |> dplyr::bind_rows(x_sum_rows) - } - } + x_ard <- x_ard |> .append_not_incl(ard_args) # add indices to ARD x_ard <- x_ard |> @@ -160,26 +132,37 @@ tbl_filter.tbl_hierarchical <- function(x, filter, ...) { unique() |> setdiff(rm_idx) + # apply filtering while retaining original row order + idx_filter <- intersect(x$table_body$idx_nofilter, idx_filter) + x$table_body <- x$table_body[match(idx_filter, x$table_body$idx_nofilter), ] + if ("tmp" %in% names(x_ard_filter)) { x_ard_filter <- x_ard_filter |> dplyr::filter(is.na(tmp)) |> select(-"tmp") } - # update x$cards - x$cards$tbl_hierarchical <- x_ard_filter |> select(-"idx_nofilter") + # remove summary rows from empty sections if requested + if (!keep_empty_summary) { + if (!dplyr::last(ard_args$variables) %in% x$table_body$variable) { + x$table_body <- x$table_body |> dplyr::filter(!variable %in% outer_cols) + x_ard_filter <- x_ard_filter |> dplyr::filter(!variable %in% outer_cols) + } else { + for (v in rev(outer_cols)) { + empty_rows <- x$table_body |> + dplyr::filter(variable == dplyr::lead(variable) & variable == v) |> + dplyr::pull("idx_nofilter") + x$table_body <- x$table_body |> dplyr::filter(!idx_nofilter %in% empty_rows) + x_ard_filter <- x_ard_filter |> dplyr::filter(!idx_nofilter %in% empty_rows) + } + } + } # update x$table_body - x$table_body <- x$table_body[match(idx_filter, x$table_body$idx_nofilter), ] |> select(-"idx_nofilter") + x$table_body <- x$table_body |> select(-"idx_nofilter") - x + # update x$cards + x$cards$tbl_hierarchical <- x_ard_filter |> select(-"idx_nofilter") - # if (nrow(x$table_body) > 0) { - # cli::cli_inform( - # "For readability, all summary rows preceding at least one row that meets the filtering criteria are kept - # regardless of whether they meet the filtering criteria themselves.", - # .frequency = "once", - # .frequency_id = "sum_rows_lt" - # ) - # } + x } diff --git a/R/tbl_sort.R b/R/tbl_sort.R index 04f5bc0d7..0f2b6aa76 100644 --- a/R/tbl_sort.R +++ b/R/tbl_sort.R @@ -60,46 +60,7 @@ tbl_sort.tbl_hierarchical <- function(x, sort = "descending", ...) { x_ard <- x$cards$tbl_hierarchical # add dummy rows for variables not in include so their label rows are sorted correctly - not_incl <- setdiff(ard_args$variables, ard_args$include) - if (length(not_incl) > 0) { - cli::cli_inform( - "Not all hierarchy variables present in the table were included in the {.arg include} argument. - These variables ({not_incl}) do not have event rate data available so the total sum of the event rates - for this hierarchy section will be used instead. To use true event rates for all sections of the table, - set {.code include = everything()} when creating your table via {.fun tbl_hierarchical}." - ) - - for (v in not_incl) { - i <- length(ard_args$by) + which(ard_args$variables == v) - x_sum_rows <- x_ard |> - dplyr::group_by(across(all_of(cards::all_ard_group_n((length(ard_args$by) + 1):i)))) |> - dplyr::group_map(function(.df, .g) { - # get pseudo-summary row stat value for descending sort - if (sort == "descending") { - stat_nm <- setdiff(.df$stat_name, "N")[1] - sum <- .df |> - dplyr::filter(stat_name == !!stat_nm) |> - dplyr::summarize(s = sum(unlist(stat))) |> - dplyr::pull(s) - } - g_cur <- .g[[ncol(.g) - 1]] - if (!is.na(g_cur) && g_cur == v) { - # dummy summary row to add in - .df[1, ] |> mutate( - variable = g_cur, - variable_level = .g[[ncol(.g)]], - stat_name = if (sort == "descending") stat_nm else "no_stat", - stat = if (sort == "descending") list(sum) else list(0), - tmp = TRUE - ) - } else { - NULL - } - }, .keep = TRUE) - - x_ard <- x_ard |> dplyr::bind_rows(x_sum_rows) - } - } + x_ard <- x_ard |> .append_not_incl(ard_args, sort) # add indices to ARD x_ard <- x_ard |> @@ -173,3 +134,49 @@ tbl_sort.tbl_hierarchical <- function(x, sort = "descending", ...) { x } + +.append_not_incl <- function(x, ard_args, sort = NULL) { + # add dummy rows for variables not in include so their label rows are sorted correctly + not_incl <- setdiff(ard_args$variables, ard_args$include) + if (length(not_incl) > 0) { + cli::cli_inform( + "Not all hierarchy variables present in the table were included in the {.arg include} argument. + These variables ({not_incl}) do not have event rate data available so the total sum of the event rates + for this hierarchy section will be used instead. To use true event rates for all sections of the table, + set {.code include = everything()} when creating your table via {.fun tbl_hierarchical}." + ) + + for (v in not_incl) { + i <- length(ard_args$by) + which(ard_args$variables == v) + x_sum_rows <- x |> + dplyr::group_by(across(all_of(cards::all_ard_group_n((length(ard_args$by) + 1):i)))) |> + dplyr::group_map(function(.df, .g) { + # get pseudo-summary row stat value for descending sort + if (!is.null(sort) && sort == "descending") { + stat_nm <- setdiff(.df$stat_name, "N")[1] + sum <- .df |> + dplyr::filter(stat_name == !!stat_nm) |> + dplyr::summarize(s = sum(unlist(stat))) |> + dplyr::pull(s) + } + g_cur <- .g[[ncol(.g) - 1]] + if (!is.na(g_cur) && g_cur == v) { + # dummy summary row to add in + .df[1, ] |> mutate( + variable = g_cur, + variable_level = .g[[ncol(.g)]], + stat_name = if (!is.null(sort) && sort == "descending") stat_nm else "no_stat", + stat = if (!is.null(sort) && sort == "descending") list(sum) else list(0), + tmp = TRUE + ) + } else { + NULL + } + }, .keep = TRUE) + + x <- x |> dplyr::bind_rows(x_sum_rows) + } + } + + x +} diff --git a/man/tbl_filter.Rd b/man/tbl_filter.Rd index 640ec7264..0de42e998 100644 --- a/man/tbl_filter.Rd +++ b/man/tbl_filter.Rd @@ -7,13 +7,20 @@ \usage{ tbl_filter(x, ...) -\method{tbl_filter}{tbl_hierarchical}(x, filter, ...) +\method{tbl_filter}{tbl_hierarchical}(x, filter, keep_empty_summary = TRUE, ...) } \arguments{ \item{x}{(\code{tbl_hierarchical}, \code{tbl_hierarchical_count})\cr A hierarchical gtsummary table of class \code{'tbl_hierarchical'} or \code{'tbl_hierarchical_count'}.} \item{...}{These dots are for future extensions and must be empty.} + +\item{filter}{(\code{expression})\cr an expression that is used to filter rows of the hierarchical ARD. See the Details +section below for more information.} + +\item{keep_empty_summary}{(scalar \code{logical})\cr +Logical argument indicating whether to retain summary rows corresponding to table hierarchy sections that have had +all rows filtered out. Default is \code{TRUE}.} } \value{ A \code{gtsummary} of the same class as \code{x}. @@ -21,22 +28,22 @@ A \code{gtsummary} of the same class as \code{x}. \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}}\cr -This function is used to filter hierarchical table rows. +This function is used to filter hierarchical table rows. Filters are not applied to summary or overall rows. } \details{ The \code{filter} argument can be used to filter out rows of a table which do not meet the criteria provided as an expression. Rows can be filtered on the values of any of the possible statistics (\code{n}, \code{p}, and \code{N}) provided they are included at least once in the table, as well as the values of any \code{by} variables. Filtering is only applied to -rows that correspond to the innermost variable in the hierarchy - all outer variable (summary) rows preceding at -least one inner row that meets the filtering criteria are kept regardless of whether they meet the filtering criteria -themselves. In addition to filtering on individual statistic values, filters can be applied across the row (i.e. -across all \code{by} variable values) by using aggregate functions such as \code{sum()} and \code{mean()}. +rows that correspond to the innermost variable in the hierarchy - all outer variable (summary) rows are kept +regardless of whether they meet the filtering criteria themselves. In addition to filtering on individual statistic +values, filters can be applied across the row (i.e. across all \code{by} variable values) by using aggregate functions +such as \code{sum()} and \code{mean()}. Some examples of possible filters: \itemize{ \item \code{filter = n > 5} \item \code{filter = n == 2 & p < 0.05} -\item \code{filter = sum(n) > 4} +\item \code{filter = sum(n) >= 4} \item \code{filter = mean(n) > 4 | n > 3} \item \code{filter = any(n > 2 & TRTA == "Xanomeline High Dose")} } diff --git a/tests/testthat/_snaps/tbl_filter.md b/tests/testthat/_snaps/tbl_filter.md index 82b4ca8d0..cdca373e9 100644 --- a/tests/testthat/_snaps/tbl_filter.md +++ b/tests/testthat/_snaps/tbl_filter.md @@ -1,3 +1,25 @@ +# tbl_filter.tbl_hierarchical() works + + Code + as.data.frame(tbl) + Output + **Sex** \n    **Race** \n        **Reported Term for the Adverse Event** **Placebo** \nN = 86 **Xanomeline High Dose** \nN = 84 **Xanomeline Low Dose** \nN = 84 + 1 Number of patients with event 26 (30%) 42 (50%) 40 (48%) + 2 F 13 (25%) 18 (45%) 23 (46%) + 3 BLACK OR AFRICAN AMERICAN 3 (60%) 4 (67%) 3 (50%) + 4 WHITE 10 (21%) 14 (41%) 20 (45%) + 5 APPLICATION SITE PRURITUS 2 (4.2%) 8 (24%) 10 (23%) + 6 ERYTHEMA 6 (13%) 6 (18%) 8 (18%) + 7 APPLICATION SITE ERYTHEMA 2 (4.2%) 5 (15%) 5 (11%) + 8 M 13 (39%) 24 (55%) 17 (50%) + 9 AMERICAN INDIAN OR ALASKA NATIVE 0 (NA%) 1 (100%) 0 (NA%) + 10 BLACK OR AFRICAN AMERICAN 1 (33%) 1 (33%) 0 (NA%) + 11 WHITE 12 (40%) 22 (55%) 17 (50%) + 12 APPLICATION SITE PRURITUS 1 (3.3%) 12 (30%) 10 (29%) + 13 DIARRHOEA 6 (20%) 3 (7.5%) 2 (5.9%) + 14 ERYTHEMA 3 (10%) 5 (13%) 6 (18%) + 15 APPLICATION SITE ERYTHEMA 1 (3.3%) 10 (25%) 7 (21%) + # tbl_filter.tbl_hierarchical() error messaging works Code