From 0168a6db562df5b6bf634b9c1a8e22f497425207 Mon Sep 17 00:00:00 2001
From: Emily de la Rua <emily.de_la_rua@contractors.roche.com>
Date: Tue, 18 Feb 2025 19:15:13 -0500
Subject: [PATCH] Fix filtering function, add parameter

---
 R/tbl_filter.R                      | 85 +++++++++++-----------------
 R/tbl_sort.R                        | 87 ++++++++++++++++-------------
 man/tbl_filter.Rd                   | 21 ++++---
 tests/testthat/_snaps/tbl_filter.md | 22 ++++++++
 4 files changed, 117 insertions(+), 98 deletions(-)

diff --git a/R/tbl_filter.R b/R/tbl_filter.R
index 781ed1ec8..5095fe30a 100644
--- a/R/tbl_filter.R
+++ b/R/tbl_filter.R
@@ -2,10 +2,13 @@
 #'
 #' @description `r lifecycle::badge('experimental')`\cr
 #'
-#' This function is used to filter hierarchical table rows.
+#' This function is used to filter hierarchical table rows. Filters are not applied to summary or overall rows.
 #'
 #' @param x (`tbl_hierarchical`, `tbl_hierarchical_count`)\cr
 #'   A hierarchical gtsummary table of class `'tbl_hierarchical'` or `'tbl_hierarchical_count'`.
+#' @param keep_empty_summary (scalar `logical`)\cr
+#'   Logical argument indicating whether to retain summary rows corresponding to table hierarchy sections that have had
+#'   all rows filtered out. Default is `TRUE`.
 #' @inheritParams cards::ard_filter
 #' @inheritParams rlang::args_dots_empty
 #'
@@ -13,15 +16,15 @@
 #' The `filter` argument can be used to filter out rows of a table which do not meet the criteria provided as an
 #' expression. Rows can be filtered on the values of any of the possible statistics (`n`, `p`, and `N`) provided they
 #' are included at least once in the table, as well as the values of any `by` variables. Filtering is only applied to
-#' rows that correspond to the innermost variable in the hierarchy - all outer variable (summary) rows preceding at
-#' least one inner row that meets the filtering criteria are kept regardless of whether they meet the filtering criteria
-#' themselves. In addition to filtering on individual statistic values, filters can be applied across the row (i.e.
-#' across all `by` variable values) by using aggregate functions such as `sum()` and `mean()`.
+#' rows that correspond to the innermost variable in the hierarchy - all outer variable (summary) rows are kept
+#' regardless of whether they meet the filtering criteria themselves. In addition to filtering on individual statistic
+#' values, filters can be applied across the row (i.e. across all `by` variable values) by using aggregate functions
+#' such as `sum()` and `mean()`.
 #'
 #' Some examples of possible filters:
 #' - `filter = n > 5`
 #' - `filter = n == 2 & p < 0.05`
-#' - `filter = sum(n) > 4`
+#' - `filter = sum(n) >= 4`
 #' - `filter = mean(n) > 4 | n > 3`
 #' - `filter = any(n > 2 & TRTA == "Xanomeline High Dose")`
 #'
@@ -61,7 +64,7 @@ tbl_filter <- function(x, ...) {
 
 #' @export
 #' @rdname tbl_filter
-tbl_filter.tbl_hierarchical <- function(x, filter, ...) {
+tbl_filter.tbl_hierarchical <- function(x, filter, keep_empty_summary = TRUE, ...) {
   set_cli_abort_call()
 
   ard_args <- attributes(x$cards$tbl_hierarchical)$args
@@ -69,38 +72,7 @@ tbl_filter.tbl_hierarchical <- function(x, filter, ...) {
   x_ard <- x$cards$tbl_hierarchical
 
   # add dummy rows for variables not in include so their label rows are filtered correctly
-  not_incl <- setdiff(ard_args$variables, ard_args$include)
-  if (length(not_incl) > 0) {
-    cli::cli_inform(
-      "Not all hierarchy variables present in the table were included in the {.arg include} argument.
-      These variables ({not_incl}) do not have event rate data available so the total sum of the event rates
-      for this hierarchy section will be used instead. To use true event rates for all sections of the table,
-      set {.code include = everything()} when creating your table via {.fun tbl_hierarchical}."
-    )
-
-    for (v in not_incl) {
-      i <- length(ard_args$by) + which(ard_args$variables == v)
-      x_sum_rows <- x_ard |>
-        dplyr::group_by(across(all_of(cards::all_ard_group_n((length(ard_args$by) + 1):i)))) |>
-        dplyr::group_map(function(.df, .g) {
-          g_cur <- .g[[ncol(.g) - 1]]
-          if (!is.na(g_cur) && g_cur == v) {
-            # dummy summary row to add in
-            .df[1, ] |> mutate(
-              variable = g_cur,
-              variable_level = .g[[ncol(.g)]],
-              stat_name = "no_stat",
-              stat = list(0),
-              tmp = TRUE
-            )
-          } else {
-            NULL
-          }
-        }, .keep = TRUE)
-
-      x_ard <- x_ard |> dplyr::bind_rows(x_sum_rows)
-    }
-  }
+  x_ard <- x_ard |> .append_not_incl(ard_args)
 
   # add indices to ARD
   x_ard <- x_ard |>
@@ -160,26 +132,37 @@ tbl_filter.tbl_hierarchical <- function(x, filter, ...) {
     unique() |>
     setdiff(rm_idx)
 
+  # apply filtering while retaining original row order
+  idx_filter <- intersect(x$table_body$idx_nofilter, idx_filter)
+  x$table_body <- x$table_body[match(idx_filter, x$table_body$idx_nofilter), ]
+
   if ("tmp" %in% names(x_ard_filter)) {
     x_ard_filter <- x_ard_filter |>
       dplyr::filter(is.na(tmp)) |>
       select(-"tmp")
   }
 
-  # update x$cards
-  x$cards$tbl_hierarchical <- x_ard_filter |> select(-"idx_nofilter")
+  # remove summary rows from empty sections if requested
+  if (!keep_empty_summary) {
+    if (!dplyr::last(ard_args$variables) %in% x$table_body$variable) {
+      x$table_body <- x$table_body |> dplyr::filter(!variable %in% outer_cols)
+      x_ard_filter <- x_ard_filter |> dplyr::filter(!variable %in% outer_cols)
+    } else {
+      for (v in rev(outer_cols)) {
+        empty_rows <- x$table_body |>
+          dplyr::filter(variable == dplyr::lead(variable) & variable == v) |>
+          dplyr::pull("idx_nofilter")
+        x$table_body <- x$table_body |> dplyr::filter(!idx_nofilter %in% empty_rows)
+        x_ard_filter <- x_ard_filter |> dplyr::filter(!idx_nofilter %in% empty_rows)
+      }
+    }
+  }
 
   # update x$table_body
-  x$table_body <- x$table_body[match(idx_filter, x$table_body$idx_nofilter), ] |> select(-"idx_nofilter")
+  x$table_body <- x$table_body |> select(-"idx_nofilter")
 
-  x
+  # update x$cards
+  x$cards$tbl_hierarchical <- x_ard_filter |> select(-"idx_nofilter")
 
-  # if (nrow(x$table_body) > 0) {
-  #     cli::cli_inform(
-  #       "For readability, all summary rows preceding at least one row that meets the filtering criteria are kept
-  #       regardless of whether they meet the filtering criteria themselves.",
-  #       .frequency = "once",
-  #       .frequency_id = "sum_rows_lt"
-  #     )
-  #   }
+  x
 }
diff --git a/R/tbl_sort.R b/R/tbl_sort.R
index 04f5bc0d7..0f2b6aa76 100644
--- a/R/tbl_sort.R
+++ b/R/tbl_sort.R
@@ -60,46 +60,7 @@ tbl_sort.tbl_hierarchical <- function(x, sort = "descending", ...) {
   x_ard <- x$cards$tbl_hierarchical
 
   # add dummy rows for variables not in include so their label rows are sorted correctly
-  not_incl <- setdiff(ard_args$variables, ard_args$include)
-  if (length(not_incl) > 0) {
-    cli::cli_inform(
-      "Not all hierarchy variables present in the table were included in the {.arg include} argument.
-      These variables ({not_incl}) do not have event rate data available so the total sum of the event rates
-      for this hierarchy section will be used instead. To use true event rates for all sections of the table,
-      set {.code include = everything()} when creating your table via {.fun tbl_hierarchical}."
-    )
-
-    for (v in not_incl) {
-      i <- length(ard_args$by) + which(ard_args$variables == v)
-      x_sum_rows <- x_ard |>
-        dplyr::group_by(across(all_of(cards::all_ard_group_n((length(ard_args$by) + 1):i)))) |>
-        dplyr::group_map(function(.df, .g) {
-          # get pseudo-summary row stat value for descending sort
-          if (sort == "descending") {
-            stat_nm <- setdiff(.df$stat_name, "N")[1]
-            sum <- .df |>
-              dplyr::filter(stat_name == !!stat_nm) |>
-              dplyr::summarize(s = sum(unlist(stat))) |>
-              dplyr::pull(s)
-          }
-          g_cur <- .g[[ncol(.g) - 1]]
-          if (!is.na(g_cur) && g_cur == v) {
-            # dummy summary row to add in
-            .df[1, ] |> mutate(
-              variable = g_cur,
-              variable_level = .g[[ncol(.g)]],
-              stat_name = if (sort == "descending") stat_nm else "no_stat",
-              stat = if (sort == "descending") list(sum) else list(0),
-              tmp = TRUE
-            )
-          } else {
-            NULL
-          }
-        }, .keep = TRUE)
-
-      x_ard <- x_ard |> dplyr::bind_rows(x_sum_rows)
-    }
-  }
+  x_ard <- x_ard |> .append_not_incl(ard_args, sort)
 
   # add indices to ARD
   x_ard <- x_ard |>
@@ -173,3 +134,49 @@ tbl_sort.tbl_hierarchical <- function(x, sort = "descending", ...) {
 
   x
 }
+
+.append_not_incl <- function(x, ard_args, sort = NULL) {
+  # add dummy rows for variables not in include so their label rows are sorted correctly
+  not_incl <- setdiff(ard_args$variables, ard_args$include)
+  if (length(not_incl) > 0) {
+    cli::cli_inform(
+      "Not all hierarchy variables present in the table were included in the {.arg include} argument.
+      These variables ({not_incl}) do not have event rate data available so the total sum of the event rates
+      for this hierarchy section will be used instead. To use true event rates for all sections of the table,
+      set {.code include = everything()} when creating your table via {.fun tbl_hierarchical}."
+    )
+
+    for (v in not_incl) {
+      i <- length(ard_args$by) + which(ard_args$variables == v)
+      x_sum_rows <- x |>
+        dplyr::group_by(across(all_of(cards::all_ard_group_n((length(ard_args$by) + 1):i)))) |>
+        dplyr::group_map(function(.df, .g) {
+          # get pseudo-summary row stat value for descending sort
+          if (!is.null(sort) && sort == "descending") {
+            stat_nm <- setdiff(.df$stat_name, "N")[1]
+            sum <- .df |>
+              dplyr::filter(stat_name == !!stat_nm) |>
+              dplyr::summarize(s = sum(unlist(stat))) |>
+              dplyr::pull(s)
+          }
+          g_cur <- .g[[ncol(.g) - 1]]
+          if (!is.na(g_cur) && g_cur == v) {
+            # dummy summary row to add in
+            .df[1, ] |> mutate(
+              variable = g_cur,
+              variable_level = .g[[ncol(.g)]],
+              stat_name = if (!is.null(sort) && sort == "descending") stat_nm else "no_stat",
+              stat = if (!is.null(sort) && sort == "descending") list(sum) else list(0),
+              tmp = TRUE
+            )
+          } else {
+            NULL
+          }
+        }, .keep = TRUE)
+
+      x <- x |> dplyr::bind_rows(x_sum_rows)
+    }
+  }
+
+  x
+}
diff --git a/man/tbl_filter.Rd b/man/tbl_filter.Rd
index 640ec7264..0de42e998 100644
--- a/man/tbl_filter.Rd
+++ b/man/tbl_filter.Rd
@@ -7,13 +7,20 @@
 \usage{
 tbl_filter(x, ...)
 
-\method{tbl_filter}{tbl_hierarchical}(x, filter, ...)
+\method{tbl_filter}{tbl_hierarchical}(x, filter, keep_empty_summary = TRUE, ...)
 }
 \arguments{
 \item{x}{(\code{tbl_hierarchical}, \code{tbl_hierarchical_count})\cr
 A hierarchical gtsummary table of class \code{'tbl_hierarchical'} or \code{'tbl_hierarchical_count'}.}
 
 \item{...}{These dots are for future extensions and must be empty.}
+
+\item{filter}{(\code{expression})\cr an expression that is used to filter rows of the hierarchical ARD. See the Details
+section below for more information.}
+
+\item{keep_empty_summary}{(scalar \code{logical})\cr
+Logical argument indicating whether to retain summary rows corresponding to table hierarchy sections that have had
+all rows filtered out. Default is \code{TRUE}.}
 }
 \value{
 A \code{gtsummary} of the same class as \code{x}.
@@ -21,22 +28,22 @@ A \code{gtsummary} of the same class as \code{x}.
 \description{
 \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}}\cr
 
-This function is used to filter hierarchical table rows.
+This function is used to filter hierarchical table rows. Filters are not applied to summary or overall rows.
 }
 \details{
 The \code{filter} argument can be used to filter out rows of a table which do not meet the criteria provided as an
 expression. Rows can be filtered on the values of any of the possible statistics (\code{n}, \code{p}, and \code{N}) provided they
 are included at least once in the table, as well as the values of any \code{by} variables. Filtering is only applied to
-rows that correspond to the innermost variable in the hierarchy - all outer variable (summary) rows preceding at
-least one inner row that meets the filtering criteria are kept regardless of whether they meet the filtering criteria
-themselves. In addition to filtering on individual statistic values, filters can be applied across the row (i.e.
-across all \code{by} variable values) by using aggregate functions such as \code{sum()} and \code{mean()}.
+rows that correspond to the innermost variable in the hierarchy - all outer variable (summary) rows are kept
+regardless of whether they meet the filtering criteria themselves. In addition to filtering on individual statistic
+values, filters can be applied across the row (i.e. across all \code{by} variable values) by using aggregate functions
+such as \code{sum()} and \code{mean()}.
 
 Some examples of possible filters:
 \itemize{
 \item \code{filter = n > 5}
 \item \code{filter = n == 2 & p < 0.05}
-\item \code{filter = sum(n) > 4}
+\item \code{filter = sum(n) >= 4}
 \item \code{filter = mean(n) > 4 | n > 3}
 \item \code{filter = any(n > 2 & TRTA == "Xanomeline High Dose")}
 }
diff --git a/tests/testthat/_snaps/tbl_filter.md b/tests/testthat/_snaps/tbl_filter.md
index 82b4ca8d0..cdca373e9 100644
--- a/tests/testthat/_snaps/tbl_filter.md
+++ b/tests/testthat/_snaps/tbl_filter.md
@@ -1,3 +1,25 @@
+# tbl_filter.tbl_hierarchical() works
+
+    Code
+      as.data.frame(tbl)
+    Output
+         **Sex**  \n    **Race**  \n        **Reported Term for the Adverse Event** **Placebo**  \nN = 86 **Xanomeline High Dose**  \nN = 84 **Xanomeline Low Dose**  \nN = 84
+      1                                               Number of patients with event              26 (30%)                           42 (50%)                          40 (48%)
+      2                                                                           F              13 (25%)                           18 (45%)                          23 (46%)
+      3                                                   BLACK OR AFRICAN AMERICAN               3 (60%)                            4 (67%)                           3 (50%)
+      4                                                                       WHITE              10 (21%)                           14 (41%)                          20 (45%)
+      5                                                   APPLICATION SITE PRURITUS              2 (4.2%)                            8 (24%)                          10 (23%)
+      6                                                                    ERYTHEMA               6 (13%)                            6 (18%)                           8 (18%)
+      7                                                   APPLICATION SITE ERYTHEMA              2 (4.2%)                            5 (15%)                           5 (11%)
+      8                                                                           M              13 (39%)                           24 (55%)                          17 (50%)
+      9                                            AMERICAN INDIAN OR ALASKA NATIVE               0 (NA%)                           1 (100%)                           0 (NA%)
+      10                                                  BLACK OR AFRICAN AMERICAN               1 (33%)                            1 (33%)                           0 (NA%)
+      11                                                                      WHITE              12 (40%)                           22 (55%)                          17 (50%)
+      12                                                  APPLICATION SITE PRURITUS              1 (3.3%)                           12 (30%)                          10 (29%)
+      13                                                                  DIARRHOEA               6 (20%)                           3 (7.5%)                          2 (5.9%)
+      14                                                                   ERYTHEMA               3 (10%)                            5 (13%)                           6 (18%)
+      15                                                  APPLICATION SITE ERYTHEMA              1 (3.3%)                           10 (25%)                           7 (21%)
+
 # tbl_filter.tbl_hierarchical() error messaging works
 
     Code