Fix filtering function, add parameter

ddsjoberg · Feb 19, 2025 · 0168a6d · 0168a6d
1 parent 8708969
commit 0168a6d
Show file tree

Hide file tree

Showing 4 changed files with 117 additions and 98 deletions.
diff --git a/R/tbl_filter.R b/R/tbl_filter.R
@@ -2,26 +2,29 @@
 #'
 #' @description `r lifecycle::badge('experimental')`\cr
 #'
-#' This function is used to filter hierarchical table rows.
+#' This function is used to filter hierarchical table rows. Filters are not applied to summary or overall rows.
 #'
 #' @param x (`tbl_hierarchical`, `tbl_hierarchical_count`)\cr
 #'   A hierarchical gtsummary table of class `'tbl_hierarchical'` or `'tbl_hierarchical_count'`.
+#' @param keep_empty_summary (scalar `logical`)\cr
+#'   Logical argument indicating whether to retain summary rows corresponding to table hierarchy sections that have had
+#'   all rows filtered out. Default is `TRUE`.
 #' @inheritParams cards::ard_filter
 #' @inheritParams rlang::args_dots_empty
 #'
 #' @details
 #' The `filter` argument can be used to filter out rows of a table which do not meet the criteria provided as an
 #' expression. Rows can be filtered on the values of any of the possible statistics (`n`, `p`, and `N`) provided they
 #' are included at least once in the table, as well as the values of any `by` variables. Filtering is only applied to
-#' rows that correspond to the innermost variable in the hierarchy - all outer variable (summary) rows preceding at
-#' least one inner row that meets the filtering criteria are kept regardless of whether they meet the filtering criteria
-#' themselves. In addition to filtering on individual statistic values, filters can be applied across the row (i.e.
-#' across all `by` variable values) by using aggregate functions such as `sum()` and `mean()`.
+#' rows that correspond to the innermost variable in the hierarchy - all outer variable (summary) rows are kept
+#' regardless of whether they meet the filtering criteria themselves. In addition to filtering on individual statistic
+#' values, filters can be applied across the row (i.e. across all `by` variable values) by using aggregate functions
+#' such as `sum()` and `mean()`.
 #'
 #' Some examples of possible filters:
 #' - `filter = n > 5`
 #' - `filter = n == 2 & p < 0.05`
-#' - `filter = sum(n) > 4`
+#' - `filter = sum(n) >= 4`
 #' - `filter = mean(n) > 4 | n > 3`
 #' - `filter = any(n > 2 & TRTA == "Xanomeline High Dose")`
 #'
@@ -61,46 +64,15 @@ tbl_filter <- function(x, ...) {
 
 #' @export
 #' @rdname tbl_filter
-tbl_filter.tbl_hierarchical <- function(x, filter, ...) {
+tbl_filter.tbl_hierarchical <- function(x, filter, keep_empty_summary = TRUE, ...) {
   set_cli_abort_call()
 
   ard_args <- attributes(x$cards$tbl_hierarchical)$args
   by_cols <- paste0("group", seq_along(length(ard_args$by)), c("", "_level"))
   x_ard <- x$cards$tbl_hierarchical
 
   # add dummy rows for variables not in include so their label rows are filtered correctly
-  not_incl <- setdiff(ard_args$variables, ard_args$include)
-  if (length(not_incl) > 0) {
-    cli::cli_inform(
-      "Not all hierarchy variables present in the table were included in the {.arg include} argument.
-      These variables ({not_incl}) do not have event rate data available so the total sum of the event rates
-      for this hierarchy section will be used instead. To use true event rates for all sections of the table,
-      set {.code include = everything()} when creating your table via {.fun tbl_hierarchical}."
-    )
-
-    for (v in not_incl) {
-      i <- length(ard_args$by) + which(ard_args$variables == v)
-      x_sum_rows <- x_ard |>
-        dplyr::group_by(across(all_of(cards::all_ard_group_n((length(ard_args$by) + 1):i)))) |>
-        dplyr::group_map(function(.df, .g) {
-          g_cur <- .g[[ncol(.g) - 1]]
-          if (!is.na(g_cur) && g_cur == v) {
-            # dummy summary row to add in
-            .df[1, ] |> mutate(
-              variable = g_cur,
-              variable_level = .g[[ncol(.g)]],
-              stat_name = "no_stat",
-              stat = list(0),
-              tmp = TRUE
-            )
-          } else {
-            NULL
-          }
-        }, .keep = TRUE)
-
-      x_ard <- x_ard |> dplyr::bind_rows(x_sum_rows)
-    }
-  }
+  x_ard <- x_ard |> .append_not_incl(ard_args)
 
   # add indices to ARD
   x_ard <- x_ard |>
@@ -160,26 +132,37 @@ tbl_filter.tbl_hierarchical <- function(x, filter, ...) {
     unique() |>
     setdiff(rm_idx)
 
+  # apply filtering while retaining original row order
+  idx_filter <- intersect(x$table_body$idx_nofilter, idx_filter)
+  x$table_body <- x$table_body[match(idx_filter, x$table_body$idx_nofilter), ]
+
   if ("tmp" %in% names(x_ard_filter)) {
     x_ard_filter <- x_ard_filter |>
       dplyr::filter(is.na(tmp)) |>
       select(-"tmp")
   }
 
-  # update x$cards
-  x$cards$tbl_hierarchical <- x_ard_filter |> select(-"idx_nofilter")
+  # remove summary rows from empty sections if requested
+  if (!keep_empty_summary) {
+    if (!dplyr::last(ard_args$variables) %in% x$table_body$variable) {
+      x$table_body <- x$table_body |> dplyr::filter(!variable %in% outer_cols)
+      x_ard_filter <- x_ard_filter |> dplyr::filter(!variable %in% outer_cols)
+    } else {
+      for (v in rev(outer_cols)) {
+        empty_rows <- x$table_body |>
+          dplyr::filter(variable == dplyr::lead(variable) & variable == v) |>
+          dplyr::pull("idx_nofilter")
+        x$table_body <- x$table_body |> dplyr::filter(!idx_nofilter %in% empty_rows)
+        x_ard_filter <- x_ard_filter |> dplyr::filter(!idx_nofilter %in% empty_rows)
+      }
+    }
+  }
 
   # update x$table_body
-  x$table_body <- x$table_body[match(idx_filter, x$table_body$idx_nofilter), ] |> select(-"idx_nofilter")
+  x$table_body <- x$table_body |> select(-"idx_nofilter")
 
-  x
+  # update x$cards
+  x$cards$tbl_hierarchical <- x_ard_filter |> select(-"idx_nofilter")
 
-  # if (nrow(x$table_body) > 0) {
-  #     cli::cli_inform(
-  #       "For readability, all summary rows preceding at least one row that meets the filtering criteria are kept
-  #       regardless of whether they meet the filtering criteria themselves.",
-  #       .frequency = "once",
-  #       .frequency_id = "sum_rows_lt"
-  #     )
-  #   }
+  x
 }
diff --git a/R/tbl_sort.R b/R/tbl_sort.R
@@ -60,46 +60,7 @@ tbl_sort.tbl_hierarchical <- function(x, sort = "descending", ...) {
   x_ard <- x$cards$tbl_hierarchical
 
   # add dummy rows for variables not in include so their label rows are sorted correctly
-  not_incl <- setdiff(ard_args$variables, ard_args$include)
-  if (length(not_incl) > 0) {
-    cli::cli_inform(
-      "Not all hierarchy variables present in the table were included in the {.arg include} argument.
-      These variables ({not_incl}) do not have event rate data available so the total sum of the event rates
-      for this hierarchy section will be used instead. To use true event rates for all sections of the table,
-      set {.code include = everything()} when creating your table via {.fun tbl_hierarchical}."
-    )
-
-    for (v in not_incl) {
-      i <- length(ard_args$by) + which(ard_args$variables == v)
-      x_sum_rows <- x_ard |>
-        dplyr::group_by(across(all_of(cards::all_ard_group_n((length(ard_args$by) + 1):i)))) |>
-        dplyr::group_map(function(.df, .g) {
-          # get pseudo-summary row stat value for descending sort
-          if (sort == "descending") {
-            stat_nm <- setdiff(.df$stat_name, "N")[1]
-            sum <- .df |>
-              dplyr::filter(stat_name == !!stat_nm) |>
-              dplyr::summarize(s = sum(unlist(stat))) |>
-              dplyr::pull(s)
-          }
-          g_cur <- .g[[ncol(.g) - 1]]
-          if (!is.na(g_cur) && g_cur == v) {
-            # dummy summary row to add in
-            .df[1, ] |> mutate(
-              variable = g_cur,
-              variable_level = .g[[ncol(.g)]],
-              stat_name = if (sort == "descending") stat_nm else "no_stat",
-              stat = if (sort == "descending") list(sum) else list(0),
-              tmp = TRUE
-            )
-          } else {
-            NULL
-          }
-        }, .keep = TRUE)
-
-      x_ard <- x_ard |> dplyr::bind_rows(x_sum_rows)
-    }
-  }
+  x_ard <- x_ard |> .append_not_incl(ard_args, sort)
 
   # add indices to ARD
   x_ard <- x_ard |>
@@ -173,3 +134,49 @@ tbl_sort.tbl_hierarchical <- function(x, sort = "descending", ...) {
 
   x
 }
+
+.append_not_incl <- function(x, ard_args, sort = NULL) {
+  # add dummy rows for variables not in include so their label rows are sorted correctly
+  not_incl <- setdiff(ard_args$variables, ard_args$include)
+  if (length(not_incl) > 0) {
+    cli::cli_inform(
+      "Not all hierarchy variables present in the table were included in the {.arg include} argument.
+      These variables ({not_incl}) do not have event rate data available so the total sum of the event rates
+      for this hierarchy section will be used instead. To use true event rates for all sections of the table,
+      set {.code include = everything()} when creating your table via {.fun tbl_hierarchical}."
+    )
+
+    for (v in not_incl) {
+      i <- length(ard_args$by) + which(ard_args$variables == v)
+      x_sum_rows <- x |>
+        dplyr::group_by(across(all_of(cards::all_ard_group_n((length(ard_args$by) + 1):i)))) |>
+        dplyr::group_map(function(.df, .g) {
+          # get pseudo-summary row stat value for descending sort
+          if (!is.null(sort) && sort == "descending") {
+            stat_nm <- setdiff(.df$stat_name, "N")[1]
+            sum <- .df |>
+              dplyr::filter(stat_name == !!stat_nm) |>
+              dplyr::summarize(s = sum(unlist(stat))) |>
+              dplyr::pull(s)
+          }
+          g_cur <- .g[[ncol(.g) - 1]]
+          if (!is.na(g_cur) && g_cur == v) {
+            # dummy summary row to add in
+            .df[1, ] |> mutate(
+              variable = g_cur,
+              variable_level = .g[[ncol(.g)]],
+              stat_name = if (!is.null(sort) && sort == "descending") stat_nm else "no_stat",
+              stat = if (!is.null(sort) && sort == "descending") list(sum) else list(0),
+              tmp = TRUE
+            )
+          } else {
+            NULL
+          }
+        }, .keep = TRUE)
+
+      x <- x |> dplyr::bind_rows(x_sum_rows)
+    }
+  }
+
+  x
+}
diff --git a/man/tbl_filter.Rd b/man/tbl_filter.Rd
diff --git a/tests/testthat/_snaps/tbl_filter.md b/tests/testthat/_snaps/tbl_filter.md
@@ -1,3 +1,25 @@
+# tbl_filter.tbl_hierarchical() works
+
+    Code
+      as.data.frame(tbl)
+    Output
+         **Sex**  \n    **Race**  \n        **Reported Term for the Adverse Event** **Placebo**  \nN = 86 **Xanomeline High Dose**  \nN = 84 **Xanomeline Low Dose**  \nN = 84
+      1                                               Number of patients with event              26 (30%)                           42 (50%)                          40 (48%)
+      2                                                                           F              13 (25%)                           18 (45%)                          23 (46%)
+      3                                                   BLACK OR AFRICAN AMERICAN               3 (60%)                            4 (67%)                           3 (50%)
+      4                                                                       WHITE              10 (21%)                           14 (41%)                          20 (45%)
+      5                                                   APPLICATION SITE PRURITUS              2 (4.2%)                            8 (24%)                          10 (23%)
+      6                                                                    ERYTHEMA               6 (13%)                            6 (18%)                           8 (18%)
+      7                                                   APPLICATION SITE ERYTHEMA              2 (4.2%)                            5 (15%)                           5 (11%)
+      8                                                                           M              13 (39%)                           24 (55%)                          17 (50%)
+      9                                            AMERICAN INDIAN OR ALASKA NATIVE               0 (NA%)                           1 (100%)                           0 (NA%)
+      10                                                  BLACK OR AFRICAN AMERICAN               1 (33%)                            1 (33%)                           0 (NA%)
+      11                                                                      WHITE              12 (40%)                           22 (55%)                          17 (50%)
+      12                                                  APPLICATION SITE PRURITUS              1 (3.3%)                           12 (30%)                          10 (29%)
+      13                                                                  DIARRHOEA               6 (20%)                           3 (7.5%)                          2 (5.9%)
+      14                                                                   ERYTHEMA               3 (10%)                            5 (13%)                           6 (18%)
+      15                                                  APPLICATION SITE ERYTHEMA              1 (3.3%)                           10 (25%)                           7 (21%)
+
 # tbl_filter.tbl_hierarchical() error messaging works
 
     Code