From 972ab4918759c6f49fb54c4bb4a2f43d4d149b2d Mon Sep 17 00:00:00 2001
From: Kyle Barrett <barrettk@metrumrg.com>
Date: Thu, 2 Jan 2025 13:19:41 -0500
Subject: [PATCH 1/5] Fix nm_data(filter = TRUE)

 - Previously returned no rows if column is read in as all NA and a string detection filter was applied (e.g., C='C').

 - Instead of adding `C!='C'`, we now append an `is.na()` expression, which would yield `(C!='C' | is.na(C)`.

 - A simplified version of the filter (or key symbold for `@` and `#` filters), is stored as the name for internal traceability and testing purposes (i.e. so we dont have to add `is.na(col)` for all the tests).
---
 R/filter-nm-data.R                   | 43 ++++++++++++++++---
 man/add_na_filter.Rd                 | 20 +++++++++
 man/translate_nm_expr.Rd             |  2 +-
 tests/testthat/test-filter-nm-data.R | 64 ++++++++++++++++++++++++----
 4 files changed, 112 insertions(+), 17 deletions(-)
 create mode 100644 man/add_na_filter.Rd

diff --git a/R/filter-nm-data.R b/R/filter-nm-data.R
index 04c9081f..075823cf 100644
--- a/R/filter-nm-data.R
+++ b/R/filter-nm-data.R
@@ -169,7 +169,7 @@ invert_operator <- function(expr) {
 #'
 #' }
 #' @keywords internal
-#' @seealso [translate_nm_operator()], [invert_operator()]
+#' @seealso [translate_nm_operator()], [invert_operator()], [add_na_filter()]
 translate_nm_expr <- function(
     nm_expr,
     type = c("ignore", "accept"),
@@ -181,38 +181,67 @@ translate_nm_expr <- function(
   # Translate NM operators
   exprs <- translate_nm_operator(nm_expr)
 
-  r_exprs <- purrr::map_chr(exprs, function(expr){
+  # Compile list of all R filters.
+  # - A simplified version of the filter (no NA handling) or key symbol (#, @) is
+  #   stored as the name for traceability and testing purposes.
+  r_exprs_list <- purrr::map(exprs, function(expr){
     if(type == "ignore"){
       # `IGNORE=#`, `IGNORE=@`, `IGNORE=c1`, `IGNORE=(list)`
       if(expr == "#"){
         # IGNORE=# is the default.  That is, in the absence of IGNORE option, any
         # record whose first character is '#' is treated as a comment record.
-        paste0("!grepl('^#', ", data_cols[1], ")")
+        paste0("!grepl('^#', ", data_cols[1], ")") %>%
+          stats::setNames(expr)
       }else if(expr == "@"){
         # IGNORE=@ signifies that any data record having an alphabetic character
         # or `@` as its first non-blank character in column one should be ignored.
         #  - This permits a table file having header lines to be used as an
         #    NM-TRAN data set.
         #  - add extra `\\` for later parse()
-        paste0("!grepl('^\\\\s*[A-Za-z@]', ", data_cols[1], ")")
+        paste0("!grepl('^\\\\s*[A-Za-z@]', ", data_cols[1], ")") %>%
+          stats::setNames(expr)
       }else if(grepl('^[a-zA-Z]$', expr)){
         # This is for `IGNORE=C` columns. Meaning ignore rows if the _first_ column
         # contains 'C' (this form always points to the _first_ column)
         # - the above regex looks for characters of length>=1, and no symbols
-        paste0(data_cols[1], "!=", "'", expr, "'")
+        r_expr <- paste0(data_cols[1], "!=", "'", expr, "'")
+        add_na_filter(r_expr) %>% stats::setNames(r_expr)
       }else{
         # Invert list form expressions
-        invert_operator(expr)
+        r_expr <- invert_operator(expr)
+        add_na_filter(r_expr) %>% stats::setNames(r_expr)
       }
     }else{
       # ACCEPT option only supports `ACCEPT=(list)` form --> no formatting needed
-      expr
+      add_na_filter(expr) %>% stats::setNames(expr)
     }
   })
 
+  r_exprs <- purrr::list_c(r_exprs_list)
+
   return(r_exprs)
 }
 
+#' Adjust filters to retain NA values for relevant columns
+#'
+#' This function serves to add a  `| is.na(col)` filter to an `R` filter to
+#' avoid filtering out `NA` rows. It is *not* needed when using `grepl` for
+#' string detection (e.g., `IGNORE=#`, `IGNORE=@`).
+#'
+#' @param r_expr An `R` filter expression. e.g., `'BLQ!=1'`, `C!='C'`.
+#' @keywords internal
+#' @seealso [translate_nm_operator()], [invert_operator()]
+add_na_filter <- function(r_expr){
+  column <- stringr::str_extract(r_expr, "^[^!=><]+") %>%
+    stringr::str_trim()
+
+  if(!is.na(column)){
+    paste0("(", r_expr, " | is.na(", column, "))")
+  }else{
+    r_expr
+  }
+}
+
 #' Filter `NONMEM` input data based on `IGNORE` and `ACCEPT` record options
 #'
 #' @param .mod A `bbi_nonmem_model` object
diff --git a/man/add_na_filter.Rd b/man/add_na_filter.Rd
new file mode 100644
index 00000000..c93252e1
--- /dev/null
+++ b/man/add_na_filter.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/filter-nm-data.R
+\name{add_na_filter}
+\alias{add_na_filter}
+\title{Adjust filters to retain NA values for relevant columns}
+\usage{
+add_na_filter(r_expr)
+}
+\arguments{
+\item{r_expr}{An \code{R} filter expression. e.g., \code{'BLQ!=1'}, \code{C!='C'}.}
+}
+\description{
+This function serves to add a  \verb{| is.na(col)} filter to an \code{R} filter to
+avoid filtering out \code{NA} rows. It is \emph{not} needed when using \code{grepl} for
+string detection (e.g., \verb{IGNORE=#}, \verb{IGNORE=@}).
+}
+\seealso{
+\code{\link[=translate_nm_operator]{translate_nm_operator()}}, \code{\link[=invert_operator]{invert_operator()}}
+}
+\keyword{internal}
diff --git a/man/translate_nm_expr.Rd b/man/translate_nm_expr.Rd
index 250f070a..bacbc51c 100644
--- a/man/translate_nm_expr.Rd
+++ b/man/translate_nm_expr.Rd
@@ -43,6 +43,6 @@ translate_nm_expr("@", data_cols = data_cols)
 }
 }
 \seealso{
-\code{\link[=translate_nm_operator]{translate_nm_operator()}}, \code{\link[=invert_operator]{invert_operator()}}
+\code{\link[=translate_nm_operator]{translate_nm_operator()}}, \code{\link[=invert_operator]{invert_operator()}}, \code{\link[=add_na_filter]{add_na_filter()}}
 }
 \keyword{internal}
diff --git a/tests/testthat/test-filter-nm-data.R b/tests/testthat/test-filter-nm-data.R
index 8eae99b9..bb02cb50 100644
--- a/tests/testthat/test-filter-nm-data.R
+++ b/tests/testthat/test-filter-nm-data.R
@@ -17,12 +17,12 @@ test_that("translate_nm_expr() translates NONMEM filter expressions", {
   test_exprs <- c("SEX==1", "ID.EQ.2", "WT/=70", "AGE.NE.30", "A=1", "WT.GT.40", "B.LE.20")
 
   expect_equal(
-    translate_nm_expr(test_exprs, type = 'accept'),
+    translate_nm_expr(test_exprs, type = 'accept') %>% names(),
     c("SEX==1", "ID==2", "WT!=70", "AGE!=30", "A==1", "WT>40", "B<=20")
   )
 
   expect_equal(
-    translate_nm_expr(test_exprs, type = 'ignore'),
+    translate_nm_expr(test_exprs, type = 'ignore') %>% names(),
     c("SEX!=1", "ID!=2", "WT==70", "AGE==30", "A!=1", "WT<40", "B>=20")
   )
 
@@ -31,18 +31,18 @@ test_that("translate_nm_expr() translates NONMEM filter expressions", {
   # - only `data_cols[1]` is technically needed
   data_cols <- c("C", "ID", "TIME", "EVID", "DV", "BLQ")
   expect_equal(
-    translate_nm_expr("#", data_cols = data_cols),
+    translate_nm_expr("#", data_cols = data_cols) %>% unname(),
     paste0("!grepl('^#', ", data_cols[1], ")")
   )
 
   expect_equal(
-    translate_nm_expr("C", data_cols = data_cols),
+    translate_nm_expr("C", data_cols = data_cols) %>% names(),
     paste0(data_cols[1], "!='C'")
   )
 
   # Extra `\\` is added for escape purposes when the expression is later parsed
   expect_equal(
-    translate_nm_expr("@", data_cols = data_cols),
+    translate_nm_expr("@", data_cols = data_cols) %>% unname(),
     paste0("!grepl('^\\\\s*[A-Za-z@]', ", data_cols[1], ")")
   )
 
@@ -75,18 +75,26 @@ test_that("filter_nm_data() filters input data using IGNORE/ACCEPT options", {
   data_rec$values[[7]]$value <- "(ID.EQ.2, SEX=1, WT.LE.50)"
   nmrec::write_ctl(ctl, get_model_path(mod2))
 
-  # Check expected expressions
+  ## Check expected expressions ##
   input_data <- nm_data(mod2) %>% suppressMessages()
   nm_exprs <- get_data_filter_exprs(mod2)
   r_filters <- translate_nm_expr(
     nm_expr = nm_exprs$exprs, type = nm_exprs$type, data_cols = names(input_data)
   )
-  filter_expression <- paste(r_filters, collapse = " & ")
+
+  # Check filters and names
+  # - names: simplified version of filter (no NA handling) or key symbol (#, @)
+  #   stored as the name for traceability and testing purposes.
+  # - value: the actual filter expression supplied to the final filter
+  expect_equal(names(r_filters), c("@", "ID!=2", "SEX!=1", "WT>=50"))
   expect_equal(
-    filter_expression, "!grepl('^\\\\s*[A-Za-z@]', ID) & ID!=2 & SEX!=1 & WT>=50"
-  )
+    unname(r_filters),
+    c("!grepl('^\\\\s*[A-Za-z@]', ID)", "(ID!=2 | is.na(ID))",
+      "(SEX!=1 | is.na(SEX))", "(WT>=50 | is.na(WT))")
+    )
 
   # Check that filter expression works correctly
+  filter_expression <- paste(r_filters, collapse = " & ")
   filtered_data <- filter_nm_data(mod2)
   expect_equal(
     nrow(filtered_data),
@@ -159,3 +167,41 @@ test_that("filter_nm_data() errors if expressions cant be parsed", {
     "ignore/accept list could not be converted to filters"
   )
 })
+
+
+test_that("filter_nm_data() works when NA values are present", {
+  mod2 <- copy_model_from(MOD1, "2")
+  on.exit(delete_models(mod2, .force = TRUE, .tags = NULL))
+
+  # Add additional IGNORE expressions and compare to dplyr filters
+  ctl <- get_model_ctl(mod2)
+  data_rec <- nmrec::select_records(ctl, "data")[[1]]
+  data_rec$parse()
+
+  # Include a filter for a column with NA values
+  data_rec$values[[7]]$value <- "(C='C', BLQ=1)"
+  nmrec::write_ctl(ctl, get_model_path(mod2))
+
+  # Test 1
+  # - C column does *not* lead to loss of subjects due to NA values
+  # - 1 subject lost due to BLQ.
+  data <- nm_data(mod2)
+  data_test1 <- data %>% dplyr::mutate(C = NA, BLQ = 0)
+  data_test1$BLQ[nrow(data_test1)] <- 1
+
+  data_f1 <- filter_nm_data(mod2, data = data_test1)
+  expect_equal(nrow(data) - 1, nrow(data_f1))
+
+  # Test 2
+  # - filter works appropriately when found strings and NA values are present
+  # - 1 subject lost due to BLQ. 1 subject lost due to 'C' filter
+  data_test2 <- data_test1
+  data_test2$C[1] <- "C"
+  data_test2$C[2] <- "."
+  data_test2$BLQ[3] <- NA
+
+  data_f2 <- filter_nm_data(mod2, data = data_test2)
+  expect_equal(nrow(data) - 2, nrow(data_f2))
+  expect_true(is.na(data_f2$BLQ[2]))
+  expect_equal(data_f2$C[1], ".")
+})

From 7f089470cf682fd157adb377f9b9d4ff52e78a19 Mon Sep 17 00:00:00 2001
From: Kyle Barrett <barrettk@metrumrg.com>
Date: Thu, 2 Jan 2025 14:05:43 -0500
Subject: [PATCH 2/5] add more checks to avoid malformed filter expressions

 - unlikely scenario since NONMEM doesnt support these types of filters, but better to have an informative error here than to create a bad filter expression
---
 R/filter-nm-data.R                   | 22 +++++++++++++++++++++-
 tests/testthat/test-filter-nm-data.R | 14 ++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/R/filter-nm-data.R b/R/filter-nm-data.R
index 075823cf..08c2c963 100644
--- a/R/filter-nm-data.R
+++ b/R/filter-nm-data.R
@@ -212,7 +212,27 @@ translate_nm_expr <- function(
         add_na_filter(r_expr) %>% stats::setNames(r_expr)
       }
     }else{
-      # ACCEPT option only supports `ACCEPT=(list)` form --> no formatting needed
+      # ACCEPT option only supports `ACCEPT=(list)` form
+      # - no formatting needed, but check for user error to avoid unintended or
+      #   malformed filter expressions
+      nm_docs_msg <- "See NONMEM documentation for more details"
+      if(expr == "#" || expr == "@"){
+        cli::cli_abort(
+          c(
+            "{.val #} and {.val @} are only supported for IGNORE list expressions",
+            "i" = nm_docs_msg
+          )
+        )
+      }
+      if(grepl('^[a-zA-Z]$', expr)){
+        cli::cli_abort(
+          c(
+            "ACCEPT option only supports `ACCEPT=(list)` form",
+            "i" = nm_docs_msg
+          )
+        )
+      }
+
       add_na_filter(expr) %>% stats::setNames(expr)
     }
   })
diff --git a/tests/testthat/test-filter-nm-data.R b/tests/testthat/test-filter-nm-data.R
index bb02cb50..75a3a68a 100644
--- a/tests/testthat/test-filter-nm-data.R
+++ b/tests/testthat/test-filter-nm-data.R
@@ -46,6 +46,20 @@ test_that("translate_nm_expr() translates NONMEM filter expressions", {
     paste0("!grepl('^\\\\s*[A-Za-z@]', ", data_cols[1], ")")
   )
 
+  # Using accept with these filters errors out
+  expect_error(
+    translate_nm_expr("#", type = "accept", data_cols = data_cols),
+    "only supported for IGNORE"
+  )
+  expect_error(
+    translate_nm_expr("@", type = "accept", data_cols = data_cols),
+    "only supported for IGNORE"
+  )
+  expect_error(
+    translate_nm_expr("C", type = "accept", data_cols = data_cols),
+    "ACCEPT option only supports"
+  )
+
   # Error out for unsupported logical operators
   test_exprs_bad <- c(test_exprs, "GEN=1 .AND. AGE > 60")
   expect_error(

From 8ae0f3599ca199b2ea5980c90799b9e1ae0e3dc8 Mon Sep 17 00:00:00 2001
From: kyleb <kyleb@metrumrg.com>
Date: Fri, 3 Jan 2025 17:21:22 -0500
Subject: [PATCH 3/5] update IGNORE=C to work like IGNORE=@

---
 R/filter-nm-data.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/filter-nm-data.R b/R/filter-nm-data.R
index 08c2c963..abf6f502 100644
--- a/R/filter-nm-data.R
+++ b/R/filter-nm-data.R
@@ -203,9 +203,9 @@ translate_nm_expr <- function(
       }else if(grepl('^[a-zA-Z]$', expr)){
         # This is for `IGNORE=C` columns. Meaning ignore rows if the _first_ column
         # contains 'C' (this form always points to the _first_ column)
-        # - the above regex looks for characters of length>=1, and no symbols
-        r_expr <- paste0(data_cols[1], "!=", "'", expr, "'")
-        add_na_filter(r_expr) %>% stats::setNames(r_expr)
+        # - the above regex looks for characters of length=1, and no symbols
+        paste0("!grepl('^", expr, "', ", data_cols[1], ")") %>%
+          stats::setNames(expr)
       }else{
         # Invert list form expressions
         r_expr <- invert_operator(expr)

From 048cf3947ec477aefa638a6f77d5687fb0c08261 Mon Sep 17 00:00:00 2001
From: Kyle Barrett <barrettk@metrumrg.com>
Date: Tue, 7 Jan 2025 15:51:02 -0500
Subject: [PATCH 4/5] fix test and add regression test

---
 tests/testthat/test-filter-nm-data.R | 31 ++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/tests/testthat/test-filter-nm-data.R b/tests/testthat/test-filter-nm-data.R
index 75a3a68a..b2811a4d 100644
--- a/tests/testthat/test-filter-nm-data.R
+++ b/tests/testthat/test-filter-nm-data.R
@@ -36,8 +36,8 @@ test_that("translate_nm_expr() translates NONMEM filter expressions", {
   )
 
   expect_equal(
-    translate_nm_expr("C", data_cols = data_cols) %>% names(),
-    paste0(data_cols[1], "!='C'")
+    translate_nm_expr("C", data_cols = data_cols) %>% unname(),
+    paste0("!grepl('^", data_cols[1], "', ", data_cols[1], ")")
   )
 
   # Extra `\\` is added for escape purposes when the expression is later parsed
@@ -183,6 +183,33 @@ test_that("filter_nm_data() errors if expressions cant be parsed", {
 })
 
 
+test_that("filter_nm_data() works for IGNORE=C filters", {
+  mod2 <- copy_model_from(MOD1, "2")
+  on.exit(delete_models(mod2, .force = TRUE, .tags = NULL))
+
+  # Add additional IGNORE expressions and compare to dplyr filters
+  ctl <- get_model_ctl(mod2)
+  data_rec <- nmrec::select_records(ctl, "data")[[1]]
+  data_rec$parse()
+
+  # Include a filter for a column with NA values
+  data_rec$values[[7]]$value <- "='C'"
+  nmrec::write_ctl(ctl, get_model_path(mod2))
+
+  # Test
+  # - 2 subjects lost due to 'C' filter
+  data <- nm_data(mod2)
+  data_test <- data %>% dplyr::mutate(C = NA) %>%
+    # 'C' must be the first column for this test
+    dplyr::relocate("C")
+  data_test$C[1] <- "C"
+  data_test$C[2] <- "Comment"
+  data_test$C[3] <- "."
+
+  data_f <- filter_nm_data(mod2, data = data_test)
+  expect_equal(nrow(data) - 2, nrow(data_f))
+})
+
 test_that("filter_nm_data() works when NA values are present", {
   mod2 <- copy_model_from(MOD1, "2")
   on.exit(delete_models(mod2, .force = TRUE, .tags = NULL))

From 3be0ccc1968faf2f6c185fce05765e6f6026a7ad Mon Sep 17 00:00:00 2001
From: Kyle Barrett <barrettk@metrumrg.com>
Date: Thu, 9 Jan 2025 13:43:33 -0500
Subject: [PATCH 5/5] Use expect_equivalent instead of unnaming the vector

---
 tests/testthat/test-filter-nm-data.R | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/tests/testthat/test-filter-nm-data.R b/tests/testthat/test-filter-nm-data.R
index b2811a4d..ef43d16d 100644
--- a/tests/testthat/test-filter-nm-data.R
+++ b/tests/testthat/test-filter-nm-data.R
@@ -30,20 +30,20 @@ test_that("translate_nm_expr() translates NONMEM filter expressions", {
   # Use of `@`, `#`, or form `IGNORE=C2` require `data_cols` to be specified
   # - only `data_cols[1]` is technically needed
   data_cols <- c("C", "ID", "TIME", "EVID", "DV", "BLQ")
-  expect_equal(
-    translate_nm_expr("#", data_cols = data_cols) %>% unname(),
-    paste0("!grepl('^#', ", data_cols[1], ")")
+  expect_equivalent(
+    translate_nm_expr("#", data_cols = data_cols),
+    "!grepl('^#', C)"
   )
 
-  expect_equal(
-    translate_nm_expr("C", data_cols = data_cols) %>% unname(),
-    paste0("!grepl('^", data_cols[1], "', ", data_cols[1], ")")
+  expect_equivalent(
+    translate_nm_expr("C", data_cols = data_cols),
+    "!grepl('^C', C)"
   )
 
   # Extra `\\` is added for escape purposes when the expression is later parsed
-  expect_equal(
-    translate_nm_expr("@", data_cols = data_cols) %>% unname(),
-    paste0("!grepl('^\\\\s*[A-Za-z@]', ", data_cols[1], ")")
+  expect_equivalent(
+    translate_nm_expr("@", data_cols = data_cols),
+    "!grepl('^\\\\s*[A-Za-z@]', C)"
   )
 
   # Using accept with these filters errors out
@@ -193,7 +193,7 @@ test_that("filter_nm_data() works for IGNORE=C filters", {
   data_rec$parse()
 
   # Include a filter for a column with NA values
-  data_rec$values[[7]]$value <- "='C'"
+  data_rec$values[[7]]$value <- "=C"
   nmrec::write_ctl(ctl, get_model_path(mod2))
 
   # Test