Subset parameter on datastep #171.

dbosak01 · Nov 5, 2024 · a610957 · a610957
1 parent 9ff0f69
commit a610957
Show file tree

Hide file tree

Showing 7 changed files with 83 additions and 4 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,6 @@
 # libr 1.3.5
 * Added 'parquet' file format to `libname()` function.
+* Added 'subset' parameter to `datastep()` function to filter the data on input.
 
 # libr 1.3.4
 * Fixed issue where `libname()` was failing on empty dataset.

diff --git a/R/datastep.R b/R/datastep.R
@@ -100,6 +100,10 @@ e$output <- list()
 #' 
 #' \code{calculate} and \code{retain} are both input parameters.
 #' 
+#' The \code{subset} and \code{where} parameters can both be used to filter
+#' the datastep data.  The difference is that \code{subset} is an input
+#' parameter, and \code{where} is an output parameter.  
+#' 
 #' @section Set and Merge Operations:
 #' The \code{datastep} function allows you to join one or more input datasets 
 #' into a single output dataset.  There are two operations in this regard:
@@ -267,6 +271,11 @@ e$output <- list()
 #' \code{delete} function, or \code{output} function to filter desired results.
 #' @param log Whether or not to log the datastep.  Default is TRUE.  This 
 #' parameter is used internally.
+#' @param subset The \code{subset} parameter accepts an \code{expression} object
+#' that will be used to subset the data.  The \code{subset} expression will be 
+#' executed \strong{before} the datastep executes.  In this regard, the 
+#' \code{subset} parameter on the R datastep is similar to the \code{where} clause
+#' on the SAS datastep.
 #' @return The processed data frame, tibble, or data table.  
 #' @family datastep
 #' @seealso \code{\link{libname}} function to create a data library, and
@@ -536,7 +545,8 @@ datastep <- function(data, steps, keep = NULL,
                      merge = NULL,
                      merge_by = NULL,
                      merge_in = NULL, 
-                     log = TRUE) {
+                     log = TRUE, 
+                     subset = NULL) {
 
   if (!"data.frame" %in% class(data))
     stop("input data must be inherited from data.frame")
@@ -703,6 +713,24 @@ datastep <- function(data, steps, keep = NULL,
                           check.names = FALSE) 
   }
 
+  # Subset Before
+  if (!is.null(subset)) {
+
+    data <- tryCatch({subset(data, eval(subset))},
+                     error = function(cond){ret})
+
+    # Give warning if there are no rows and no output()
+    if (hout == FALSE & nrow(data) == 0) {
+      warning("After subset, input dataset has no rows.") 
+    }
+
+    rowcount <- nrow(data)
+
+    # Restore attributes from original data 
+    data <- copy_attributes(data_attributes, data)
+
+  }
+
   # Add automatic variables
   data <- add_autos(data, by, sort_check)
 

diff --git a/docs/news/index.html b/docs/news/index.html
diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml
@@ -10,7 +10,7 @@ articles:
   libr-faq: libr-faq.html
   libr-management: libr-management.html
   libr: libr.html
-last_built: 2024-11-02T03:46Z
+last_built: 2024-11-05T01:57Z
 urls:
   reference: https://libr.r-sassy.org/reference
   article: https://libr.r-sassy.org/articles
diff --git a/docs/reference/datastep.html b/docs/reference/datastep.html
diff --git a/man/datastep.Rd b/man/datastep.Rd
diff --git a/tests/testthat/test-datastep.R b/tests/testthat/test-datastep.R
@@ -1807,3 +1807,29 @@ test_that("ds51: delete() works with NA in data frame.", {
 
 })
 
+
+test_that("ds52: subset clause works.", {
+
+  df <- datastep(mtcars,
+                 subset = expression(cyl == 8),
+                 {
+
+                   if (mpg >= 20)
+                     mpgcat <- "High"
+                   else
+                     mpgcat <- "Low"
+
+                   recdt <- as.Date("1974-06-10")
+
+                   if (cyl == 8)
+                     is8cyl <- TRUE
+                   else
+                     is8cyl <- FALSE
+
+                 })
+
+  df
+
+  expect_equal(mean(df$cyl), 8)
+
+})