diff --git a/NEWS.md b/NEWS.md index 8d7633a..2920411 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,6 @@ # libr 1.3.5 * Added 'parquet' file format to `libname()` function. +* Added 'subset' parameter to `datastep()` function to filter the data on input. # libr 1.3.4 * Fixed issue where `libname()` was failing on empty dataset. diff --git a/R/datastep.R b/R/datastep.R index 2a6d6f6..e01e4d8 100644 --- a/R/datastep.R +++ b/R/datastep.R @@ -100,6 +100,10 @@ e$output <- list() #' #' \code{calculate} and \code{retain} are both input parameters. #' +#' The \code{subset} and \code{where} parameters can both be used to filter +#' the datastep data. The difference is that \code{subset} is an input +#' parameter, and \code{where} is an output parameter. +#' #' @section Set and Merge Operations: #' The \code{datastep} function allows you to join one or more input datasets #' into a single output dataset. There are two operations in this regard: @@ -267,6 +271,11 @@ e$output <- list() #' \code{delete} function, or \code{output} function to filter desired results. #' @param log Whether or not to log the datastep. Default is TRUE. This #' parameter is used internally. +#' @param subset The \code{subset} parameter accepts an \code{expression} object +#' that will be used to subset the data. The \code{subset} expression will be +#' executed \strong{before} the datastep executes. In this regard, the +#' \code{subset} parameter on the R datastep is similar to the \code{where} clause +#' on the SAS datastep. #' @return The processed data frame, tibble, or data table. 
#' @family datastep #' @seealso \code{\link{libname}} function to create a data library, and @@ -536,7 +545,8 @@ datastep <- function(data, steps, keep = NULL, merge = NULL, merge_by = NULL, merge_in = NULL, - log = TRUE) { + log = TRUE, + subset = NULL) { if (!"data.frame" %in% class(data)) stop("input data must be inherited from data.frame") @@ -703,6 +713,24 @@ datastep <- function(data, steps, keep = NULL, check.names = FALSE) } + # Subset Before + if (!is.null(subset)) { + + data <- tryCatch({subset(data, eval(subset))}, + error = function(cond){ret}) + + # Give warning if there are no rows and no output() + if (hout == FALSE & nrow(data) == 0) { + warning("After subset, input dataset has no rows.") + } + + rowcount <- nrow(data) + + # Restore attributes from original data + data <- copy_attributes(data_attributes, data) + + } + # Add automatic variables data <- add_autos(data, by, sort_check) diff --git a/docs/news/index.html b/docs/news/index.html index 8a8490b..73206fe 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -84,6 +84,7 @@

Changelog

diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index 5ce5a55..df2ff35 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -10,7 +10,7 @@ articles: libr-faq: libr-faq.html libr-management: libr-management.html libr: libr.html -last_built: 2024-11-02T03:46Z +last_built: 2024-11-05T01:57Z urls: reference: https://libr.r-sassy.org/reference article: https://libr.r-sassy.org/articles diff --git a/docs/reference/datastep.html b/docs/reference/datastep.html index fc95152..5674fab 100644 --- a/docs/reference/datastep.html +++ b/docs/reference/datastep.html @@ -114,7 +114,8 @@

Step through data row-by-row

merge = NULL, merge_by = NULL, merge_in = NULL, - log = TRUE + log = TRUE, + subset = NULL )
@@ -269,6 +270,14 @@

Arguments

Whether or not to log the datastep. Default is TRUE. This parameter is used internally.

+ +
subset
+

The subset parameter accepts an expression object +that will be used to subset the data. The subset expression will be +executed before the datastep executes. In this regard, the +subset parameter on the R datastep is similar to the where clause +on the SAS datastep.

+

Value

@@ -360,6 +369,9 @@

Optional Parameters

value of the prior step/row. This functionality allows you to increment values or perform cumulative operations.

calculate and retain are both input parameters.

+

The subset and where parameters can both be used to filter +the datastep data. The difference is that subset is an input +parameter, applied before the datastep executes, and where is an output parameter.

Set and Merge Operations

diff --git a/man/datastep.Rd b/man/datastep.Rd index 2796b1a..f95ec79 100644 --- a/man/datastep.Rd +++ b/man/datastep.Rd @@ -23,7 +23,8 @@ datastep( merge = NULL, merge_by = NULL, merge_in = NULL, - log = TRUE + log = TRUE, + subset = NULL ) } \arguments{ @@ -136,6 +137,12 @@ came from the corresponding table. Use the \code{where} parameter, \item{log}{Whether or not to log the datastep. Default is TRUE. This parameter is used internally.} + +\item{subset}{The \code{subset} parameter accepts an \code{expression} object +that will be used to subset the data. The \code{subset} expression will be +executed \strong{before} the datastep executes. In this regard, the +\code{subset} parameter on the R datastep is similar to the \code{where} clause +on the SAS datastep.} } \value{ The processed data frame, tibble, or data table. @@ -240,6 +247,10 @@ value of the prior step/row. This functionality allows you to increment values or perform cumulative operations. \code{calculate} and \code{retain} are both input parameters. + +The \code{subset} and \code{where} parameters can both be used to filter +the datastep data. The difference is that \code{subset} is an input +parameter, and \code{where} is an output parameter. } \section{Set and Merge Operations}{ diff --git a/tests/testthat/test-datastep.R b/tests/testthat/test-datastep.R index 733204e..ed5c0e6 100644 --- a/tests/testthat/test-datastep.R +++ b/tests/testthat/test-datastep.R @@ -1807,3 +1807,29 @@ test_that("ds51: delete() works with NA in data frame.", { }) + +test_that("ds52: subset clause works.", { + + df <- datastep(mtcars, + subset = expression(cyl == 8), + { + + if (mpg >= 20) + mpgcat <- "High" + else + mpgcat <- "Low" + + recdt <- as.Date("1974-06-10") + + if (cyl == 8) + is8cyl <- TRUE + else + is8cyl <- FALSE + + }) + + df + + expect_equal(mean(df$cyl), 8) + +})