From 55dc5d40936968bad0cec5cd39d6073cda901be6 Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Sat, 27 Aug 2022 17:11:10 +0200 Subject: [PATCH 01/11] research for catalog files --- src/readsas.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/readsas.cpp b/src/readsas.cpp index 38f624b..4ba9b08 100644 --- a/src/readsas.cpp +++ b/src/readsas.cpp @@ -520,6 +520,7 @@ Rcpp::List readsas(const char * filePath, if (( PAGE_TYPE == 16384 || // PAGE_META_TYPE_2 + PAGE_TYPE == 4096 || // CATALOG PAGE_TYPE == 1024 || // PAGE_AMD_TYPE PAGE_TYPE == 640 || PAGE_TYPE == 512 || // PAGE_MIX_TYPE_2 PAGE_MIX_TYPE_1 PAGE_TYPE == 384 || PAGE_TYPE == 256 || // PAGE_DATA_TYPE_2 PAGE_DATA_TYPE @@ -528,6 +529,21 @@ Rcpp::List readsas(const char * filePath, { for (auto i = 0; i < SUBHEADER_COUNT; ++i) { + + if (PAGE_TYPE == 4096) { + + size_t txt_pos = unk16 + alignval; + sas.seekg(txt_pos, sas.cur); + + std::string sysresr_bitmap; + sysresr_bitmap.resize(16, '\0'); + sysresr_bitmap = readstring(sysresr_bitmap, sas); + Rcout << sysresr_bitmap << std::endl; + + stop("stop"); + + } + if (u64 == 4) { potabs[i].SH_OFF = readbin(potabs[i].SH_OFF, sas, swapit); // 8 From c4b05d8e94b6b15c9eebde0f8cc6dab3930bdcca Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Thu, 1 Sep 2022 20:43:02 +0200 Subject: [PATCH 02/11] improve select.rows. It is now possible to select rows by integer vector positions or even zero rows. This allows to mimic the where behavior of sas. Read a few columns, find the cases you want to keep and read only these positions. --- R/readsas.R | 35 +++++++++--------------- man/read.sas.Rd | 5 ++-- src/readsas.cpp | 54 ++++++++++++++++++++++---------------- src/sas.h | 5 ++++ tests/testthat/test_read.R | 26 +++++++++++++++--- 5 files changed, 74 insertions(+), 51 deletions(-) diff --git a/R/readsas.R b/R/readsas.R index dfd643b..fee6c08 100644 --- a/R/readsas.R +++ b/R/readsas.R @@ -6,9 +6,8 @@ #'@param debug print debug information #'@param convert_dates default is TRUE #'@param recode default is TRUE -#'@param select.rows \emph{integer.} Vector of one or two numbers. If single -#' value rows from 1:val are selected. If two values of a range are selected -#' the rows in range will be selected. +#'@param select.rows \emph{integer.} Vector of rows to import. Minimum 0. Rows +#' imported are sorted. #'@param select.cols \emph{character:} Vector of variables to select. #'@param remove_deleted logical if deleted rows should be removed from data #'@param rownames first column will be used as rowname and removed from data @@ -43,25 +42,9 @@ read.sas <- function(file, debug = FALSE, convert_dates = TRUE, recode = TRUE, if (!is.numeric(select.rows)) { return(message("select.rows must be of type numeric")) } else { - # guard against negative values - if (any(select.rows < 0)) - select.rows <- abs(select.rows) + if (any(select.rows < 0)) stop("must select at least a single row") - # check that length is not > 2 - if (length(select.rows) > 2) - return(message("select.rows must be of length 1 or 2.")) - - # if length 1 start at row 1 - if (length(select.rows) == 1) - select.rows <- c(1, select.rows) - - # reorder if 2 is bigger than 1 - if (select.rows[2] < select.rows[1]) - select.rows <- c(select.rows[2], select.rows[1]) - - # make sure to start at index position 1 if select.rows[2] > 0 - if (select.rows[2] > 0 && select.rows[1] == 0) - select.rows[1] <- 1 + select.rows <- sort(select.rows - 1) } } @@ -72,16 +55,22 @@ read.sas <- function(file, debug = FALSE, convert_dates = TRUE, recode = TRUE, data <- readsas(filepath, debug, select.rows, select.cols) - # rowcount <- attr(data, "rowcount") # if rownames, remove the rowname variable from attributes cvec <- ifelse(rownames, -1, substitute()) + # rownames start at 0 + row_names <- attr(data, "rvec") + 1 + + # only if a row was returned + if (nrow(data)) row.names(data) <- row_names + if (rownames) { rownames(data) <- data[[1]] data[[1]] <- NULL } + encoding <- attr(data, "encoding") <- attr(data, "encoding") ## shorten attributes and reassign @@ -174,9 +163,9 @@ read.sas <- function(file, debug = FALSE, convert_dates = TRUE, recode = TRUE, if (remove_deleted) { + sel <- row_names del_rows <- attr(data, "deleted_rows") - sel <- attr(data, "rvec") val <- attr(data, "valid")[sel] del <- attr(data, "deleted")[sel] attr(data, "deleted") <- NULL diff --git a/man/read.sas.Rd b/man/read.sas.Rd index bfe4d66..742aadc 100644 --- a/man/read.sas.Rd +++ b/man/read.sas.Rd @@ -24,9 +24,8 @@ read.sas( \item{recode}{default is TRUE} -\item{select.rows}{\emph{integer.} Vector of one or two numbers. If single -value rows from 1:val are selected. If two values of a range are selected -the rows in range will be selected.} +\item{select.rows}{\emph{integer.} Vector of rows to import. Minimum 0. Rows +imported are sorted.} \item{select.cols}{\emph{character:} Vector of variables to select.} diff --git a/src/readsas.cpp b/src/readsas.cpp index 4ba9b08..94093a5 100644 --- a/src/readsas.cpp +++ b/src/readsas.cpp @@ -433,9 +433,10 @@ Rcpp::List readsas(const char * filePath, uint8_t alignval = 8; if (u64 != 4) alignval = 4; - uint64_t rowlength = 0, n = 0, delobs = 0; + + uint64_t rowlength = 0, delobs = 0; int64_t colf_p1 = 0, colf_p2 = 0; - int64_t k = 0; + int64_t n = 0, k = 0; std::vector stringvec(pagecount) ; auto totalrows = 0; @@ -1840,18 +1841,26 @@ Rcpp::List readsas(const char * filePath, // --- begin select rows or cols ----------------------------------- // - uint64_t nmin = 0, nmax = 0; + int64_t nmin = 0, nmax = 0; uint64_t nn = 0; // if selectrows is c(0,0) use full data + IntegerVector rvec; if (selectrows_.isNull()) { - nmin = 1; - nmax = n; + nmin = 0; + nmax = n -1; + // sequences of column and row + rvec = seq(nmin, nmax); } else { IntegerVector selectrows(selectrows_); - if (selectrows.size() != 2) stop("selected rows must be vector of two."); - nmin = selectrows(0); - nmax = selectrows(1); + + // all rows must be available + if (any_keepr(selectrows, n)) + Rcpp::warning("row > %d selected. Reducing select.rows", n); + + rvec = selectrows[selectrows < n]; + nmin = min(rvec); + nmax = max(rvec); } // make sure that n is not greater than nmax or nmin @@ -1860,9 +1869,6 @@ Rcpp::List readsas(const char * filePath, if (n < nmin) nmin = n; - - // sequences of column and row - IntegerVector rvec = seq(nmin, nmax); // otherwise if n == 0 nn would be 1 if (nmax > 0) nn = rvec.size(); @@ -1984,7 +1990,6 @@ Rcpp::List readsas(const char * filePath, std::vector valid(n); bool firstpage = 0; - bool keepr = 0; // sas provides two modes, compressed and uncompressed data. compressed // data has to be uncompressed and consists of always single rows. un- @@ -2002,24 +2007,27 @@ Rcpp::List readsas(const char * filePath, auto i = -1; // counter output data frame uint64_t ii = 0; // row on the selected page - for (uint64_t iii = 0; iii < n; ++iii) { + for (int64_t iii = 0; iii < n; ++iii) { + /* nmin is not a c vector starting at 0. i is initialized at -1 so will * be 0 once its bigger than nmin. This allows to import only the * selected rows. Once nmax is reached, import will stop. */ - if (iii >= (nmin-1) ) { - keepr = 1; + + bool keepr = false; + if (any_keepr(rvec, iii)) { + keepr = true; ++i; } + if (iii > nmax) break; + // Rcout << "---------------------" << std::endl; // Rcout << iii << " " << nmin << std::endl; if (debug && i == 0) Rcout << "row i / ii / iii / keepr: " << i << " " << ii << " " << iii <<" " << keepr << std::endl; - if (iii >= nmax) break; - if (pagecount > 0) { while (totalrowsvec[page] == 0) { @@ -2215,7 +2223,7 @@ Rcpp::List readsas(const char * filePath, sas.seekg(0, std::ios_base::beg); auto i = -1; - for (uint64_t iii = 0; iii < n; ++iii) { + for (int64_t iii = 0; iii < n; ++iii) { if (debug && i == 0) Rcout << "row: " << i << " --------------------" <= (nmin-1)) { - keepr = 1; + + bool keepr = false; + if (any_keepr(rvec, iii)) { + keepr = true; ++i; } - if (iii >= nmax) break; + if (iii > nmax) break; // for completeness valid[iii] = true; @@ -2360,7 +2370,7 @@ Rcpp::List readsas(const char * filePath, // 3. Create a data.frame if (nn > 0) - df.attr("row.names") = seq(1, nn); + df.attr("row.names") = rvec; if (varnames.size() == kk) df.attr("names") = varnames; diff --git a/src/sas.h b/src/sas.h index 19f0cfc..078ea8c 100644 --- a/src/sas.h +++ b/src/sas.h @@ -526,4 +526,9 @@ std::vector order_(std::vector v) { // } } +bool any_keepr(Rcpp::IntegerVector rvec, uint64_t idx) { + return std::find(rvec.begin(), rvec.end(), idx) != rvec.end(); +} + + #endif diff --git a/tests/testthat/test_read.R b/tests/testthat/test_read.R index 924ad47..4b85b34 100644 --- a/tests/testthat/test_read.R +++ b/tests/testthat/test_read.R @@ -53,7 +53,7 @@ test_that("read file with deleted rows", { ### select rows fl <- system.file("extdata", "mtcars.sas7bdat", package = "readsas") -dd <- read.sas(fl, select.rows = c(2,5), rownames = TRUE) +dd <- read.sas(fl, select.rows = c(2:5), rownames = TRUE) test_that("select.rows", { expect_true(all.equal(dd, mtcars[2:5,], check.attributes = FALSE)) @@ -70,16 +70,36 @@ test_that("select.rows", { }) -### select cols & rows +### select cols & rows - read rows 2 to 5 fl <- system.file("extdata", "mtcars.sas7bdat", package = "readsas") -dd <- read.sas(fl, select.cols = c("VAR1", "mpg", "hp"), select.rows = c(2,5), rownames = TRUE) +dd <- read.sas(fl, select.cols = c("VAR1", "mpg", "hp"), select.rows = c(2:5), rownames = TRUE) test_that("select.rows", { expect_true(all.equal(dd, mtcars[2:5,c("mpg", "hp")], check.attributes = FALSE)) }) +### select cols & rows pt 2 - read rows 2 and 5 +fl <- system.file("extdata", "mtcars.sas7bdat", package = "readsas") + +dd <- read.sas(fl, select.cols = c("VAR1", "mpg", "hp"), select.rows = c(2,5), rownames = TRUE) + +test_that("select.rows", { + expect_true(all.equal(dd, mtcars[c(2,5), c("mpg", "hp")], check.attributes = FALSE)) +}) + + +### select cols & rows pt 3 - read zero rows +fl <- system.file("extdata", "mtcars.sas7bdat", package = "readsas") + +dd <- read.sas(fl, select.cols = c("VAR1", "mpg", "hp"), select.rows = c(0), rownames = TRUE) + +test_that("select.rows", { + expect_true(all.equal(dd, mtcars[NULL, c("mpg", "hp")], check.attributes = FALSE)) +}) + + test_that("convert time and date", { exp <- structure(11033, class = "Date") From d5929d3f62fd47fbaa9653c0d384163162647281 Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Sat, 3 Sep 2022 12:03:28 +0200 Subject: [PATCH 03/11] cleanup --- R/readsas.R | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/R/readsas.R b/R/readsas.R index fee6c08..58d62fb 100644 --- a/R/readsas.R +++ b/R/readsas.R @@ -42,7 +42,7 @@ read.sas <- function(file, debug = FALSE, convert_dates = TRUE, recode = TRUE, if (!is.numeric(select.rows)) { return(message("select.rows must be of type numeric")) } else { - if (any(select.rows < 0)) stop("must select at least a single row") + if (any(select.rows < 0)) stop("select.rows must be >= 0") select.rows <- sort(select.rows - 1) } @@ -55,8 +55,6 @@ read.sas <- function(file, debug = FALSE, convert_dates = TRUE, recode = TRUE, data <- readsas(filepath, debug, select.rows, select.cols) - # rowcount <- attr(data, "rowcount") - # if rownames, remove the rowname variable from attributes cvec <- ifelse(rownames, -1, substitute()) # rownames start at 0 From 6f83ec87abf38eb73bb8b801a4928d5bb0a0ad4a Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Sat, 3 Sep 2022 12:03:46 +0200 Subject: [PATCH 04/11] extend description and provide example --- R/readsas.R | 43 ++++++++++++++++++++++++++++++------------- man/read.sas.Rd | 18 +++++++++++++++++- 2 files changed, 47 insertions(+), 14 deletions(-) diff --git a/R/readsas.R b/R/readsas.R index 58d62fb..c31988f 100644 --- a/R/readsas.R +++ b/R/readsas.R @@ -1,22 +1,39 @@ #' read.sas #' -#'@author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} +#' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} #' -#'@param file file to read -#'@param debug print debug information -#'@param convert_dates default is TRUE -#'@param recode default is TRUE -#'@param select.rows \emph{integer.} Vector of rows to import. Minimum 0. Rows +#' @description `read.sas` is a general function for reading sas7bdat files. +#' It supports a variety of complex sas7bdat files, x86 and x64, big and small +#' endian, and both compression types. It has been tested with numeric and +#' character data, and provides helper functions for converting sas7bdat to R +#' types `Date` and `POSIXct`. Time variables are converted to characters. +#' Conversion to date variables is applied if a known date, datetime or time +#' format is found in the sas7bdat file. For user-defined formats, the package +#' provides functions to convert from sas7bdat to R. +#' Input files may contain deleted lines that are marked as deleted instead of +#' being removed from the input data. These are removed on import, if you still +#' need them look at `remove_deleted`. Formats, labels and additional file +#' information are available with `attributes()`. +#' +#' @param file file to read +#' @param debug print debug information +#' @param convert_dates default is TRUE +#' @param recode default is TRUE +#' @param select.rows \emph{integer.} Vector of rows to import. Minimum 0. Rows #' imported are sorted. -#'@param select.cols \emph{character:} Vector of variables to select. -#'@param remove_deleted logical if deleted rows should be removed from data -#'@param rownames first column will be used as rowname and removed from data +#' @param select.cols \emph{character:} Vector of variables to select. +#' @param remove_deleted logical if deleted rows should be removed from data +#' @param rownames first column will be used as rowname and removed from data +#' +#' @useDynLib readsas, .registration=TRUE +#' @importFrom utils download.file +#' @importFrom stringi stri_encode #' -#'@useDynLib readsas, .registration=TRUE -#'@importFrom utils download.file -#'@importFrom stringi stri_encode +#' @examples +#' fl <- system.file("extdata", "cars.sas7bdat", package = "readsas") +#' read.sas(fl) #' -#'@export +#' @export read.sas <- function(file, debug = FALSE, convert_dates = TRUE, recode = TRUE, select.rows = NULL, select.cols = NULL, remove_deleted = TRUE, rownames = FALSE) { diff --git a/man/read.sas.Rd b/man/read.sas.Rd index 742aadc..4f1874f 100644 --- a/man/read.sas.Rd +++ b/man/read.sas.Rd @@ -34,7 +34,23 @@ imported are sorted.} \item{rownames}{first column will be used as rowname and removed from data} } \description{ -read.sas +`read.sas` is a general function for reading sas7bdat files. +It supports a variety of complex sas7bdat files, x86 and x64, big and small +endian, and both compression types. It has been tested with numeric and +character data, and provides helper functions for converting sas7bdat to R +types `Date` and `POSIXct`. Time variables are converted to characters. +Conversion to date variables is applied if a known date, datetime or time +format is found in the sas7bdat file. For user-defined formats, the package +provides functions to convert from sas7bdat to R. +Input files may contain deleted lines that are marked as deleted instead of +being removed from the input data. These are removed on import, if you still +need them look at `remove_deleted`. Formats, labels and additional file +information are available with `attributes()`. +} +\examples{ +fl <- system.file("extdata", "cars.sas7bdat", package = "readsas") +read.sas(fl) + } \author{ Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} From fe90684ac8cb642ea0edab2d3b9ff9962097f16e Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Sat, 3 Sep 2022 12:17:43 +0200 Subject: [PATCH 05/11] update DESCRIPTION and man page --- DESCRIPTION | 10 ++++++---- R/readsas.R | 9 +++++---- man/read.sas.Rd | 17 +++++++++-------- 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 951f4db..bda2ae5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -11,11 +11,13 @@ Description: Package to read the SAS file format. License: GPL (>= 2) | file LICENSE URL: https://github.com/JanMarvin/readsas BugReports: https://github.com/JanMarvin/readsas/issues -Imports: Rcpp (>= 1.0.2), stringi +Depends: R (>= 3.4.0) +Imports: Rcpp, stringi LinkingTo: Rcpp -SystemRequirements: C++11 Suggests: testthat, foreign, datasets -ByteCompile: yes -RoxygenNote: 7.2.1 Encoding: UTF-8 Language: en-US +RoxygenNote: 7.2.1 +Roxygen: list(markdown = TRUE) +Config/testthat/edition: 3 +Config/testthat/parallel: true diff --git a/R/readsas.R b/R/readsas.R index c31988f..9e46a4a 100644 --- a/R/readsas.R +++ b/R/readsas.R @@ -1,7 +1,5 @@ #' read.sas #' -#' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} -#' #' @description `read.sas` is a general function for reading sas7bdat files. #' It supports a variety of complex sas7bdat files, x86 and x64, big and small #' endian, and both compression types. It has been tested with numeric and @@ -10,6 +8,7 @@ #' Conversion to date variables is applied if a known date, datetime or time #' format is found in the sas7bdat file. For user-defined formats, the package #' provides functions to convert from sas7bdat to R. +#' #' Input files may contain deleted lines that are marked as deleted instead of #' being removed from the input data. These are removed on import, if you still #' need them look at `remove_deleted`. Formats, labels and additional file @@ -17,8 +16,8 @@ #' #' @param file file to read #' @param debug print debug information -#' @param convert_dates default is TRUE -#' @param recode default is TRUE +#' @param convert_dates default is `TRUE` +#' @param recode default is `TRUE` #' @param select.rows \emph{integer.} Vector of rows to import. Minimum 0. Rows #' imported are sorted. #' @param select.cols \emph{character:} Vector of variables to select. @@ -29,6 +28,8 @@ #' @importFrom utils download.file #' @importFrom stringi stri_encode #' +#' @seealso \link[foreign]{read.xport} +#' #' @examples #' fl <- system.file("extdata", "cars.sas7bdat", package = "readsas") #' read.sas(fl) diff --git a/man/read.sas.Rd b/man/read.sas.Rd index 4f1874f..69b5743 100644 --- a/man/read.sas.Rd +++ b/man/read.sas.Rd @@ -20,9 +20,9 @@ read.sas( \item{debug}{print debug information} -\item{convert_dates}{default is TRUE} +\item{convert_dates}{default is \code{TRUE}} -\item{recode}{default is TRUE} +\item{recode}{default is \code{TRUE}} \item{select.rows}{\emph{integer.} Vector of rows to import. Minimum 0. Rows imported are sorted.} @@ -34,24 +34,25 @@ imported are sorted.} \item{rownames}{first column will be used as rowname and removed from data} } \description{ -`read.sas` is a general function for reading sas7bdat files. +\code{read.sas} is a general function for reading sas7bdat files. It supports a variety of complex sas7bdat files, x86 and x64, big and small endian, and both compression types. It has been tested with numeric and character data, and provides helper functions for converting sas7bdat to R -types `Date` and `POSIXct`. Time variables are converted to characters. +types \code{Date} and \code{POSIXct}. Time variables are converted to characters. Conversion to date variables is applied if a known date, datetime or time format is found in the sas7bdat file. For user-defined formats, the package provides functions to convert from sas7bdat to R. + Input files may contain deleted lines that are marked as deleted instead of being removed from the input data. These are removed on import, if you still -need them look at `remove_deleted`. Formats, labels and additional file -information are available with `attributes()`. +need them look at \code{remove_deleted}. Formats, labels and additional file +information are available with \code{attributes()}. } \examples{ fl <- system.file("extdata", "cars.sas7bdat", package = "readsas") read.sas(fl) } -\author{ -Jan Marvin Garbuszus \email{jan.garbuszus@ruhr-uni-bochum.de} +\seealso{ +\link[foreign]{read.xport} } From ca79f73b37cd614242df3451cac109391a4bfdc2 Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Sat, 3 Sep 2022 12:30:25 +0200 Subject: [PATCH 06/11] avoid range error if file has no rows --- src/readsas.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/readsas.cpp b/src/readsas.cpp index 5572f75..544ea82 100644 --- a/src/readsas.cpp +++ b/src/readsas.cpp @@ -1852,7 +1852,7 @@ Rcpp::List readsas(const char * filePath, nmin = 0; nmax = n -1; // sequences of column and row - rvec = seq(nmin, nmax); + if (nmax > nmin) rvec = seq(nmin, nmax); } else { IntegerVector selectrows(selectrows_); From 3bbb802542e7fccabc0cd4d98ba8d0bd63ed1218 Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Sat, 3 Sep 2022 13:08:35 +0200 Subject: [PATCH 07/11] Handle some corner cases with select.rows --- R/readsas.R | 2 +- man/read.sas.Rd | 2 +- src/readsas.cpp | 15 ++++++++++----- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/R/readsas.R b/R/readsas.R index 07d67f3..3bf7c36 100644 --- a/R/readsas.R +++ b/R/readsas.R @@ -19,7 +19,7 @@ #' @param convert_dates default is `TRUE` #' @param recode default is `TRUE` #' @param select.rows \emph{integer.} Vector of rows to import. Minimum 0. Rows -#' imported are sorted. +#' imported are sorted. If 0 is in `select.rows`, zero rows are returned. #' @param select.cols \emph{character:} Vector of variables to select. #' @param remove_deleted logical if deleted rows should be removed from data #' @param rownames first column will be used as rowname and removed from data diff --git a/man/read.sas.Rd b/man/read.sas.Rd index 14a132a..327f469 100644 --- a/man/read.sas.Rd +++ b/man/read.sas.Rd @@ -26,7 +26,7 @@ read.sas( \item{recode}{default is \code{TRUE}} \item{select.rows}{\emph{integer.} Vector of rows to import. Minimum 0. Rows -imported are sorted.} +imported are sorted. If 0 is in \code{select.rows}, zero rows are returned.} \item{select.cols}{\emph{character:} Vector of variables to select.} diff --git a/src/readsas.cpp b/src/readsas.cpp index 544ea82..3214193 100644 --- a/src/readsas.cpp +++ b/src/readsas.cpp @@ -1852,7 +1852,7 @@ Rcpp::List readsas(const char * filePath, nmin = 0; nmax = n -1; // sequences of column and row - if (nmax > nmin) rvec = seq(nmin, nmax); + if (nmax >= nmin) rvec = seq(nmin, nmax); } else { IntegerVector selectrows(selectrows_); @@ -1860,9 +1860,14 @@ Rcpp::List readsas(const char * filePath, if (any_keepr(selectrows, n)) Rcpp::warning("row > %d selected. Reducing select.rows", n); - rvec = selectrows[selectrows < n]; - nmin = min(rvec); - nmax = max(rvec); + if (!any_keepr(selectrows, -1)) { + rvec = selectrows[selectrows < n]; + nmin = min(rvec); + nmax = max(rvec); + } else { + nmin = -1; + nmax = -1; + } } // make sure that n is not greater than nmax or nmin @@ -1872,7 +1877,7 @@ Rcpp::List readsas(const char * filePath, nmin = n; // otherwise if n == 0 nn would be 1 - if (nmax > 0) nn = rvec.size(); + if (rvec.size() > 0) nn = rvec.size(); if (debug) Rcout << "reading n/nn/nmin/nmax: " << n << "/" << nn << "/" << nmin << "/" << nmax << std::endl; From f6fb7496a81d1e7f03a5111bef7778d3e598cf6f Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Sat, 3 Sep 2022 13:08:50 +0200 Subject: [PATCH 08/11] fix for testthat e3 ignore_attr --- tests/testthat/test_read.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/testthat/test_read.R b/tests/testthat/test_read.R index d51940c..8f8b26b 100644 --- a/tests/testthat/test_read.R +++ b/tests/testthat/test_read.R @@ -15,37 +15,37 @@ test_that("read file with deleted rows", { fl <- system.file("extdata", "test.sas7bdat", package = "readsas") exp <- data.frame(x = 1:3) got <- read.sas(fl) - expect_equal(exp, got, check.attributes = FALSE) + expect_equal(exp, got, ignore_attr = TRUE) # read file with deleted: delete x = 2 fl <- system.file("extdata", "test2.sas7bdat", package = "readsas") exp <- data.frame(x = 1:3)[c(1, 3), , drop = FALSE] got <- read.sas(fl) - expect_equal(exp, got, check.attributes = FALSE) + expect_equal(exp, got, ignore_attr = TRUE) # read file with deleted: delete x > 1 fl <- system.file("extdata", "test3.sas7bdat", package = "readsas") exp <- data.frame(x = 1:3)[c(1), , drop = FALSE] got <- read.sas(fl) - expect_equal(exp, got, check.attributes = FALSE) + expect_equal(exp, got, ignore_attr = TRUE) # read file with deleted: delete x = 1 fl <- system.file("extdata", "test4.sas7bdat", package = "readsas") exp <- data.frame(x = 1:3)[-c(1), , drop = FALSE] got <- read.sas(fl) - expect_equal(exp, got, check.attributes = FALSE) + expect_equal(exp, got, ignore_attr = TRUE) # read file with deleted: delete x < 3 fl <- system.file("extdata", "test5.sas7bdat", package = "readsas") exp <- data.frame(x = 1:3)[-c(1, 2), , drop = FALSE] got <- read.sas(fl) - expect_equal(exp, got, check.attributes = FALSE) + expect_equal(exp, got, ignore_attr = TRUE) # skip row fl <- system.file("extdata", "test2.sas7bdat", package = "readsas") exp <- data.frame(x = 1:3)[c(1), , drop = FALSE] got <- read.sas(fl, select.rows = c(1, 2)) - expect_equal(exp, got, check.attributes = FALSE) + expect_equal(exp, got, ignore_attr = TRUE) }) From e7d48ec7834af7c47531eea8a3ded7d5ee865a3f Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Sat, 3 Sep 2022 13:09:43 +0200 Subject: [PATCH 09/11] use tempfile for uncompress to allow parallel testing --- R/RcppExports.R | 5 +++-- R/readsas.R | 5 ++++- man/readsas.Rd | 4 +++- src/RcppExports.cpp | 9 +++++---- src/readsas.cpp | 6 ++++-- 5 files changed, 19 insertions(+), 10 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index d565295..484e266 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -8,9 +8,10 @@ #' @param selectrows_ integer vector of selected rows #' @param selectcols_ character vector of selected rows #' @param empty_to_na logical convert '' to NA_character_ +#' @param tempstr filepath used for temp output when uncompressing #' @import Rcpp #' @export -readsas <- function(filePath, debug, selectrows_, selectcols_, empty_to_na) { - .Call(`_readsas_readsas`, filePath, debug, selectrows_, selectcols_, empty_to_na) +readsas <- function(filePath, debug, selectrows_, selectcols_, empty_to_na, tempstr) { + .Call(`_readsas_readsas`, filePath, debug, selectrows_, selectcols_, empty_to_na, tempstr) } diff --git a/R/readsas.R b/R/readsas.R index 3bf7c36..2e4ac1c 100644 --- a/R/readsas.R +++ b/R/readsas.R @@ -73,7 +73,10 @@ read.sas <- function(file, debug = FALSE, convert_dates = TRUE, recode = TRUE, return(message("select.cols must be of type character")) } - data <- readsas(filepath, debug, select.rows, select.cols, empty_to_na) + tempstr <- tempfile() + on.exit(unlink(tempstr), add = TRUE) + data <- readsas(filepath, debug, select.rows, select.cols, empty_to_na, + tempstr) cvec <- ifelse(rownames, -1, substitute()) diff --git a/man/readsas.Rd b/man/readsas.Rd index 28cee13..a011980 100644 --- a/man/readsas.Rd +++ b/man/readsas.Rd @@ -4,7 +4,7 @@ \alias{readsas} \title{Reads SAS data files} \usage{ -readsas(filePath, debug, selectrows_, selectcols_, empty_to_na) +readsas(filePath, debug, selectrows_, selectcols_, empty_to_na, tempstr) } \arguments{ \item{filePath}{The full systempath to the sas7bdat file you want to import.} @@ -16,6 +16,8 @@ readsas(filePath, debug, selectrows_, selectcols_, empty_to_na) \item{selectcols_}{character vector of selected rows} \item{empty_to_na}{logical convert '' to NA_character_} + +\item{tempstr}{filepath used for temp output when uncompressing} } \description{ Reads SAS data files diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 35f0b4d..4a08cc3 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -11,8 +11,8 @@ Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif // readsas -Rcpp::List readsas(const char * filePath, const bool debug, Nullable selectrows_, Nullable selectcols_, const bool empty_to_na); -RcppExport SEXP _readsas_readsas(SEXP filePathSEXP, SEXP debugSEXP, SEXP selectrows_SEXP, SEXP selectcols_SEXP, SEXP empty_to_naSEXP) { +Rcpp::List readsas(const char * filePath, const bool debug, Nullable selectrows_, Nullable selectcols_, const bool empty_to_na, std::string tempstr); +RcppExport SEXP _readsas_readsas(SEXP filePathSEXP, SEXP debugSEXP, SEXP selectrows_SEXP, SEXP selectcols_SEXP, SEXP empty_to_naSEXP, SEXP tempstrSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -21,13 +21,14 @@ BEGIN_RCPP Rcpp::traits::input_parameter< Nullable >::type selectrows_(selectrows_SEXP); Rcpp::traits::input_parameter< Nullable >::type selectcols_(selectcols_SEXP); Rcpp::traits::input_parameter< const bool >::type empty_to_na(empty_to_naSEXP); - rcpp_result_gen = Rcpp::wrap(readsas(filePath, debug, selectrows_, selectcols_, empty_to_na)); + Rcpp::traits::input_parameter< std::string >::type tempstr(tempstrSEXP); + rcpp_result_gen = Rcpp::wrap(readsas(filePath, debug, selectrows_, selectcols_, empty_to_na, tempstr)); return rcpp_result_gen; END_RCPP } static const R_CallMethodDef CallEntries[] = { - {"_readsas_readsas", (DL_FUNC) &_readsas_readsas, 5}, + {"_readsas_readsas", (DL_FUNC) &_readsas_readsas, 6}, {NULL, NULL, 0} }; diff --git a/src/readsas.cpp b/src/readsas.cpp index 3214193..b8b8777 100644 --- a/src/readsas.cpp +++ b/src/readsas.cpp @@ -37,6 +37,7 @@ using namespace Rcpp; //' @param selectrows_ integer vector of selected rows //' @param selectcols_ character vector of selected rows //' @param empty_to_na logical convert '' to NA_character_ +//' @param tempstr filepath used for temp output when uncompressing //' @import Rcpp //' @export // [[Rcpp::export]] @@ -44,7 +45,8 @@ Rcpp::List readsas(const char * filePath, const bool debug, Nullable selectrows_, Nullable selectcols_, - const bool empty_to_na) + const bool empty_to_na, + std::string tempstr) { std::ifstream sas(filePath, std::ios::in | std::ios::binary | std::ios::ate); auto sas_size = sas.tellg(); @@ -52,7 +54,7 @@ Rcpp::List readsas(const char * filePath, sas.seekg(0, std::ios_base::beg); - const std::string tempstr = ".readsas_unc_tmp_file"; + if (tempstr.compare("") == 0) tempstr = ".readsas_unc_tmp_file"; std::fstream out (tempstr, std::ios::out | std::ios::binary); From e29b6a98c7b3a76b0ce14af51a18e23fa960495e Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Sat, 3 Sep 2022 13:46:02 +0200 Subject: [PATCH 10/11] remove development code --- src/readsas.cpp | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/readsas.cpp b/src/readsas.cpp index b8b8777..5cf0122 100644 --- a/src/readsas.cpp +++ b/src/readsas.cpp @@ -525,7 +525,6 @@ Rcpp::List readsas(const char * filePath, if (( PAGE_TYPE == 16384 || // PAGE_META_TYPE_2 - PAGE_TYPE == 4096 || // CATALOG PAGE_TYPE == 1024 || // PAGE_AMD_TYPE PAGE_TYPE == 640 || PAGE_TYPE == 512 || // PAGE_MIX_TYPE_2 PAGE_MIX_TYPE_1 PAGE_TYPE == 384 || PAGE_TYPE == 256 || // PAGE_DATA_TYPE_2 PAGE_DATA_TYPE @@ -535,20 +534,6 @@ Rcpp::List readsas(const char * filePath, for (auto i = 0; i < SUBHEADER_COUNT; ++i) { - if (PAGE_TYPE == 4096) { - - size_t txt_pos = unk16 + alignval; - sas.seekg(txt_pos, sas.cur); - - std::string sysresr_bitmap; - sysresr_bitmap.resize(16, '\0'); - sysresr_bitmap = readstring(sysresr_bitmap, sas); - Rcout << sysresr_bitmap << std::endl; - - stop("stop"); - - } - if (u64 == 4) { potabs[i].SH_OFF = readbin(potabs[i].SH_OFF, sas, swapit); // 8 From fc2e8f2ccb52eb0b41be6381184bc9510ae7fbd8 Mon Sep 17 00:00:00 2001 From: Jan Marvin Garbuszus Date: Sat, 3 Sep 2022 13:48:18 +0200 Subject: [PATCH 11/11] update wording --- R/readsas.R | 2 +- man/read.sas.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/readsas.R b/R/readsas.R index 2e4ac1c..a38c961 100644 --- a/R/readsas.R +++ b/R/readsas.R @@ -9,7 +9,7 @@ #' format is found in the sas7bdat file. For user-defined formats, the package #' provides functions to convert from sas7bdat to R. #' -#' Input files may contain deleted lines that are marked as deleted instead of +#' Input files may contain deleted rows that are marked as deleted instead of #' being removed from the input data. These are removed on import, if you still #' need them look at `remove_deleted`. Formats, labels and additional file #' information are available with `attributes()`. diff --git a/man/read.sas.Rd b/man/read.sas.Rd index 327f469..b978b80 100644 --- a/man/read.sas.Rd +++ b/man/read.sas.Rd @@ -47,7 +47,7 @@ Conversion to date variables is applied if a known date, datetime or time format is found in the sas7bdat file. For user-defined formats, the package provides functions to convert from sas7bdat to R. -Input files may contain deleted lines that are marked as deleted instead of +Input files may contain deleted rows that are marked as deleted instead of being removed from the input data. These are removed on import, if you still need them look at \code{remove_deleted}. Formats, labels and additional file information are available with \code{attributes()}.