From 9ee881b54adf1be7ad197e9cdb14f9eecd30db48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?I=C3=B1aki=20=C3=9Acar?= Date: Sun, 28 Sep 2025 14:49:02 +0200 Subject: [PATCH 01/11] implement new tokenizer --- R/RcppExports.R | 4 ++ src/RcppExports.cpp | 12 ++++ src/tokenizer.cpp | 157 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 173 insertions(+) create mode 100644 src/tokenizer.cpp diff --git a/R/RcppExports.R b/R/RcppExports.R index 0254605..a07eb57 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,6 +1,10 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 +parse_unit <- function(x) { + .Call('_units_parse_unit', PACKAGE = 'units', x) +} + ud_exit <- function() { invisible(.Call('_units_ud_exit', PACKAGE = 'units')) } diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 1a74d25..6f3c672 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -10,6 +10,17 @@ Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif +// parse_unit +SEXP parse_unit(std::string_view x); +RcppExport SEXP _units_parse_unit(SEXP xSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< std::string_view >::type x(xSEXP); + rcpp_result_gen = Rcpp::wrap(parse_unit(x)); + return rcpp_result_gen; +END_RCPP +} // ud_exit void ud_exit(); RcppExport SEXP _units_ud_exit() { @@ -269,6 +280,7 @@ END_RCPP } static const R_CallMethodDef CallEntries[] = { + {"_units_parse_unit", (DL_FUNC) &_units_parse_unit, 1}, {"_units_ud_exit", (DL_FUNC) &_units_ud_exit, 0}, {"_units_ud_init", (DL_FUNC) &_units_ud_init, 1}, {"_units_ud_set_encoding", (DL_FUNC) &_units_ud_set_encoding, 1}, diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp new file mode 100644 index 0000000..db5ddbc --- /dev/null +++ b/src/tokenizer.cpp @@ -0,0 +1,157 @@ +#include +using namespace Rcpp; + +class SymbolicUnits { +public: + SymbolicUnits(std::string_view x) : x(x), it(x.begin()) { tokenize(); } + SymbolicUnits(std::string_view x, std::string_view::iterator start, std::string_view::iterator end) + : x(x.substr(start - x.begin(), end - start)), it(this->x.end()) { + numerator.push_back(std::string(this->x)); + } + operator SEXP(); + +private: + std::string_view x; + std::string_view::iterator it; + std::vector numerator, denominator; + bool in_denominator = false; + + /* helpers -----------------------------------------------------------------*/ + + bool is_multiplicative(const char& c) { + return c == '*' || c == '.' || c == ' '; + } + + bool is_number_char(const char& c) { + return std::isdigit(c) || c == '.' || c == 'e' || c == '-'; + } + + bool is_name_char(const char& c) { + return !std::isdigit(c) && !is_multiplicative(c) && c != '/' && + c != '(' && c != ')' && c != '^' && c != '-' && c != '+'; + } + + /* getters -----------------------------------------------------------------*/ + + SymbolicUnits get_paren() { + int level = 1; + auto start = ++it; + + for (; it != x.end(); ++it) { + const char& c = *it; + if (c == '(') + ++level; + else if (c == ')' && --level == 0) + break; + } + + if (level != 0) + stop("unmatched parenthesis"); + + return SymbolicUnits(x.substr(start - x.begin(), it++ - start)); + } + + int get_exponent() { + int exponent = 1; + if (it == x.end()) return exponent; + + bool is_negative = false; + bool is_exponent = false; + if (*it == '^') { + is_exponent = true; + ++it; + } + switch (*it) { + case '-': + is_negative = true; + case '+': + is_exponent = true; + ++it; + } + bool is_digit = it != x.end() && std::isdigit(*it); + + if (!is_digit && is_exponent) + stop("invalid exponent"); + else if (is_digit) { + exponent = *(it++) - '0'; + for (; it != x.end() && isdigit(*it); ++it) + exponent = exponent * 10 + (*it - '0'); + if (is_negative) exponent = -exponent; + } + + return exponent; + } + + SymbolicUnits get_number() { + auto start = it++; + for (; it != x.end() && is_number_char(*it); ++it); + return SymbolicUnits(x, start, it); + } + + SymbolicUnits get_name() { + auto start = (*it == '`') ? ++it : it++; + for (; it != x.end() && is_name_char(*it); ++it); + return SymbolicUnits(x, start, it - (*(it-1) == '`' ? 1 : 0)); + } + + /* main logic --------------------------------------------------------------*/ + + void combine(SymbolicUnits& o, int exponent = 1) { + bool invert = in_denominator != (exponent < 0); + auto& target_num = invert ? denominator : numerator; + auto& target_den = invert ? numerator : denominator; + + for (exponent = std::abs(exponent); exponent > 0; --exponent) { + target_num.insert(target_num.end(), o.numerator.begin(), o.numerator.end()); + target_den.insert(target_den.end(), o.denominator.begin(), o.denominator.end()); + } + } + + void tokenize() { + while (it != x.end()) { + const char& c = *it; + + // handle separators + if (is_multiplicative(c)) { + ++it; + continue; + } + if (c == '/') { + in_denominator = !in_denominator; + ++it; + continue; + } + + // handle parenthesis, digits, and symbols/names + auto token = (c == '(') ? get_paren() : + (std::isdigit(c) ? get_number() : get_name()); + combine(token, get_exponent()); + in_denominator = false; + } + } +}; + +#include + +SymbolicUnits::operator SEXP() { + List unit = List::create( + _["numerator"] = numerator, + _["denominator"] = denominator + ); + unit.attr("class") = "symbolic_units"; + return unit; +}; + +// [[Rcpp::export]] +SEXP parse_unit(std::string_view x) { return SymbolicUnits(x); } + +/*** R +parse_unit("m") +parse_unit("m2") +parse_unit("m2 / g^3") +parse_unit("m2 / g-1") +parse_unit("ml / min / 1.73 / m^2") +parse_unit("ml/min/1.73m^2") +parse_unit("ml/min/1.73/m^2") +parse_unit("ml/min/(1.73m^2)") +*/ From f1fb41eb0d73aa40f3285755042af0d3f1047fb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?I=C3=B1aki=20=C3=9Acar?= Date: Sun, 28 Sep 2025 14:50:43 +0200 Subject: [PATCH 02/11] reimplement as_units based on new tokenizer --- R/conversion.R | 3 +- R/make_units.R | 250 ++++++++++++++++--------------------------------- man/units.Rd | 64 ++++--------- 3 files changed, 100 insertions(+), 217 deletions(-) diff --git a/R/conversion.R b/R/conversion.R index f924b53..436ab20 100644 --- a/R/conversion.R +++ b/R/conversion.R @@ -233,9 +233,10 @@ set_units.numeric <- function(x, value, ..., mode = units_options("set_units_mod value <- unitless else if (mode == "symbols") { value <- substitute(value) - if(is.numeric(value) && !identical(value, 1) && !identical(value, 1L)) stop("The only valid number defining a unit is '1', signifying a unitless unit") + if (is.name(value) || is.call(value)) + value <- format(value) } units(x) <- as_units(value, ...) diff --git a/R/make_units.R b/R/make_units.R index 5301322..948eefa 100644 --- a/R/make_units.R +++ b/R/make_units.R @@ -121,7 +121,7 @@ #' # or #' drop_units(y) make_units <- function(bare_expression, check_is_valid = TRUE) { - as_units.call(substitute(bare_expression), check_is_valid = check_is_valid) + as_units(format(substitute(bare_expression)), check_is_valid = check_is_valid) } #' @name units @@ -189,63 +189,59 @@ as_units.difftime <- function(x, value, ...) { # ----- as_units.character helpers ------ -backtick <- function(x) { - # backtick all character runs uninterupted by one of ^()*^/`- or a space - # don't double up backticks - x <- gsub("`?([^() \\*^/`-]+)`?", "`\\1`", x) - gsub("`([0-9]*\\.?[0-9]+)`", "\\1", x) # unbacktick bare numbers +is_udunits_time <- function(s) { + ud_is_parseable(s) && ud_are_convertible(s, "seconds since 1970-01-01") } -are_exponents_implicit <- function(s) { - s <- trimws(s) - has <- function(chr, regex = FALSE) - grepl(chr, s, fixed = !regex, perl = regex) - !has("^") && !has("*") && !has("/") && has("\\s|\\D.*\\d$", regex = TRUE) +# from package:yasp, paste collapse with serial (oxford) comma +pc_and <- function(..., sep = "") { + x <- paste(..., sep = sep, collapse = NULL) + lx <- length(x) + if(lx == 0L) + "" + else if (lx == 1L) + x + else if (lx == 2L) + paste0(x, collapse = " and ") + else + paste0( paste0(x[-lx], collapse = ", "), ", and ", x[lx]) } -is_udunits_time <- function(s) { - ud_is_parseable(s) && ud_are_convertible(s, "seconds since 1970-01-01") +.msg_units_not_recognized <- function(unrecognized_symbols, full_expr) { + + if (is.language(full_expr)) + full_expr <- deparse(full_expr) + + is_are <- if (length(unrecognized_symbols) > 1L) "are" else "is" + + paste0("In ", sQuote(full_expr), ", ", + pc_and(sQuote(unrecognized_symbols)), " ", is_are, " not recognized by udunits.\n\n", + "See a table of valid unit symbols and names with valid_udunits().\n", + "Custom user-defined units can be added with install_unit().\n\n", + "See a table of valid unit prefixes with valid_udunits_prefixes().\n", + "Prefixes will automatically work with any user-defined unit.") } #' @name units #' @export #' +#' @param check_is_valid throw an error if all the unit symbols are not either +#' recognized by udunits2, or a custom +#' user defined via \code{install_unit()}. If \code{FALSE}, no check +#' for validity is performed. +#' #' @param force_single_symbol Whether to perform no string parsing and force #' treatment of the string as a single symbol. #' -#' @param implicit_exponents If the unit string is in product power form (e.g. -#' \code{"km m-2 s-1"}). Defaults to \code{NULL}, in which case a guess is made -#' based on the supplied string. Set to \code{TRUE} or \code{FALSE} if the guess is -#' incorrect. -#' #' @section Character strings: #' #' Generally speaking, there are 3 types of unit strings are accepted in #' \code{as_units} (and by extension, \code{`units<-`}). #' -#' The first, and likely most common, is a "standard" format unit +#' The first type, and likely most common, is a "standard" format unit #' specification where the relationship between unit symbols or names is #' specified explicitly with arithmetic symbols for division \code{/}, -#' multiplication \code{*} and power exponents \code{^}, or other mathematical -#' functions like \code{log()}. In this case, the string is parsed as an R -#' expression via \code{parse(text = )} after backticking all unit symbols and -#' names, and then passed on to \code{as_units.call()}. A heuristic is used to -#' perform backticking, such that any continuous set of characters -#' uninterrupted by one of \code{()\\*^-} are backticked (unless the character -#' sequence consists solely of numbers \code{0-9}), with some care to not -#' double up on pre-existing backticks. This heuristic appears to be quite -#' robust, and works for units would otherwise not be valid R syntax. For -#' example, percent (\code{"\%"}), feet (\code{"'"}), inches (\code{"in"}), -#' and Tesla (\code{"T"}) are all backticked and parsed correctly. -#' -#' Nevertheless, for certain complex unit expressions, this backticking heuristic -#' may give incorrect results. If the string supplied fails to parse as an R -#' expression, then the string is treated as a single symbolic unit and -#' \code{symbolic_unit(chr)} is used as a fallback with a warning. In that -#' case, automatic unit simplification may not work properly when performing -#' operations on unit objects, but unit conversion and other Math operations -#' should still give correct results so long as the unit string supplied -#' returns \code{TRUE} for \code{ud_is_parsable()}. +#' multiplication \code{*} and power exponents \code{^}. #' #' The second type of unit string accepted is one with implicit exponents. In #' this format, \code{/}, \code{*}, and \code{^}, may not be present in the @@ -253,11 +249,16 @@ is_udunits_time <- function(s) { #' symbol may optionally be followed by a single number, specifying the power. #' For example \code{"m2 s-2"} is equivalent to \code{"(m^2)*(s^-2)"}. #' -#' It must be noted that prepended numbers are supported too, but their -#' interpretation slightly varies depending on whether they are separated from -#' the unit string or not. E.g., \code{"1000 m"} is interpreted as magnitude -#' and unit, but \code{"1000m"} is interpreted as a prefixed unit, and it is -#' equivalent to \code{"km"} to all effects. +#' If the string supplied fails to parse, then the string is treated as a +#' single symbolic unit and \code{symbolic_unit(chr)} is used as a fallback +#' with a warning. In that case, automatic unit simplification may not work +#' properly when performing operations on unit objects, but unit conversion +#' and other Math operations should still give correct results so long as +#' the unit string supplied returns \code{TRUE} for \code{ud_is_parsable()}. +#' +#' It must be noted that prepended numbers are supported too, but are not +#' treated as magnitudes. For example, \code{"1000 m"} is interpreted as +#' a prefixed unit, and it is equivalent to \code{"km"} to all effects. #' #' The third type of unit string format accepted is the special case of #' udunits time duration with a reference origin, for example \code{"hours @@ -268,157 +269,69 @@ is_udunits_time <- function(s) { #' otherwise encouraged to use \code{R}'s date and time functionality provided #' by \code{Date} and \code{POSIXt} classes. #' -as_units.character <- function(x, +#' @note By default, unit names are automatically substituted with unit names +#' (e.g., kilogram --> kg). To turn off this behavior, set +#' \code{units_options(auto_convert_names_to_symbols = FALSE)} +#' +#' @seealso \code{\link{install_unit}}, \code{\link{valid_udunits}} +as_units.character <- function(x, ..., check_is_valid = TRUE, - implicit_exponents = NULL, - force_single_symbol = FALSE, ...) { + force_single_symbol = FALSE) { stopifnot(is.character(x), length(x) == 1) - if (isTRUE(x == "")) return(unitless) + if (any(is.na(x))) + stop("a missing value for units is not allowed") + + if (isTRUE(x == "" || x == "1")) + return(.as.units(1, unitless)) if(force_single_symbol || is_udunits_time(x)) return(symbolic_unit(x, check_is_valid = check_is_valid)) - if(is.null(implicit_exponents)) - implicit_exponents <- are_exponents_implicit(x) - - if(implicit_exponents) - x <- convert_implicit_to_explicit_exponents(x) - - x <- backtick(x) - o <- try(expr <- parse(text = x)[[1]], silent = TRUE) - + o <- try(su <- parse_unit(x), silent = TRUE) if(inherits(o, "try-error")) { - warning("Could not parse expression: ", sQuote(x), # nocov - ". Returning as a single symbolic unit()", call. = FALSE) # nocov - return(symbolic_unit(x, check_is_valid = check_is_valid)) # nocov + warning("Could not parse expression: ", sQuote(x), # nocov + ". Returning as a single symbolic unit()", call. = FALSE) # nocov + return(symbolic_unit(x, check_is_valid = check_is_valid)) # nocov } - as_units.call(expr, check_is_valid = check_is_valid) -} - - -convert_implicit_to_explicit_exponents <- function(x) { - if (length(grep(c("[*/]"), x)) > 0) - stop("If 'implicit_exponents = TRUE', strings cannot contain `*' or `/'") - x <- gsub("\\b([^\\d-]+)([-]?\\d+)\\b", "\\1^(\\2)", x, perl =TRUE) - x <- gsub("\\s+", " * ", trimws(x), perl = TRUE) - x -} - -# ----- as_units.call helpers ------ - -# from package:yasp, paste collapse with serial (oxford) comma -pc_and <- function(..., sep = "") { - x <- paste(..., sep = sep, collapse = NULL) - lx <- length(x) - if(lx == 0L) - "" - else if (lx == 1L) - x - else if (lx == 2L) - paste0(x, collapse = " and ") - else - paste0( paste0(x[-lx], collapse = ", "), ", and ", x[lx]) -} - -#`%not_in%` <- function(x, table) match(x, table, nomatch = 0L) == 0L - -.msg_units_not_recognized <- function(unrecognized_symbols, full_expr) { - - if (is.language(full_expr)) - full_expr <- deparse(full_expr) - - is_are <- if (length(unrecognized_symbols) > 1L) "are" else "is" - - paste0("In ", sQuote(full_expr), ", ", - pc_and(sQuote(unrecognized_symbols)), " ", is_are, " not recognized by udunits.\n\n", - "See a table of valid unit symbols and names with valid_udunits().\n", - "Custom user-defined units can be added with install_unit().\n\n", - "See a table of valid unit prefixes with valid_udunits_prefixes().\n", - "Prefixes will automatically work with any user-defined unit.") -} - -units_eval_env <- new.env(parent = baseenv()) -units_eval_env$ln <- function(x) base::log(x) -units_eval_env$lg <- function(x) base::log(x, base = 10) -units_eval_env$lb <- function(x) base::log(x, base = 2) - - -#' @name units -#' @export -#' -#' @param check_is_valid throw an error if all the unit symbols are not either -#' recognized by udunits2, or a custom -#' user defined via \code{install_unit()}. If \code{FALSE}, no check -#' for validity is performed. -#' -#' @note By default, unit names are automatically substituted with unit names -#' (e.g., kilogram --> kg). To turn off this behavior, set -#' \code{units_options(auto_convert_names_to_symbols = FALSE)} -#' -#' @section Expressions: -#' -#' In \code{as_units()}, each of the symbols in the unit expression is treated -#' individually, such that each symbol must be recognized by the udunits -#' database, \emph{or} be a custom, -#' user-defined unit symbol that was defined by \code{install_unit()}. To -#' see which symbols and names are currently recognized by the udunits -#' database, see \code{valid_udunits()}. -#' -#' @seealso \code{\link{install_unit}}, \code{\link{valid_udunits}} -as_units.call <- function(x, check_is_valid = TRUE, ...) { - - if(missing(x) || identical(x, quote(expr =)) || - identical(x, 1) || identical(x, 1L)) - return(.as.units(1, unitless)) - - if (is.vector(x) && !is.expression(x) && any(is.na(x))) - stop("a missing value for units is not allowed") - - stopifnot(is.language(x)) - - vars <- all.vars(x) - if(!length(vars)) - stop(call. = FALSE, -"No symbols found. Please supply bare expressions with this approach. -See ?as_units for usage examples.") - if (check_is_valid) { + vars <- c(su$numerator, su$denominator) valid <- vapply(vars, ud_is_parseable, logical(1L)) if (!all(valid)) stop(.msg_units_not_recognized(vars[!valid], x), call. = FALSE) } - names(vars) <- vars - tmp_env <- lapply(vars, symbolic_unit, check_is_valid = FALSE) - - if (dont_simplify_here <- is.na(.units.simplify())) { - units_options(simplify = FALSE) - on.exit(units_options(simplify = NA)) + if (units_options("auto_convert_names_to_symbols")) { + name_to_symbol <- function(chr) + if (length(sym <- ud_get_symbol(chr))) sym else chr + su$numerator <- vapply(su$numerator, name_to_symbol, character(1), USE.NAMES=FALSE) + su$denominator <- vapply(su$denominator, name_to_symbol, character(1), USE.NAMES=FALSE) } - unit <- tryCatch( eval(x, tmp_env, units_eval_env), - error = function(e) stop( paste0( conditionMessage(e), "\n", - "Did you try to supply a value in a context where a bare expression was expected?" - ), call. = FALSE )) - -# if(as.numeric(unit) %not_in% c(1, 0)) # 0 if log() used. -# stop(call. = FALSE, -#"In ", sQuote(deparse(x)), " the numeric multiplier ", sQuote(as.numeric(unit)), " is invalid. -#Use `install_unit()` to define a new unit that is a multiple of another unit.") + if (is.na(.units.simplify())) { + units_options(simplify = FALSE) + on.exit(units_options(simplify = NA)) + } + .simplify_units(1, su) +} - .as.units(as.numeric(unit), units(unit)) +#' @name units +#' @export +as_units.call <- function(x, ...) { + as_units(format(x), ...) } #' @name units #' @export -as_units.expression <- as_units.call +as_units.expression <- function(x, ...) { + as_units(as.character(x), ...) +} #' @name units #' @export -as_units.name <- as_units.call +as_units.name <- as_units.expression #' @name units #' @export @@ -442,7 +355,6 @@ as_units.Date = function(x, value, ...) { symbolic_unit <- function(chr, check_is_valid = TRUE) { - stopifnot(is.character(chr), length(chr) == 1) if (check_is_valid && !ud_is_parseable(chr)) { diff --git a/man/units.Rd b/man/units.Rd index 394a65a..4277b58 100644 --- a/man/units.Rd +++ b/man/units.Rd @@ -46,14 +46,14 @@ as_units(x, ...) \method{as_units}{difftime}(x, value, ...) -\method{as_units}{character}(x, check_is_valid = TRUE, - implicit_exponents = NULL, force_single_symbol = FALSE, ...) +\method{as_units}{character}(x, ..., check_is_valid = TRUE, + force_single_symbol = FALSE) -\method{as_units}{call}(x, check_is_valid = TRUE, ...) +\method{as_units}{call}(x, ...) -\method{as_units}{expression}(x, check_is_valid = TRUE, ...) +\method{as_units}{expression}(x, ...) -\method{as_units}{name}(x, check_is_valid = TRUE, ...) +\method{as_units}{name}(x, ...) \method{as_units}{POSIXt}(x, value, ...) @@ -80,11 +80,6 @@ recognized by udunits2, or a custom user defined via \code{install_unit()}. If \code{FALSE}, no check for validity is performed.} -\item{implicit_exponents}{If the unit string is in product power form (e.g. -\code{"km m-2 s-1"}). Defaults to \code{NULL}, in which case a guess is made -based on the supplied string. Set to \code{TRUE} or \code{FALSE} if the guess is -incorrect.} - \item{force_single_symbol}{Whether to perform no string parsing and force treatment of the string as a single symbol.} } @@ -128,29 +123,10 @@ By default, unit names are automatically substituted with unit names Generally speaking, there are 3 types of unit strings are accepted in \code{as_units} (and by extension, \code{`units<-`}). - The first, and likely most common, is a "standard" format unit + The first type, and likely most common, is a "standard" format unit specification where the relationship between unit symbols or names is specified explicitly with arithmetic symbols for division \code{/}, - multiplication \code{*} and power exponents \code{^}, or other mathematical - functions like \code{log()}. In this case, the string is parsed as an R - expression via \code{parse(text = )} after backticking all unit symbols and - names, and then passed on to \code{as_units.call()}. A heuristic is used to - perform backticking, such that any continuous set of characters - uninterrupted by one of \code{()\\*^-} are backticked (unless the character - sequence consists solely of numbers \code{0-9}), with some care to not - double up on pre-existing backticks. This heuristic appears to be quite - robust, and works for units would otherwise not be valid R syntax. For - example, percent (\code{"\%"}), feet (\code{"'"}), inches (\code{"in"}), - and Tesla (\code{"T"}) are all backticked and parsed correctly. - - Nevertheless, for certain complex unit expressions, this backticking heuristic - may give incorrect results. If the string supplied fails to parse as an R - expression, then the string is treated as a single symbolic unit and - \code{symbolic_unit(chr)} is used as a fallback with a warning. In that - case, automatic unit simplification may not work properly when performing - operations on unit objects, but unit conversion and other Math operations - should still give correct results so long as the unit string supplied - returns \code{TRUE} for \code{ud_is_parsable()}. + multiplication \code{*} and power exponents \code{^}. The second type of unit string accepted is one with implicit exponents. In this format, \code{/}, \code{*}, and \code{^}, may not be present in the @@ -158,11 +134,16 @@ By default, unit names are automatically substituted with unit names symbol may optionally be followed by a single number, specifying the power. For example \code{"m2 s-2"} is equivalent to \code{"(m^2)*(s^-2)"}. - It must be noted that prepended numbers are supported too, but their - interpretation slightly varies depending on whether they are separated from - the unit string or not. E.g., \code{"1000 m"} is interpreted as magnitude - and unit, but \code{"1000m"} is interpreted as a prefixed unit, and it is - equivalent to \code{"km"} to all effects. + If the string supplied fails to parse, then the string is treated as a + single symbolic unit and \code{symbolic_unit(chr)} is used as a fallback + with a warning. In that case, automatic unit simplification may not work + properly when performing operations on unit objects, but unit conversion + and other Math operations should still give correct results so long as + the unit string supplied returns \code{TRUE} for \code{ud_is_parsable()}. + + It must be noted that prepended numbers are supported too, but are not + treated as magnitudes. For example, \code{"1000 m"} is interpreted as + a prefixed unit, and it is equivalent to \code{"km"} to all effects. The third type of unit string format accepted is the special case of udunits time duration with a reference origin, for example \code{"hours @@ -174,17 +155,6 @@ By default, unit names are automatically substituted with unit names by \code{Date} and \code{POSIXt} classes. } -\section{Expressions}{ - - - In \code{as_units()}, each of the symbols in the unit expression is treated - individually, such that each symbol must be recognized by the udunits - database, \emph{or} be a custom, - user-defined unit symbol that was defined by \code{install_unit()}. To - see which symbols and names are currently recognized by the udunits - database, see \code{valid_udunits()}. -} - \examples{ x = 1:3 class(x) From 0ba30d2a6c3a240997e416273bf77850b532e603 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?I=C3=B1aki=20=C3=9Acar?= Date: Sun, 28 Sep 2025 14:51:31 +0200 Subject: [PATCH 03/11] remove unsupported bits --- tests/testthat/test_unit_creation.R | 44 ----------------------------- vignettes/units.Rmd | 1 - 2 files changed, 45 deletions(-) diff --git a/tests/testthat/test_unit_creation.R b/tests/testthat/test_unit_creation.R index f6afe2e..c010eba 100644 --- a/tests/testthat/test_unit_creation.R +++ b/tests/testthat/test_unit_creation.R @@ -1,47 +1,3 @@ - -test_that("parse_units() backticks strings correctly", { - - x <- matrix(ncol = 2, byrow = TRUE, c( - "in", "`in`", - "`in`", "`in`", - "kelvin", "`kelvin`", - "%", "`%`", - "T", "`T`", - "'/s", "`'`/`s`", - "'", "`'`" , - '"', '`"`' , - '"/s' , '`"`/`s`', - "s/'" , "`s`/`'`" , - "C" , "`C`" , - "F" , "`F`", - "\u00B0C", "`\u00B0C`", - "\u2103" , "`\u2103`", - "\u00B0F", "`\u00B0F`", - "\u2109", "`\u2109`", - "log(ug)", "`log`(`ug`)", - "log(ug/l)", "`log`(`ug`/`l`)", - "kg*m/s^2", "`kg`*`m`/`s`^2" - )) - colnames(x) <- c("input", "expected_output") - - expect_identical(units:::backtick(x[,"input"]), x[,"expected_output"]) -}) - -test_that("explicit exponents identified correctly", { - expect_true( are_exponents_implicit("m s") ) - expect_true( are_exponents_implicit("m2") ) - expect_true( are_exponents_implicit("m-2") ) - expect_true( are_exponents_implicit("2 m") ) - expect_true( are_exponents_implicit("m s-2") ) - expect_true( are_exponents_implicit("m s-2 kg") ) - expect_true( are_exponents_implicit("2 m s") ) - - expect_false( are_exponents_implicit("m") ) - expect_false( are_exponents_implicit("m/s") ) - expect_false( are_exponents_implicit("m^2") ) - expect_false( are_exponents_implicit("m*s") ) -}) - test_that("global options are respected", { # rt: round trip rt <- function(x) as.character(units(as_units(x))) diff --git a/vignettes/units.Rmd b/vignettes/units.Rmd index d1a7d86..3185679 100644 --- a/vignettes/units.Rmd +++ b/vignettes/units.Rmd @@ -133,7 +133,6 @@ sum(a) min(a) max(a) range(a) -make_units(min(m/s, km/h)) # converts to first unit: ``` ### Printing From 2378f5213320a940447031a87ef09cbd409b4b9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?I=C3=B1aki=20=C3=9Acar?= Date: Sun, 28 Sep 2025 15:28:11 +0200 Subject: [PATCH 04/11] run full pillar test --- tests/testthat/_snaps/tidyverse.md | 11 +++++++++-- tests/testthat/test_tidyverse.R | 4 ++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/tests/testthat/_snaps/tidyverse.md b/tests/testthat/_snaps/tidyverse.md index 4da8963..c779a2e 100644 --- a/tests/testthat/_snaps/tidyverse.md +++ b/tests/testthat/_snaps/tidyverse.md @@ -1,15 +1,22 @@ # pillar methods are available for units objects Code - pillar::pillar(x[1]) + pillar::pillar(x) Output [km] 1 + 2 + 3 Code - pillar::pillar(m[1]) + pillar::pillar(m) Output 1 [km] + 2 [km] + 3 [km] + 4 [g] + 5 [g] + 6 [g] diff --git a/tests/testthat/test_tidyverse.R b/tests/testthat/test_tidyverse.R index a5df966..742fb3f 100644 --- a/tests/testthat/test_tidyverse.R +++ b/tests/testthat/test_tidyverse.R @@ -9,8 +9,8 @@ test_that("pillar methods are available for units objects", { expect_equal(pillar::type_sum(m), "mixed_units") expect_snapshot({ - pillar::pillar(x[1]) - pillar::pillar(m[1]) + pillar::pillar(x) + pillar::pillar(m) }) }) From a85c574d63b714f54b898225ff50d2005f9548c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?I=C3=B1aki=20=C3=9Acar?= Date: Sun, 28 Sep 2025 16:02:57 +0200 Subject: [PATCH 05/11] add more tests --- tests/testthat/test_unit_creation.R | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/testthat/test_unit_creation.R b/tests/testthat/test_unit_creation.R index c010eba..66fe786 100644 --- a/tests/testthat/test_unit_creation.R +++ b/tests/testthat/test_unit_creation.R @@ -125,3 +125,31 @@ test_that("set_units default enforces NSE", { # is it bad if this works? # expect_error(set_units(1:3, "m/s")) }) + +expect_symbolic <- function(u, n, d) + expect_equal(units(as_units(u)), units:::.symbolic_units(n, d)) + +test_that("exotic units work", { + # check what udunits support + # units:::R_ut_format(units:::R_ut_parse(some_string)) + + expect_symbolic("2.2 m s", c("2.2", "m", "s"), character(0)) + expect_symbolic("2.2*m*s", c("2.2", "m", "s"), character(0)) + #expect_symbolic("2.2.m.s", c("2.2", "m", "s"), character(0)) + + expect_symbolic("m2/s", c("m", "m"), "s") + expect_symbolic("m^2/s", c("m", "m"), "s") + expect_symbolic("m 2/s", c("m", "2"), "s") + expect_symbolic("m-2/s", character(0), c("m", "m", "s")) + expect_symbolic("m^-2/s", character(0), c("m", "m", "s")) + expect_symbolic("m/s2", "m", c("s", "s")) + expect_symbolic("m/s^2", "m", c("s", "s")) + expect_symbolic("m/s 2", c("m", "2"), "s") + expect_symbolic("m/s-2", c("m", "s", "s"), character(0)) + expect_symbolic("m/s^-2", c("m", "s", "s"), character(0)) + + expect_symbolic("ml/min/1.73m^2", c("ml", "m", "m"), c("min", "1.73")) + expect_symbolic("ml/min/(1.73m^2)", "ml", c("min", "1.73", "m", "m")) + expect_symbolic("ml/min/1.73/m^2", "ml", c("min", "1.73", "m", "m")) + expect_symbolic("ml/min/1.73m-2", "ml", c("min", "1.73", "m", "m")) +}) From 29aa42201e9f2e00b393ab6e8b78ca1f1a04351b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?I=C3=B1aki=20=C3=9Acar?= Date: Sun, 28 Sep 2025 18:58:26 +0200 Subject: [PATCH 06/11] treat numbers as prefixes --- R/RcppExports.R | 4 ++-- R/make_units.R | 2 +- R/options.R | 12 ++++++++++-- man/units_options.Rd | 6 ++++-- src/RcppExports.cpp | 9 +++++---- src/tokenizer.cpp | 19 ++++++++++++++----- tests/testthat/test_unit_creation.R | 7 ++++++- 7 files changed, 42 insertions(+), 17 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index a07eb57..37f8965 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,8 +1,8 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 -parse_unit <- function(x) { - .Call('_units_parse_unit', PACKAGE = 'units', x) +parse_unit <- function(x, strict = FALSE) { + .Call('_units_parse_unit', PACKAGE = 'units', x, strict) } ud_exit <- function() { diff --git a/R/make_units.R b/R/make_units.R index 948eefa..38c4cac 100644 --- a/R/make_units.R +++ b/R/make_units.R @@ -289,7 +289,7 @@ as_units.character <- function(x, ..., if(force_single_symbol || is_udunits_time(x)) return(symbolic_unit(x, check_is_valid = check_is_valid)) - o <- try(su <- parse_unit(x), silent = TRUE) + o <- try(su <- parse_unit(x, units_options("strict_tokenizer")), silent=TRUE) if(inherits(o, "try-error")) { warning("Could not parse expression: ", sQuote(x), # nocov ". Returning as a single symbolic unit()", call. = FALSE) # nocov diff --git a/R/options.R b/R/options.R index 5e6fa47..30ca52b 100644 --- a/R/options.R +++ b/R/options.R @@ -6,6 +6,7 @@ negative_power = FALSE, parse = TRUE, set_units_mode = "symbols", + strict_tokenizer = FALSE, auto_convert_names_to_symbols = TRUE, simplify = NA, allow_mixed = FALSE, @@ -29,6 +30,7 @@ #' @param negative_power logical, default \code{FALSE}; should denominators have negative power, or follow a division symbol? #' @param parse logical, default \code{TRUE}; should the units be made into an expression (so we get subscripts)? Setting to \code{FALSE} may be useful if \link{parse} fails, e.g. if the unit contains symbols that assume a particular encoding #' @param set_units_mode character; either \code{"symbols"} or \code{"standard"}; see \link{set_units}; default is \code{"symbols"} +#' @param strict_tokenizer logical, default \code{FALSE}; non-strict tokenization attaches constants to the following symbol. #' @param auto_convert_names_to_symbols logical, default \code{TRUE}: should names, such as \code{degree_C} be converted to their usual symbol? #' @param simplify logical, default \code{NA}; simplify units in expressions? #' @param allow_mixed logical; if \code{TRUE}, combining mixed units creates a \code{mixed_units} object, if \code{FALSE} it generates an error @@ -45,8 +47,10 @@ #' units_options(sep = c("~", "~"), group = c("[", "]"), negative_power = FALSE, parse = TRUE) #' units_options("group") #' @export -units_options = function(..., sep, group, negative_power, parse, set_units_mode, auto_convert_names_to_symbols, simplify, - allow_mixed, unitless_symbol, define_bel) { +units_options = function(..., sep, group, negative_power, parse, set_units_mode, + strict_tokenizer, auto_convert_names_to_symbols, simplify, + allow_mixed, unitless_symbol, define_bel) +{ # op = as.list(units:::.units_options) ret = list() if (!missing(sep)) { @@ -69,6 +73,10 @@ units_options = function(..., sep, group, negative_power, parse, set_units_mode, stopifnot(is.character(set_units_mode) && length(set_units_mode) == 1) ret$set_units_mode = .setopt(set_units_mode) } + if (!missing(strict_tokenizer)) { + stopifnot(is.logical(strict_tokenizer)) + ret$strict_tokenizer = .setopt(strict_tokenizer) + } if (!missing(auto_convert_names_to_symbols)) { stopifnot(is.logical(auto_convert_names_to_symbols)) ret$auto_convert_names_to_symbols = .setopt(auto_convert_names_to_symbols) diff --git a/man/units_options.Rd b/man/units_options.Rd index 0f869fc..5f737bf 100644 --- a/man/units_options.Rd +++ b/man/units_options.Rd @@ -5,8 +5,8 @@ \title{set one or more units global options} \usage{ units_options(..., sep, group, negative_power, parse, set_units_mode, - auto_convert_names_to_symbols, simplify, allow_mixed, unitless_symbol, - define_bel) + strict_tokenizer, auto_convert_names_to_symbols, simplify, allow_mixed, + unitless_symbol, define_bel) } \arguments{ \item{...}{named options (character) for which the value is queried} @@ -21,6 +21,8 @@ units_options(..., sep, group, negative_power, parse, set_units_mode, \item{set_units_mode}{character; either \code{"symbols"} or \code{"standard"}; see \link{set_units}; default is \code{"symbols"}} +\item{strict_tokenizer}{logical, default \code{FALSE}; non-strict tokenization attaches constants to the following symbol.} + \item{auto_convert_names_to_symbols}{logical, default \code{TRUE}: should names, such as \code{degree_C} be converted to their usual symbol?} \item{simplify}{logical, default \code{NA}; simplify units in expressions?} diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 6f3c672..ad8adaa 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -11,13 +11,14 @@ Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif // parse_unit -SEXP parse_unit(std::string_view x); -RcppExport SEXP _units_parse_unit(SEXP xSEXP) { +SEXP parse_unit(std::string_view x, bool strict); +RcppExport SEXP _units_parse_unit(SEXP xSEXP, SEXP strictSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< std::string_view >::type x(xSEXP); - rcpp_result_gen = Rcpp::wrap(parse_unit(x)); + Rcpp::traits::input_parameter< bool >::type strict(strictSEXP); + rcpp_result_gen = Rcpp::wrap(parse_unit(x, strict)); return rcpp_result_gen; END_RCPP } @@ -280,7 +281,7 @@ END_RCPP } static const R_CallMethodDef CallEntries[] = { - {"_units_parse_unit", (DL_FUNC) &_units_parse_unit, 1}, + {"_units_parse_unit", (DL_FUNC) &_units_parse_unit, 2}, {"_units_ud_exit", (DL_FUNC) &_units_ud_exit, 0}, {"_units_ud_init", (DL_FUNC) &_units_ud_init, 1}, {"_units_ud_set_encoding", (DL_FUNC) &_units_ud_set_encoding, 1}, diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index db5ddbc..544e117 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -3,7 +3,8 @@ using namespace Rcpp; class SymbolicUnits { public: - SymbolicUnits(std::string_view x) : x(x), it(x.begin()) { tokenize(); } + SymbolicUnits(std::string_view x, bool strict=false) + : x(x), it(x.begin()), strict(strict) { tokenize(); } SymbolicUnits(std::string_view x, std::string_view::iterator start, std::string_view::iterator end) : x(x.substr(start - x.begin(), end - start)), it(this->x.end()) { numerator.push_back(std::string(this->x)); @@ -15,6 +16,8 @@ class SymbolicUnits { std::string_view::iterator it; std::vector numerator, denominator; bool in_denominator = false; + bool stay = false; + bool strict = false; /* helpers -----------------------------------------------------------------*/ @@ -48,7 +51,7 @@ class SymbolicUnits { if (level != 0) stop("unmatched parenthesis"); - return SymbolicUnits(x.substr(start - x.begin(), it++ - start)); + return SymbolicUnits(x.substr(start - x.begin(), it++ - start), strict); } int get_exponent() { @@ -85,6 +88,7 @@ class SymbolicUnits { SymbolicUnits get_number() { auto start = it++; for (; it != x.end() && is_number_char(*it); ++it); + stay = !strict; return SymbolicUnits(x, start, it); } @@ -117,7 +121,7 @@ class SymbolicUnits { continue; } if (c == '/') { - in_denominator = !in_denominator; + in_denominator = true; ++it; continue; } @@ -126,7 +130,10 @@ class SymbolicUnits { auto token = (c == '(') ? get_paren() : (std::isdigit(c) ? get_number() : get_name()); combine(token, get_exponent()); - in_denominator = false; + + // switch to numerator if needed + if (!stay) in_denominator = false; + stay = false; } } }; @@ -143,7 +150,9 @@ SymbolicUnits::operator SEXP() { }; // [[Rcpp::export]] -SEXP parse_unit(std::string_view x) { return SymbolicUnits(x); } +SEXP parse_unit(std::string_view x, bool strict=false) { + return SymbolicUnits(x, strict); +} /*** R parse_unit("m") diff --git a/tests/testthat/test_unit_creation.R b/tests/testthat/test_unit_creation.R index 66fe786..4c2128a 100644 --- a/tests/testthat/test_unit_creation.R +++ b/tests/testthat/test_unit_creation.R @@ -148,8 +148,13 @@ test_that("exotic units work", { expect_symbolic("m/s-2", c("m", "s", "s"), character(0)) expect_symbolic("m/s^-2", c("m", "s", "s"), character(0)) - expect_symbolic("ml/min/1.73m^2", c("ml", "m", "m"), c("min", "1.73")) expect_symbolic("ml/min/(1.73m^2)", "ml", c("min", "1.73", "m", "m")) expect_symbolic("ml/min/1.73/m^2", "ml", c("min", "1.73", "m", "m")) + expect_symbolic("ml/min/1.73m^2", "ml", c("min", "1.73", "m", "m")) + expect_symbolic("ml/min/1.73m-2", c("ml", "m", "m"), c("min", "1.73")) + + old <- unlist(units_options(strict_tokenizer=TRUE)) + on.exit(units_options(strict_tokenizer=old)) + expect_symbolic("ml/min/1.73m^2", c("ml", "m", "m"), c("min", "1.73")) expect_symbolic("ml/min/1.73m-2", "ml", c("min", "1.73", "m", "m")) }) From 8672f3a82762ef2e9a0e3aa2f108730e66c3009e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?I=C3=B1aki=20=C3=9Acar?= Date: Sun, 28 Sep 2025 20:16:28 +0200 Subject: [PATCH 07/11] follow strict_tokenizer while formatting too --- R/symbolic_units.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/R/symbolic_units.R b/R/symbolic_units.R index f185160..f519a8e 100644 --- a/R/symbolic_units.R +++ b/R/symbolic_units.R @@ -53,12 +53,16 @@ unitless <- .symbolic_units(vector("character"), vector("character")) for (i in seq_along(sym)) if (pwr[i] != 1) sym[i] <- paste(sym[i], pwr[i], sep = pwr_op) - paste0(sym, collapse = paste0(op, sep)) + + s <- matrix(c(sym, rep(paste0(op, sep), length(sym)-1), ""), nrow=length(sym)) + if (!units_options("strict_tokenizer")) + s[sym >= "0" & sym <= "9", 2] <- "" + paste0(t(s), collapse="") } #' @export as.character.symbolic_units <- function(x, ..., - neg_power = get(".units.negative_power", envir = .units_options), + neg_power = units_options("negative_power"), escape_units = FALSE, prod_sep = "*", plot_sep = "") { sep <- plot_sep From b5a0a28fb1158658ca68d7522dd72f3c509d9076 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?I=C3=B1aki=20=C3=9Acar?= Date: Sun, 28 Sep 2025 20:47:46 +0200 Subject: [PATCH 08/11] implement lookahead to enable numbers in the middle of symbols --- src/tokenizer.cpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp index 544e117..535a43a 100644 --- a/src/tokenizer.cpp +++ b/src/tokenizer.cpp @@ -29,9 +29,15 @@ class SymbolicUnits { return std::isdigit(c) || c == '.' || c == 'e' || c == '-'; } - bool is_name_char(const char& c) { - return !std::isdigit(c) && !is_multiplicative(c) && c != '/' && + bool is_symbol_char(const char& c) { + if (!std::isdigit(c)) return !is_multiplicative(c) && c != '/' && c != '(' && c != ')' && c != '^' && c != '-' && c != '+'; + + auto lookahead = it; + for (; lookahead != x.end() && std::isdigit(*lookahead); ++lookahead); + if (lookahead != x.end() && is_symbol_char(*lookahead)) + return true; + return false; } /* getters -----------------------------------------------------------------*/ @@ -92,9 +98,9 @@ class SymbolicUnits { return SymbolicUnits(x, start, it); } - SymbolicUnits get_name() { + SymbolicUnits get_symbol() { auto start = (*it == '`') ? ++it : it++; - for (; it != x.end() && is_name_char(*it); ++it); + for (; it != x.end() && is_symbol_char(*it); ++it); return SymbolicUnits(x, start, it - (*(it-1) == '`' ? 1 : 0)); } @@ -128,7 +134,7 @@ class SymbolicUnits { // handle parenthesis, digits, and symbols/names auto token = (c == '(') ? get_paren() : - (std::isdigit(c) ? get_number() : get_name()); + (std::isdigit(c) ? get_number() : get_symbol()); combine(token, get_exponent()); // switch to numerator if needed @@ -163,4 +169,6 @@ parse_unit("ml / min / 1.73 / m^2") parse_unit("ml/min/1.73m^2") parse_unit("ml/min/1.73/m^2") parse_unit("ml/min/(1.73m^2)") +parse_unit("inH2O") +parse_unit("inH2O2") */ From 8e297613e9d54511efd8bbc440a58a55a84c9460 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?I=C3=B1aki=20=C3=9Acar?= Date: Sun, 28 Sep 2025 21:35:47 +0200 Subject: [PATCH 09/11] try to convert only if parseable --- R/make_units.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/make_units.R b/R/make_units.R index 38c4cac..3133847 100644 --- a/R/make_units.R +++ b/R/make_units.R @@ -305,7 +305,7 @@ as_units.character <- function(x, ..., if (units_options("auto_convert_names_to_symbols")) { name_to_symbol <- function(chr) - if (length(sym <- ud_get_symbol(chr))) sym else chr + if (ud_is_parseable(chr) && length(sym <- ud_get_symbol(chr))) sym else chr su$numerator <- vapply(su$numerator, name_to_symbol, character(1), USE.NAMES=FALSE) su$denominator <- vapply(su$denominator, name_to_symbol, character(1), USE.NAMES=FALSE) } From c1c7fbad9d8cfece8dfe61de84812a966b9d4797 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?I=C3=B1aki=20=C3=9Acar?= Date: Mon, 29 Sep 2025 12:26:29 +0200 Subject: [PATCH 10/11] bump version, update NEWS and revdep checks --- DESCRIPTION | 2 +- NEWS.md | 9 ++- revdep/README.md | 22 +++--- revdep/cran.md | 13 +--- revdep/problems.md | 181 ++++++++------------------------------------- 5 files changed, 52 insertions(+), 175 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index fb139f9..0e449b0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: units -Version: 0.8-7.3 +Version: 0.8-7.4 Title: Measurement Units for R Vectors Authors@R: c(person("Edzer", "Pebesma", role = c("aut", "cre"), email = "edzer.pebesma@uni-muenster.de", comment = c(ORCID = "0000-0001-8049-7069")), person("Thomas", "Mailund", role = "aut", email = "mailund@birc.au.dk"), diff --git a/NEWS.md b/NEWS.md index a786c1c..682fbf2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,11 @@ -# version devel +# version 1.0-0 + +* Breaking change: a new tokenizer fixes longstanding issues with parsing + complex unit expressions, but may break existing code that relied on the + previous (buggy) behavior. The major change is that now numbers are + consistently treated as prefixes, so that units like `ml / min / 1.73m^2` + used in physiology are now correctly parsed as `ml / (min * 1.73 * m^2)`. + See `?as_units` for details; #416 addressing #221, #383 * Vectorize `ud_*()` helpers; #405 addressing #404 diff --git a/revdep/README.md b/revdep/README.md index 302f924..e6f041a 100644 --- a/revdep/README.md +++ b/revdep/README.md @@ -2,7 +2,7 @@ |field |value | |:--------|:----------------------------| -|version |R version 4.4.2 (2024-10-31) | +|version |R version 4.4.3 (2025-02-28) | |os |Ubuntu 22.04.5 LTS | |system |x86_64, linux-gnu | |ui |X11 | @@ -10,22 +10,22 @@ |collate |en_US.UTF-8 | |ctype |en_US.UTF-8 | |tz |UTC | -|date |2025-02-14 | +|date |2025-09-29 | |pandoc |2.9.2.1 @ /usr/bin/pandoc | +|quarto |NA | # Dependencies -|package |old |new |Δ | -|:-------|:-----|:-----|:--| -|units |0.8-5 |0.8-6 |* | +|package |old |new |Δ | +|:-------|:-----|:-------|:--| +|units |0.8-7 |0.8-7.3 |* | +|Rcpp |NA |1.1.0 |* | # Revdeps -## New problems (3) +## New problems (1) -|package |version |error |warning |note | -|:---------|:-------|:------|:-------|:----| -|[divvy](problems.md#divvy)|1.0.0 |__+3__ | | | -|[Rdistance](problems.md#rdistance)|3.0.0 |__+1__ | |1 | -|[vein](problems.md#vein)|1.1.3 |__+1__ | |1 | +|package |version |error |warning |note | +|:--------|:-------|:------|:-------|:----| +|[epocakir](problems.md#epocakir)|0.9.9 |__+2__ | | | diff --git a/revdep/cran.md b/revdep/cran.md index 65b0b56..8210f60 100644 --- a/revdep/cran.md +++ b/revdep/cran.md @@ -1,8 +1,8 @@ ## revdepcheck results -We checked 123 reverse dependencies, comparing R CMD check results across CRAN and dev versions of this package. +We checked 137 reverse dependencies, comparing R CMD check results across CRAN and dev versions of this package. - * We saw 3 new problems + * We saw 1 new problems * We failed to check 0 packages Issues with CRAN packages are summarised below. @@ -10,14 +10,7 @@ Issues with CRAN packages are summarised below. ### New problems (This reports the first line of each new failure) -* divvy +* epocakir checking examples ... ERROR checking tests ... - checking running R code from vignettes ... - -* Rdistance - checking examples ... ERROR - -* vein - checking tests ... diff --git a/revdep/problems.md b/revdep/problems.md index 00270ed..0b3770b 100644 --- a/revdep/problems.md +++ b/revdep/problems.md @@ -1,14 +1,14 @@ -# divvy +# epocakir
-* Version: 1.0.0 -* GitHub: https://github.com/GawainAntell/divvy -* Source code: https://github.com/cran/divvy -* Date/Publication: 2023-10-26 08:20:03 UTC -* Number of recursive dependencies: 91 +* Version: 0.9.9 +* GitHub: https://github.com/alwinw/epocakir +* Source code: https://github.com/cran/epocakir +* Date/Publication: 2023-01-06 15:30:06 UTC +* Number of recursive dependencies: 78 -Run `revdepcheck::revdep_details(, "divvy")` for more info +Run `revdepcheck::revdep_details(, "epocakir")` for more info
@@ -16,26 +16,22 @@ Run `revdepcheck::revdep_details(, "divvy")` for more info * checking examples ... ERROR ``` - Running examples in ‘divvy-Ex.R’ failed + Running examples in ‘epocakir-Ex.R’ failed The error most likely occurred in: - > ### Name: rangeSize - > ### Title: Calculate common metrics of spatial distribution - > ### Aliases: rangeSize + > ### Name: GFR_staging + > ### Title: GFR Staging + > ### Aliases: GFR_staging GFR_staging.data.frame GFR_staging.units + > ### GFR_staging.numeric > > ### ** Examples > - > # generate 20 occurrences for a pseudo-species - ... - > set.seed(2) - > x <- rnorm(20, 110.5885, 2) - > y <- rnorm(20, 44.4280, 1) - > pts <- cbind(x,y) + > df <- tibble::tibble( + + eGFR = units::set_units(c(-1, NA, 100, 70, 50, 35, 20, 10), "mL/min/1.73m2") + + ) > - > rangeSize(pts) - Error in UseMethod("units") : - no applicable method for 'units' applied to an object of class "c('integer', 'numeric')" - Calls: rangeSize -> cbind -> cbind -> units + > GFR_staging(df, "eGFR") + Error: cannot convert mL/min/1.73m^2 into mL/min Execution halted ``` @@ -45,139 +41,20 @@ Run `revdepcheck::revdep_details(, "divvy")` for more info ERROR Running the tests in ‘tests/testthat.R’ failed. Last 13 lines of output: - Expected `rangeSize(bivAlt[, xyCell], crs = prj)` to run without any conditions. - i Actually got a with text: - no applicable method for 'units' applied to an object of class "c('integer', 'numeric')" - ── Error ('test_calc_eco_variables.R:43:3'): rangeSize and sdSumry accept projected coords ── - Error in `UseMethod("units")`: no applicable method for 'units' applied to an object of class "c('integer', 'numeric')" - Backtrace: - ▆ - 1. └─divvy::rangeSize(bivAlt[, xyCell], crs = prj) at test_calc_eco_variables.R:43:3 - 2. └─base::cbind(...) - 3. └─units (local) cbind(deparse.level, ...) - 4. └─base::units(dots[[1]]) + ▆ + 1. ├─epocakir::GFR_staging(df, "eGFR") at test-ckd.R:378:3 + 2. └─epocakir:::GFR_staging.data.frame(df, "eGFR") + 3. ├─epocakir::GFR_staging(.data[[rlang::as_name(rlang::enquo(GFR))]]) + 4. └─epocakir:::GFR_staging.units(.data[[rlang::as_name(rlang::enquo(GFR))]]) + 5. ├─epocakir::GFR_staging(as_metric(GFR = GFR, value_only = TRUE)) + 6. └─epocakir::as_metric(GFR = GFR, value_only = TRUE) + 7. ├─units::set_units(meas, conversion$metric_units, mode = "standard") + 8. └─units:::set_units.units(meas, conversion$metric_units, mode = "standard") + 9. ├─base::`units<-`(`*tmp*`, value = as_units(value, ...)) + 10. └─units:::`units<-.units`(`*tmp*`, value = as_units(value, ...)) - [ FAIL 4 | WARN 0 | SKIP 0 | PASS 22 ] + [ FAIL 1 | WARN 9 | SKIP 0 | PASS 598 ] Error: Test failures Execution halted ``` -* checking running R code from vignettes ... - ``` - ‘habitat-rangesize-case-study.Rmd’ using ‘UTF-8’... OK - ‘subsampling-vignette.Rmd’ using ‘UTF-8’... failed - ERROR - Errors in running code in vignettes: - when running code in ‘subsampling-vignette.Rmd’ - ... - > unique(names(bandLocs)) - [1] "[-50,-30)" "[-10,10)" "[10,30)" "[30,50)" "[50,70)" - - > unsamp <- sdSumry(dat = bivalves, taxVar = "genus", - + collections = "collection_no", xy = xyCell, quotaQ = 0.8, - + quotaN = 100, omitDom = .... [TRUNCATED] - - When sourcing ‘subsampling-vignette.R’: - Error: no applicable method for 'units' applied to an object of class "c('integer', 'numeric')" - Execution halted - ``` - -# Rdistance - -
- -* Version: 3.0.0 -* GitHub: https://github.com/tmcd82070/Rdistance -* Source code: https://github.com/cran/Rdistance -* Date/Publication: 2023-06-13 07:30:05 UTC -* Number of recursive dependencies: 95 - -Run `revdepcheck::revdep_details(, "Rdistance")` for more info - -
- -## Newly broken - -* checking examples ... ERROR - ``` - Running examples in ‘Rdistance-Ex.R’ failed - The error most likely occurred in: - - > ### Name: plot.dfunc - > ### Title: plot.dfunc - Plot method for distance (detection) functions - > ### Aliases: plot.dfunc - > ### Keywords: models - > - > ### ** Examples - > - ... - + , col="wheat" - + , density=30 - + , angle=c(-45,0,45) - + , cex.axis=1.5 - + , cex.lab=2 - + , ylab="Probability") - Error in UseMethod("units") : - no applicable method for 'units' applied to an object of class "logical" - Calls: plot ... barplot.default -> xyrect -> rect -> rbind -> rbind -> units - Execution halted - ``` - -## In both - -* checking Rd files ... NOTE - ``` - checkRd: (-1) halfnorm.like.Rd:105: Lost braces; missing escapes or markup? - 105 | {f(x|a,b,c_1,c_2,...,c_k) = f(x|a,b)(1 + c(1) h_i1(x) + c(2) h_i2(x) + ... + c(k) h_ik(x)). } - | ^ - checkRd: (-1) secondDeriv.Rd:15: Lost braces - 15 | This must be a function of the form FUN <- function(x, ...){...} - | ^ - ``` - -# vein - -
- -* Version: 1.1.3 -* GitHub: https://github.com/atmoschem/vein -* Source code: https://github.com/cran/vein -* Date/Publication: 2024-05-01 13:50:02 UTC -* Number of recursive dependencies: 56 - -Run `revdepcheck::revdep_details(, "vein")` for more info - -
- -## Newly broken - -* checking tests ... - ``` - Running ‘testthat.R’ - ERROR - Running the tests in ‘tests/testthat.R’ failed. - Last 13 lines of output: - Weighted mean = 1.5 - [ FAIL 1 | WARN 0 | SKIP 0 | PASS 701 ] - - ══ Failed tests ════════════════════════════════════════════════════════════════ - ── Failure ('test-long_to_wide.R:9:3'): long_to_wide works ───────────────────── - long_to_wide(df)$CO[1] not equal to 1. - Attributes: < Modes: list, NULL > - Attributes: < Lengths: 2, 0 > - Attributes: < names for target but not for current > - Attributes: < current is not list-like > - target is units, current is numeric - - [ FAIL 1 | WARN 0 | SKIP 0 | PASS 701 ] - Error: Test failures - Execution halted - ``` - -## In both - -* checking data for non-ASCII characters ... NOTE - ``` - Note: found 49 marked UTF-8 strings - ``` - From 455450acb3d183f37b1ed6c9a956d9ae2bc18834 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?I=C3=B1aki=20=C3=9Acar?= Date: Mon, 29 Sep 2025 13:39:45 +0200 Subject: [PATCH 11/11] more tests --- tests/testthat/test_unit_creation.R | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/testthat/test_unit_creation.R b/tests/testthat/test_unit_creation.R index 4c2128a..c3702f0 100644 --- a/tests/testthat/test_unit_creation.R +++ b/tests/testthat/test_unit_creation.R @@ -128,6 +128,8 @@ test_that("set_units default enforces NSE", { expect_symbolic <- function(u, n, d) expect_equal(units(as_units(u)), units:::.symbolic_units(n, d)) +expect_symbolic_nocheck <- function(u, n, d) + expect_equal(units(as_units(u, check_is_valid=FALSE)), units:::.symbolic_units(n, d)) test_that("exotic units work", { # check what udunits support @@ -138,6 +140,7 @@ test_that("exotic units work", { #expect_symbolic("2.2.m.s", c("2.2", "m", "s"), character(0)) expect_symbolic("m2/s", c("m", "m"), "s") + expect_symbolic("m20/s", rep("m", 20), "s") expect_symbolic("m^2/s", c("m", "m"), "s") expect_symbolic("m 2/s", c("m", "2"), "s") expect_symbolic("m-2/s", character(0), c("m", "m", "s")) @@ -154,7 +157,14 @@ test_that("exotic units work", { expect_symbolic("ml/min/1.73m-2", c("ml", "m", "m"), c("min", "1.73")) old <- unlist(units_options(strict_tokenizer=TRUE)) - on.exit(units_options(strict_tokenizer=old)) expect_symbolic("ml/min/1.73m^2", c("ml", "m", "m"), c("min", "1.73")) expect_symbolic("ml/min/1.73m-2", "ml", c("min", "1.73", "m", "m")) + units_options(strict_tokenizer=old) + + expect_symbolic_nocheck("inH2O", "inH2O", character(0)) + expect_symbolic_nocheck("inH2O2", c("inH2O", "inH2O"), character(0)) + + expect_symbolic("m/(g/(s/L))", c("m", "s"), c("g", "L")) + expect_error(expect_warning(as_units("m/(g/s"))) + expect_error(expect_warning(as_units("m^m"))) })