From 9ee881b54adf1be7ad197e9cdb14f9eecd30db48 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?I=C3=B1aki=20=C3=9Acar?= <iucar@fedoraproject.org>
Date: Sun, 28 Sep 2025 14:49:02 +0200
Subject: [PATCH 01/11] implement new tokenizer

---
 R/RcppExports.R     |   4 ++
 src/RcppExports.cpp |  12 ++++
 src/tokenizer.cpp   | 157 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 173 insertions(+)
 create mode 100644 src/tokenizer.cpp
diff --git a/R/RcppExports.R b/R/RcppExports.R
index 0254605..a07eb57 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -1,6 +1,10 @@
 # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
+parse_unit <- function(x) {
+    .Call('_units_parse_unit', PACKAGE = 'units', x)
+}
+
 ud_exit <- function() {
     invisible(.Call('_units_ud_exit', PACKAGE = 'units'))
 }
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index 1a74d25..6f3c672 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -10,6 +10,17 @@ Rcpp::Rostream<true>&  Rcpp::Rcout = Rcpp::Rcpp_cout_get();
 Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
 #endif
 
+// parse_unit
+SEXP parse_unit(std::string_view x);
+RcppExport SEXP _units_parse_unit(SEXP xSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< std::string_view >::type x(xSEXP);
+    rcpp_result_gen = Rcpp::wrap(parse_unit(x));
+    return rcpp_result_gen;
+END_RCPP
+}
 // ud_exit
 void ud_exit();
 RcppExport SEXP _units_ud_exit() {
@@ -269,6 +280,7 @@ END_RCPP
 }
 
 static const R_CallMethodDef CallEntries[] = {
+    {"_units_parse_unit", (DL_FUNC) &_units_parse_unit, 1},
     {"_units_ud_exit", (DL_FUNC) &_units_ud_exit, 0},
     {"_units_ud_init", (DL_FUNC) &_units_ud_init, 1},
     {"_units_ud_set_encoding", (DL_FUNC) &_units_ud_set_encoding, 1},
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
new file mode 100644
index 0000000..db5ddbc
--- /dev/null
+++ b/src/tokenizer.cpp
@@ -0,0 +1,157 @@
+#include <RcppCommon.h>
+using namespace Rcpp;
+
+class SymbolicUnits {
+public:
+  SymbolicUnits(std::string_view x) : x(x), it(x.begin()) { tokenize(); }
+  SymbolicUnits(std::string_view x, std::string_view::iterator start, std::string_view::iterator end)
+    : x(x.substr(start - x.begin(), end - start)), it(this->x.end()) {
+    numerator.push_back(std::string(this->x));
+  }
+  operator SEXP();
+
+private:
+  std::string_view x;
+  std::string_view::iterator it;
+  std::vector<std::string> numerator, denominator;
+  bool in_denominator = false;
+
+  /* helpers -----------------------------------------------------------------*/
+
+  bool is_multiplicative(const char& c) {
+    return c == '*' || c == '.' || c == ' ';
+  }
+
+  bool is_number_char(const char& c) {
+    return std::isdigit(c) || c == '.' || c == 'e' || c == '-';
+  }
+
+  bool is_name_char(const char& c) {
+    return !std::isdigit(c) && !is_multiplicative(c) && c != '/' &&
+      c != '(' && c != ')' && c != '^' && c != '-' && c != '+';
+  }
+
+  /* getters -----------------------------------------------------------------*/
+
+  SymbolicUnits get_paren() {
+    int level = 1;
+    auto start = ++it;
+
+    for (; it != x.end(); ++it) {
+      const char& c = *it;
+      if (c == '(')
+        ++level;
+      else if (c == ')' && --level == 0)
+        break;
+    }
+
+    if (level != 0)
+      stop("unmatched parenthesis");
+
+    return SymbolicUnits(x.substr(start - x.begin(), it++ - start));
+  }
+
+  int get_exponent() {
+    int exponent = 1;
+    if (it == x.end()) return exponent;
+
+    bool is_negative = false;
+    bool is_exponent = false;
+    if (*it == '^') {
+      is_exponent = true;
+      ++it;
+    }
+    switch (*it) {
+    case '-':
+      is_negative = true;
+    case '+':
+      is_exponent = true;
+      ++it;
+    }
+    bool is_digit = it != x.end() && std::isdigit(*it);
+
+    if (!is_digit && is_exponent)
+      stop("invalid exponent");
+    else if (is_digit) {
+      exponent = *(it++) - '0';
+      for (; it != x.end() && isdigit(*it); ++it)
+        exponent = exponent * 10 + (*it - '0');
+      if (is_negative) exponent = -exponent;
+    }
+
+    return exponent;
+  }
+
+  SymbolicUnits get_number() {
+    auto start = it++;
+    for (; it != x.end() && is_number_char(*it); ++it);
+    return SymbolicUnits(x, start, it);
+  }
+
+  SymbolicUnits get_name() {
+    auto start = (*it == '`') ? ++it : it++;
+    for (; it != x.end() && is_name_char(*it); ++it);
+    return SymbolicUnits(x, start, it - (*(it-1) == '`' ? 1 : 0));
+  }
+
+  /* main logic --------------------------------------------------------------*/
+
+  void combine(SymbolicUnits& o, int exponent = 1) {
+    bool invert = in_denominator != (exponent < 0);
+    auto& target_num = invert ? denominator : numerator;
+    auto& target_den = invert ? numerator : denominator;
+
+    for (exponent = std::abs(exponent); exponent > 0; --exponent) {
+      target_num.insert(target_num.end(), o.numerator.begin(), o.numerator.end());
+      target_den.insert(target_den.end(), o.denominator.begin(), o.denominator.end());
+    }
+  }
+
+  void tokenize() {
+    while (it != x.end()) {
+      const char& c = *it;
+
+      // handle separators
+      if (is_multiplicative(c)) {
+        ++it;
+        continue;
+      }
+      if (c == '/') {
+        in_denominator = !in_denominator;
+        ++it;
+        continue;
+      }
+
+      // handle parenthesis, digits, and symbols/names
+      auto token = (c == '(') ? get_paren() :
+        (std::isdigit(c) ? get_number() : get_name());
+      combine(token, get_exponent());
+      in_denominator = false;
+    }
+  }
+};
+
+#include <Rcpp.h>
+
+SymbolicUnits::operator SEXP() {
+  List unit = List::create(
+    _["numerator"]   = numerator,
+    _["denominator"] = denominator
+  );
+  unit.attr("class") = "symbolic_units";
+  return unit;
+};
+
+// [[Rcpp::export]]
+SEXP parse_unit(std::string_view x) { return SymbolicUnits(x); }
+
+/*** R
+parse_unit("m")
+parse_unit("m2")
+parse_unit("m2 / g^3")
+parse_unit("m2 / g-1")
+parse_unit("ml / min / 1.73 / m^2")
+parse_unit("ml/min/1.73m^2")
+parse_unit("ml/min/1.73/m^2")
+parse_unit("ml/min/(1.73m^2)")
+*/

From f1fb41eb0d73aa40f3285755042af0d3f1047fb3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?I=C3=B1aki=20=C3=9Acar?= <iucar@fedoraproject.org>
Date: Sun, 28 Sep 2025 14:50:43 +0200
Subject: [PATCH 02/11] reimplement as_units based on new tokenizer

---
 R/conversion.R |   3 +-
 R/make_units.R | 250 ++++++++++++++++---------------------------------
 man/units.Rd   |  64 ++++---------
 3 files changed, 100 insertions(+), 217 deletions(-)

diff --git a/R/conversion.R b/R/conversion.R
index f924b53..436ab20 100644
--- a/R/conversion.R
+++ b/R/conversion.R
@@ -233,9 +233,10 @@ set_units.numeric <- function(x, value, ..., mode = units_options("set_units_mod
     value <- unitless
   else if (mode == "symbols") {
     value <- substitute(value)
-
     if(is.numeric(value) && !identical(value, 1) && !identical(value, 1L))
       stop("The only valid number defining a unit is '1', signifying a unitless unit")
+    if (is.name(value) || is.call(value))
+      value <- format(value)
   }
 
   units(x) <- as_units(value, ...)
diff --git a/R/make_units.R b/R/make_units.R
index 5301322..948eefa 100644
--- a/R/make_units.R
+++ b/R/make_units.R
@@ -121,7 +121,7 @@
 #' # or
 #' drop_units(y)
 make_units <- function(bare_expression, check_is_valid = TRUE) {
-  as_units.call(substitute(bare_expression), check_is_valid = check_is_valid)
+  as_units(format(substitute(bare_expression)), check_is_valid = check_is_valid)
 }
 
 #' @name units
@@ -189,63 +189,59 @@ as_units.difftime <- function(x, value, ...) {
 
 #  ----- as_units.character helpers ------
 
-backtick <- function(x) {
-  # backtick all character runs uninterupted by one of ^()*^/`- or a space
-  # don't double up backticks
-  x <- gsub("`?([^() \\*^/`-]+)`?", "`\\1`", x)
-  gsub("`([0-9]*\\.?[0-9]+)`", "\\1", x) # unbacktick bare numbers
+is_udunits_time <- function(s) {
+  ud_is_parseable(s) && ud_are_convertible(s, "seconds since 1970-01-01")
 }
 
-are_exponents_implicit <- function(s) {
-  s <- trimws(s)
-  has <- function(chr, regex = FALSE)
-    grepl(chr, s, fixed = !regex, perl = regex)
-  !has("^") && !has("*") && !has("/") && has("\\s|\\D.*\\d$", regex = TRUE)
+# from package:yasp, paste collapse with serial (oxford) comma
+pc_and <- function(..., sep = "") {
+  x <- paste(..., sep = sep, collapse = NULL)
+  lx <- length(x)
+  if(lx == 0L)
+    ""
+  else if (lx == 1L)
+    x
+  else if (lx == 2L)
+    paste0(x, collapse = " and ")
+  else
+    paste0( paste0(x[-lx], collapse = ", "), ", and ", x[lx])
 }
 
-is_udunits_time <- function(s) {
-  ud_is_parseable(s) && ud_are_convertible(s, "seconds since 1970-01-01")
+.msg_units_not_recognized <- function(unrecognized_symbols, full_expr) {
+
+  if (is.language(full_expr))
+    full_expr <- deparse(full_expr)
+
+  is_are <- if (length(unrecognized_symbols) > 1L) "are" else "is"
+
+  paste0("In ", sQuote(full_expr), ", ",
+         pc_and(sQuote(unrecognized_symbols)), " ", is_are, " not recognized by udunits.\n\n",
+         "See a table of valid unit symbols and names with valid_udunits().\n",
+         "Custom user-defined units can be added with install_unit().\n\n",
+         "See a table of valid unit prefixes with valid_udunits_prefixes().\n",
+         "Prefixes will automatically work with any user-defined unit.")
 }
 
 #' @name units
 #' @export
 #'
+#' @param check_is_valid throw an error if all the unit symbols are not either
+#'   recognized by udunits2, or a custom
+#'   user defined via \code{install_unit()}. If \code{FALSE}, no check
+#'   for validity is performed.
+#'
 #' @param force_single_symbol Whether to perform no string parsing and force
 #'   treatment of the string as a single symbol.
 #'
-#' @param implicit_exponents If the unit string is in product power form (e.g.
-#'   \code{"km m-2 s-1"}). Defaults to \code{NULL}, in which case a guess is made
-#'   based on the supplied string. Set to \code{TRUE} or \code{FALSE} if the guess is
-#'   incorrect.
-#'
 #' @section Character strings:
 #'
 #'   Generally speaking, there are 3 types of unit strings are accepted in
 #'   \code{as_units} (and by extension, \code{`units<-`}).
 #'
-#'   The first, and likely most common, is a "standard" format unit
+#'   The first type, and likely most common, is a "standard" format unit
 #'   specification where the relationship between unit symbols or names is
 #'   specified explicitly with arithmetic symbols for division \code{/},
-#'   multiplication \code{*} and power exponents \code{^}, or other mathematical
-#'   functions like \code{log()}. In this case, the string is parsed as an R
-#'   expression via \code{parse(text = )} after backticking all unit symbols and
-#'   names, and then passed on to \code{as_units.call()}. A heuristic is used to
-#'   perform backticking, such that any continuous set of characters
-#'   uninterrupted by one of \code{()\\*^-} are backticked (unless the character
-#'   sequence consists solely of numbers \code{0-9}), with some care to not
-#'   double up on pre-existing backticks. This heuristic appears to be quite
-#'   robust, and works for units would otherwise not be valid R syntax. For
-#'   example, percent (\code{"\%"}), feet (\code{"'"}), inches (\code{"in"}),
-#'   and Tesla (\code{"T"}) are all backticked and parsed correctly.
-#'
-#'   Nevertheless, for certain complex unit expressions, this backticking heuristic
-#'   may give incorrect results.  If the string supplied fails to parse as an R
-#'   expression, then the string is treated as a single symbolic unit and
-#'   \code{symbolic_unit(chr)} is used as a fallback with a warning. In that
-#'   case, automatic unit simplification may not work properly when performing
-#'   operations on unit objects, but unit conversion and other Math operations
-#'   should still give correct results so long as the unit string supplied
-#'   returns \code{TRUE} for \code{ud_is_parsable()}.
+#'   multiplication \code{*} and power exponents \code{^}.
 #'
 #'   The second type of unit string accepted is one with implicit exponents. In
 #'   this format, \code{/}, \code{*}, and \code{^}, may not be present in the
@@ -253,11 +249,16 @@ is_udunits_time <- function(s) {
 #'   symbol may optionally be followed by a single number, specifying the power.
 #'   For example \code{"m2 s-2"} is equivalent to \code{"(m^2)*(s^-2)"}.
 #'
-#'   It must be noted that prepended numbers are supported too, but their
-#'   interpretation slightly varies depending on whether they are separated from
-#'   the unit string or not. E.g., \code{"1000 m"} is interpreted as magnitude
-#'   and unit, but \code{"1000m"} is interpreted as a prefixed unit, and it is
-#'   equivalent to \code{"km"} to all effects.
+#'   If the string supplied fails to parse, then the string is treated as a
+#'   single symbolic unit and \code{symbolic_unit(chr)} is used as a fallback
+#'   with a warning. In that case, automatic unit simplification may not work
+#'   properly when performing operations on unit objects, but unit conversion
+#'   and other Math operations should still give correct results so long as
+#'   the unit string supplied returns \code{TRUE} for \code{ud_is_parsable()}.
+#'
+#'   It must be noted that prepended numbers are supported too, but are not
+#'   treated as magnitudes. For example, \code{"1000 m"} is interpreted as
+#'   a prefixed unit, and it is equivalent to \code{"km"} to all effects.
 #'
 #'   The third type of unit string format accepted is the special case of
 #'   udunits time duration with a reference origin, for example \code{"hours
@@ -268,157 +269,69 @@ is_udunits_time <- function(s) {
 #'   otherwise encouraged to use \code{R}'s date and time functionality provided
 #'   by \code{Date} and \code{POSIXt} classes.
 #'
-as_units.character <- function(x,
+#' @note By default, unit names are automatically substituted with unit names
+#'   (e.g., kilogram --> kg). To turn off this behavior, set
+#'   \code{units_options(auto_convert_names_to_symbols = FALSE)}
+#'
+#' @seealso \code{\link{install_unit}}, \code{\link{valid_udunits}}
+as_units.character <- function(x, ...,
                                check_is_valid = TRUE,
-                               implicit_exponents = NULL,
-                               force_single_symbol = FALSE, ...) {
+                               force_single_symbol = FALSE) {
 
   stopifnot(is.character(x), length(x) == 1)
 
-  if (isTRUE(x == "")) return(unitless)
+  if (any(is.na(x)))
+    stop("a missing value for units is not allowed")
+
+  if (isTRUE(x == "" || x == "1"))
+    return(.as.units(1, unitless))
 
   if(force_single_symbol || is_udunits_time(x))
     return(symbolic_unit(x, check_is_valid = check_is_valid))
 
-  if(is.null(implicit_exponents))
-    implicit_exponents <- are_exponents_implicit(x)
-
-  if(implicit_exponents)
-    x <- convert_implicit_to_explicit_exponents(x)
-
-  x <- backtick(x)
-  o <- try(expr <- parse(text = x)[[1]], silent = TRUE)
-
+  o <- try(su <- parse_unit(x), silent = TRUE)
   if(inherits(o, "try-error")) {
-    warning("Could not parse expression: ", sQuote(x),          # nocov
-      ". Returning as a single symbolic unit()", call. = FALSE) # nocov
-    return(symbolic_unit(x, check_is_valid = check_is_valid))   # nocov
+    warning("Could not parse expression: ", sQuote(x),                # nocov
+            ". Returning as a single symbolic unit()", call. = FALSE) # nocov
+    return(symbolic_unit(x, check_is_valid = check_is_valid))         # nocov
   }
 
-  as_units.call(expr, check_is_valid = check_is_valid)
-}
-
-
-convert_implicit_to_explicit_exponents <- function(x) {
-  if (length(grep(c("[*/]"), x)) > 0)
-    stop("If 'implicit_exponents = TRUE', strings cannot contain `*' or `/'")
-  x <- gsub("\\b([^\\d-]+)([-]?\\d+)\\b", "\\1^(\\2)", x, perl =TRUE)
-  x <- gsub("\\s+", " * ", trimws(x), perl = TRUE)
-  x
-}
-
-#  ----- as_units.call helpers ------
-
-# from package:yasp, paste collapse with serial (oxford) comma
-pc_and <- function(..., sep = "") {
-  x <- paste(..., sep = sep, collapse = NULL)
-  lx <- length(x)
-  if(lx == 0L)
-    ""
-  else if (lx == 1L)
-    x
-  else if (lx == 2L)
-    paste0(x, collapse = " and ")
-  else
-    paste0( paste0(x[-lx], collapse = ", "), ", and ", x[lx])
-}
-
-#`%not_in%` <- function(x, table) match(x, table, nomatch = 0L) == 0L
-
-.msg_units_not_recognized <- function(unrecognized_symbols, full_expr) {
-
-  if (is.language(full_expr))
-    full_expr <- deparse(full_expr)
-
-  is_are <- if (length(unrecognized_symbols) > 1L) "are" else "is"
-
-  paste0("In ", sQuote(full_expr), ", ",
-    pc_and(sQuote(unrecognized_symbols)), " ", is_are, " not recognized by udunits.\n\n",
-    "See a table of valid unit symbols and names with valid_udunits().\n",
-    "Custom user-defined units can be added with install_unit().\n\n",
-    "See a table of valid unit prefixes with valid_udunits_prefixes().\n",
-    "Prefixes will automatically work with any user-defined unit.")
-}
-
-units_eval_env <- new.env(parent = baseenv())
-units_eval_env$ln <- function(x) base::log(x)
-units_eval_env$lg <- function(x) base::log(x, base = 10)
-units_eval_env$lb <- function(x) base::log(x, base = 2)
-
-
-#' @name units
-#' @export
-#'
-#' @param check_is_valid throw an error if all the unit symbols are not either
-#'   recognized by udunits2, or a custom
-#'   user defined via \code{install_unit()}. If \code{FALSE}, no check
-#'   for validity is performed.
-#'
-#' @note By default, unit names are automatically substituted with unit names
-#'   (e.g., kilogram --> kg). To turn off this behavior, set
-#'   \code{units_options(auto_convert_names_to_symbols = FALSE)}
-#'
-#' @section Expressions:
-#'
-#'   In \code{as_units()}, each of the symbols in the unit expression is treated
-#'   individually, such that each symbol must be recognized by the udunits
-#'   database, \emph{or} be a custom,
-#'   user-defined unit symbol that was defined by \code{install_unit()}. To
-#'   see which symbols and names are currently recognized by the udunits
-#'   database, see \code{valid_udunits()}.
-#'
-#' @seealso \code{\link{install_unit}}, \code{\link{valid_udunits}}
-as_units.call <- function(x, check_is_valid = TRUE, ...) {
-
-  if(missing(x) || identical(x, quote(expr =)) ||
-     identical(x, 1) || identical(x, 1L))
-    return(.as.units(1, unitless))
-
-  if (is.vector(x) && !is.expression(x) && any(is.na(x)))
-  	stop("a missing value for units is not allowed")
-
-  stopifnot(is.language(x))
-
-  vars <- all.vars(x)
-  if(!length(vars))
-    stop(call. = FALSE,
-"No symbols found. Please supply bare expressions with this approach.
-See ?as_units for usage examples.")
-
   if (check_is_valid) {
+    vars <- c(su$numerator, su$denominator)
     valid <- vapply(vars, ud_is_parseable, logical(1L))
     if (!all(valid))
       stop(.msg_units_not_recognized(vars[!valid], x), call. = FALSE)
   }
 
-  names(vars) <- vars
-  tmp_env <- lapply(vars, symbolic_unit, check_is_valid = FALSE)
-
-  if (dont_simplify_here <- is.na(.units.simplify())) {
-  	units_options(simplify = FALSE)
-  	on.exit(units_options(simplify = NA))
+  if (units_options("auto_convert_names_to_symbols")) {
+    name_to_symbol <- function(chr)
+      if (length(sym <- ud_get_symbol(chr))) sym else chr
+    su$numerator <- vapply(su$numerator, name_to_symbol, character(1), USE.NAMES=FALSE)
+    su$denominator <- vapply(su$denominator, name_to_symbol, character(1), USE.NAMES=FALSE)
   }
 
-  unit <- tryCatch( eval(x, tmp_env, units_eval_env),
-    error = function(e) stop( paste0( conditionMessage(e), "\n",
-          "Did you try to supply a value in a context where a bare expression was expected?"
-        ), call. = FALSE ))
-
-#  if(as.numeric(unit) %not_in% c(1, 0)) # 0 if log() used.
-#    stop(call. = FALSE,
-#"In ", sQuote(deparse(x)), " the numeric multiplier ", sQuote(as.numeric(unit)), " is invalid.
-#Use `install_unit()` to define a new unit that is a multiple of another unit.")
+  if (is.na(.units.simplify())) {
+    units_options(simplify = FALSE)
+    on.exit(units_options(simplify = NA))
+  }
+  .simplify_units(1, su)
+}
 
-  .as.units(as.numeric(unit), units(unit))
+#' @name units
+#' @export
+as_units.call <- function(x, ...) {
+  as_units(format(x), ...)
 }
 
 #' @name units
 #' @export
-as_units.expression <- as_units.call
+as_units.expression <- function(x, ...) {
+  as_units(as.character(x), ...)
+}
 
 #' @name units
 #' @export
-as_units.name       <- as_units.call
+as_units.name       <- as_units.expression
 
 #' @name units
 #' @export
@@ -442,7 +355,6 @@ as_units.Date = function(x, value, ...) {
 
 
 symbolic_unit <- function(chr, check_is_valid = TRUE) {
-
   stopifnot(is.character(chr), length(chr) == 1)
 
   if (check_is_valid && !ud_is_parseable(chr)) {
diff --git a/man/units.Rd b/man/units.Rd
index 394a65a..4277b58 100644
--- a/man/units.Rd
+++ b/man/units.Rd
@@ -46,14 +46,14 @@ as_units(x, ...)
 
 \method{as_units}{difftime}(x, value, ...)
 
-\method{as_units}{character}(x, check_is_valid = TRUE,
-  implicit_exponents = NULL, force_single_symbol = FALSE, ...)
+\method{as_units}{character}(x, ..., check_is_valid = TRUE,
+  force_single_symbol = FALSE)
 
-\method{as_units}{call}(x, check_is_valid = TRUE, ...)
+\method{as_units}{call}(x, ...)
 
-\method{as_units}{expression}(x, check_is_valid = TRUE, ...)
+\method{as_units}{expression}(x, ...)
 
-\method{as_units}{name}(x, check_is_valid = TRUE, ...)
+\method{as_units}{name}(x, ...)
 
 \method{as_units}{POSIXt}(x, value, ...)
 
@@ -80,11 +80,6 @@ recognized by udunits2, or a custom
 user defined via \code{install_unit()}. If \code{FALSE}, no check
 for validity is performed.}
 
-\item{implicit_exponents}{If the unit string is in product power form (e.g.
-\code{"km m-2 s-1"}). Defaults to \code{NULL}, in which case a guess is made
-based on the supplied string. Set to \code{TRUE} or \code{FALSE} if the guess is
-incorrect.}
-
 \item{force_single_symbol}{Whether to perform no string parsing and force
 treatment of the string as a single symbol.}
 }
@@ -128,29 +123,10 @@ By default, unit names are automatically substituted with unit names
   Generally speaking, there are 3 types of unit strings are accepted in
   \code{as_units} (and by extension, \code{`units<-`}).
 
-  The first, and likely most common, is a "standard" format unit
+  The first type, and likely most common, is a "standard" format unit
   specification where the relationship between unit symbols or names is
   specified explicitly with arithmetic symbols for division \code{/},
-  multiplication \code{*} and power exponents \code{^}, or other mathematical
-  functions like \code{log()}. In this case, the string is parsed as an R
-  expression via \code{parse(text = )} after backticking all unit symbols and
-  names, and then passed on to \code{as_units.call()}. A heuristic is used to
-  perform backticking, such that any continuous set of characters
-  uninterrupted by one of \code{()\\*^-} are backticked (unless the character
-  sequence consists solely of numbers \code{0-9}), with some care to not
-  double up on pre-existing backticks. This heuristic appears to be quite
-  robust, and works for units would otherwise not be valid R syntax. For
-  example, percent (\code{"\%"}), feet (\code{"'"}), inches (\code{"in"}),
-  and Tesla (\code{"T"}) are all backticked and parsed correctly.
-
-  Nevertheless, for certain complex unit expressions, this backticking heuristic
-  may give incorrect results.  If the string supplied fails to parse as an R
-  expression, then the string is treated as a single symbolic unit and
-  \code{symbolic_unit(chr)} is used as a fallback with a warning. In that
-  case, automatic unit simplification may not work properly when performing
-  operations on unit objects, but unit conversion and other Math operations
-  should still give correct results so long as the unit string supplied
-  returns \code{TRUE} for \code{ud_is_parsable()}.
+  multiplication \code{*} and power exponents \code{^}.
 
   The second type of unit string accepted is one with implicit exponents. In
   this format, \code{/}, \code{*}, and \code{^}, may not be present in the
@@ -158,11 +134,16 @@ By default, unit names are automatically substituted with unit names
   symbol may optionally be followed by a single number, specifying the power.
   For example \code{"m2 s-2"} is equivalent to \code{"(m^2)*(s^-2)"}.
 
-  It must be noted that prepended numbers are supported too, but their
-  interpretation slightly varies depending on whether they are separated from
-  the unit string or not. E.g., \code{"1000 m"} is interpreted as magnitude
-  and unit, but \code{"1000m"} is interpreted as a prefixed unit, and it is
-  equivalent to \code{"km"} to all effects.
+  If the string supplied fails to parse, then the string is treated as a
+  single symbolic unit and \code{symbolic_unit(chr)} is used as a fallback
+  with a warning. In that case, automatic unit simplification may not work
+  properly when performing operations on unit objects, but unit conversion
+  and other Math operations should still give correct results so long as
+  the unit string supplied returns \code{TRUE} for \code{ud_is_parsable()}.
+
+  It must be noted that prepended numbers are supported too, but are not
+  treated as magnitudes. For example, \code{"1000 m"} is interpreted as
+  a prefixed unit, and it is equivalent to \code{"km"} to all effects.
 
   The third type of unit string format accepted is the special case of
   udunits time duration with a reference origin, for example \code{"hours
@@ -174,17 +155,6 @@ By default, unit names are automatically substituted with unit names
   by \code{Date} and \code{POSIXt} classes.
 }
 
-\section{Expressions}{
-
-
-  In \code{as_units()}, each of the symbols in the unit expression is treated
-  individually, such that each symbol must be recognized by the udunits
-  database, \emph{or} be a custom,
-  user-defined unit symbol that was defined by \code{install_unit()}. To
-  see which symbols and names are currently recognized by the udunits
-  database, see \code{valid_udunits()}.
-}
-
 \examples{
 x = 1:3
 class(x)

From 0ba30d2a6c3a240997e416273bf77850b532e603 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?I=C3=B1aki=20=C3=9Acar?= <iucar@fedoraproject.org>
Date: Sun, 28 Sep 2025 14:51:31 +0200
Subject: [PATCH 03/11] remove unsupported bits

---
 tests/testthat/test_unit_creation.R | 44 -----------------------------
 vignettes/units.Rmd                 |  1 -
 2 files changed, 45 deletions(-)

diff --git a/tests/testthat/test_unit_creation.R b/tests/testthat/test_unit_creation.R
index f6afe2e..c010eba 100644
--- a/tests/testthat/test_unit_creation.R
+++ b/tests/testthat/test_unit_creation.R
@@ -1,47 +1,3 @@
-
-test_that("parse_units() backticks strings correctly", {
-
-  x <- matrix(ncol = 2, byrow = TRUE, c(
-    "in",      "`in`",
-    "`in`",    "`in`",
-    "kelvin",  "`kelvin`",
-    "%",       "`%`",
-    "T",       "`T`",
-    "'/s",     "`'`/`s`",
-    "'",       "`'`" ,
-    '"',       '`"`' ,
-    '"/s' ,    '`"`/`s`',
-    "s/'" ,    "`s`/`'`"  ,
-    "C" ,      "`C`" ,
-    "F" ,      "`F`",
-    "\u00B0C", "`\u00B0C`",
-    "\u2103" , "`\u2103`",
-    "\u00B0F", "`\u00B0F`",
-    "\u2109",  "`\u2109`",
-    "log(ug)", "`log`(`ug`)",
-    "log(ug/l)", "`log`(`ug`/`l`)",
-    "kg*m/s^2", "`kg`*`m`/`s`^2"
-  ))
-  colnames(x) <- c("input", "expected_output")
-
-  expect_identical(units:::backtick(x[,"input"]), x[,"expected_output"])
-})
-
-test_that("explicit exponents identified correctly", {
-  expect_true( are_exponents_implicit("m s") )
-  expect_true( are_exponents_implicit("m2") )
-  expect_true( are_exponents_implicit("m-2") )
-  expect_true( are_exponents_implicit("2 m") )
-  expect_true( are_exponents_implicit("m s-2") )
-  expect_true( are_exponents_implicit("m s-2 kg") )
-  expect_true( are_exponents_implicit("2 m s") )
-
-  expect_false( are_exponents_implicit("m") )
-  expect_false( are_exponents_implicit("m/s") )
-  expect_false( are_exponents_implicit("m^2") )
-  expect_false( are_exponents_implicit("m*s") )
-})
-
 test_that("global options are respected", {
   # rt: round trip
   rt <- function(x) as.character(units(as_units(x)))
diff --git a/vignettes/units.Rmd b/vignettes/units.Rmd
index d1a7d86..3185679 100644
--- a/vignettes/units.Rmd
+++ b/vignettes/units.Rmd
@@ -133,7 +133,6 @@ sum(a)
 min(a)
 max(a)
 range(a)
-make_units(min(m/s, km/h)) # converts to first unit:
 ```
 
 ### Printing

From 2378f5213320a940447031a87ef09cbd409b4b9d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?I=C3=B1aki=20=C3=9Acar?= <iucar@fedoraproject.org>
Date: Sun, 28 Sep 2025 15:28:11 +0200
Subject: [PATCH 04/11] run full pillar test

---
 tests/testthat/_snaps/tidyverse.md | 11 +++++++++--
 tests/testthat/test_tidyverse.R    |  4 ++--
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/tests/testthat/_snaps/tidyverse.md b/tests/testthat/_snaps/tidyverse.md
index 4da8963..c779a2e 100644
--- a/tests/testthat/_snaps/tidyverse.md
+++ b/tests/testthat/_snaps/tidyverse.md
@@ -1,15 +1,22 @@
 # pillar methods are available for units objects
 
     Code
-      pillar::pillar(x[1])
+      pillar::pillar(x)
     Output
       <pillar>
       [km]
          1
+         2
+         3
     Code
-      pillar::pillar(m[1])
+      pillar::pillar(m)
     Output
       <pillar>
       <mixed_units>
              1 [km]
+             2 [km]
+             3 [km]
+              4 [g]
+              5 [g]
+              6 [g]
 
diff --git a/tests/testthat/test_tidyverse.R b/tests/testthat/test_tidyverse.R
index a5df966..742fb3f 100644
--- a/tests/testthat/test_tidyverse.R
+++ b/tests/testthat/test_tidyverse.R
@@ -9,8 +9,8 @@ test_that("pillar methods are available for units objects", {
   expect_equal(pillar::type_sum(m), "mixed_units")
 
   expect_snapshot({
-    pillar::pillar(x[1])
-    pillar::pillar(m[1])
+    pillar::pillar(x)
+    pillar::pillar(m)
   })
 })
 

From a85c574d63b714f54b898225ff50d2005f9548c4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?I=C3=B1aki=20=C3=9Acar?= <iucar@fedoraproject.org>
Date: Sun, 28 Sep 2025 16:02:57 +0200
Subject: [PATCH 05/11] add more tests

---
 tests/testthat/test_unit_creation.R | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/tests/testthat/test_unit_creation.R b/tests/testthat/test_unit_creation.R
index c010eba..66fe786 100644
--- a/tests/testthat/test_unit_creation.R
+++ b/tests/testthat/test_unit_creation.R
@@ -125,3 +125,31 @@ test_that("set_units default enforces NSE", {
   # is it bad if this works?
   # expect_error(set_units(1:3, "m/s"))
 })
+
+expect_symbolic <- function(u, n, d)
+  expect_equal(units(as_units(u)), units:::.symbolic_units(n, d))
+
+test_that("exotic units work", {
+  # check what udunits support
+  # units:::R_ut_format(units:::R_ut_parse(some_string))
+
+  expect_symbolic("2.2 m s", c("2.2", "m", "s"), character(0))
+  expect_symbolic("2.2*m*s", c("2.2", "m", "s"), character(0))
+  #expect_symbolic("2.2.m.s", c("2.2", "m", "s"), character(0))
+
+  expect_symbolic("m2/s", c("m", "m"), "s")
+  expect_symbolic("m^2/s", c("m", "m"), "s")
+  expect_symbolic("m 2/s", c("m", "2"), "s")
+  expect_symbolic("m-2/s", character(0), c("m", "m", "s"))
+  expect_symbolic("m^-2/s", character(0), c("m", "m", "s"))
+  expect_symbolic("m/s2", "m", c("s", "s"))
+  expect_symbolic("m/s^2", "m", c("s", "s"))
+  expect_symbolic("m/s 2", c("m", "2"), "s")
+  expect_symbolic("m/s-2", c("m", "s", "s"), character(0))
+  expect_symbolic("m/s^-2", c("m", "s", "s"), character(0))
+
+  expect_symbolic("ml/min/1.73m^2", c("ml", "m", "m"), c("min", "1.73"))
+  expect_symbolic("ml/min/(1.73m^2)", "ml", c("min", "1.73", "m", "m"))
+  expect_symbolic("ml/min/1.73/m^2", "ml", c("min", "1.73", "m", "m"))
+  expect_symbolic("ml/min/1.73m-2", "ml", c("min", "1.73", "m", "m"))
+})

From 29aa42201e9f2e00b393ab6e8b78ca1f1a04351b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?I=C3=B1aki=20=C3=9Acar?= <iucar@fedoraproject.org>
Date: Sun, 28 Sep 2025 18:58:26 +0200
Subject: [PATCH 06/11] treat numbers as prefixes

---
 R/RcppExports.R                     |  4 ++--
 R/make_units.R                      |  2 +-
 R/options.R                         | 12 ++++++++++--
 man/units_options.Rd                |  6 ++++--
 src/RcppExports.cpp                 |  9 +++++----
 src/tokenizer.cpp                   | 19 ++++++++++++++-----
 tests/testthat/test_unit_creation.R |  7 ++++++-
 7 files changed, 42 insertions(+), 17 deletions(-)

diff --git a/R/RcppExports.R b/R/RcppExports.R
index a07eb57..37f8965 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -1,8 +1,8 @@
 # Generated by using Rcpp::compileAttributes() -> do not edit by hand
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
-parse_unit <- function(x) {
-    .Call('_units_parse_unit', PACKAGE = 'units', x)
+parse_unit <- function(x, strict = FALSE) {
+    .Call('_units_parse_unit', PACKAGE = 'units', x, strict)
 }
 
 ud_exit <- function() {
diff --git a/R/make_units.R b/R/make_units.R
index 948eefa..38c4cac 100644
--- a/R/make_units.R
+++ b/R/make_units.R
@@ -289,7 +289,7 @@ as_units.character <- function(x, ...,
   if(force_single_symbol || is_udunits_time(x))
     return(symbolic_unit(x, check_is_valid = check_is_valid))
 
-  o <- try(su <- parse_unit(x), silent = TRUE)
+  o <- try(su <- parse_unit(x, units_options("strict_tokenizer")), silent=TRUE)
   if(inherits(o, "try-error")) {
     warning("Could not parse expression: ", sQuote(x),                # nocov
             ". Returning as a single symbolic unit()", call. = FALSE) # nocov
diff --git a/R/options.R b/R/options.R
index 5e6fa47..30ca52b 100644
--- a/R/options.R
+++ b/R/options.R
@@ -6,6 +6,7 @@
   negative_power = FALSE,
   parse = TRUE,
   set_units_mode = "symbols",
+  strict_tokenizer = FALSE,
   auto_convert_names_to_symbols = TRUE,
   simplify = NA,
   allow_mixed = FALSE,
@@ -29,6 +30,7 @@
 #' @param negative_power logical, default \code{FALSE}; should denominators have negative power, or follow a division symbol?
 #' @param parse logical, default \code{TRUE}; should the units be made into an expression (so we get subscripts)? Setting to \code{FALSE} may be useful if \link{parse} fails, e.g. if the unit contains symbols that assume a particular encoding
 #' @param set_units_mode character; either \code{"symbols"} or \code{"standard"}; see \link{set_units}; default is \code{"symbols"}
+#' @param strict_tokenizer logical, default \code{FALSE}; non-strict tokenization attaches constants to the following symbol.
 #' @param auto_convert_names_to_symbols logical, default \code{TRUE}: should names, such as \code{degree_C} be converted to their usual symbol?
 #' @param simplify logical, default \code{NA}; simplify units in expressions?
 #' @param allow_mixed logical; if \code{TRUE}, combining mixed units creates a \code{mixed_units} object, if \code{FALSE} it generates an error
@@ -45,8 +47,10 @@
 #' units_options(sep = c("~", "~"), group = c("[", "]"), negative_power = FALSE, parse = TRUE)
 #' units_options("group")
 #' @export
-units_options = function(..., sep, group, negative_power, parse, set_units_mode, auto_convert_names_to_symbols, simplify,
-		allow_mixed, unitless_symbol, define_bel) {
+units_options = function(..., sep, group, negative_power, parse, set_units_mode,
+                         strict_tokenizer, auto_convert_names_to_symbols, simplify,
+                         allow_mixed, unitless_symbol, define_bel)
+{
 	# op = as.list(units:::.units_options)
 	ret = list()
 	if (!missing(sep)) {
@@ -69,6 +73,10 @@ units_options = function(..., sep, group, negative_power, parse, set_units_mode,
 		stopifnot(is.character(set_units_mode) && length(set_units_mode) == 1)
 		ret$set_units_mode = .setopt(set_units_mode)
 	}
+	if (!missing(strict_tokenizer)) {
+		stopifnot(is.logical(strict_tokenizer))
+		ret$strict_tokenizer = .setopt(strict_tokenizer)
+	}
 	if (!missing(auto_convert_names_to_symbols)) {
 		stopifnot(is.logical(auto_convert_names_to_symbols))
 		ret$auto_convert_names_to_symbols = .setopt(auto_convert_names_to_symbols)
diff --git a/man/units_options.Rd b/man/units_options.Rd
index 0f869fc..5f737bf 100644
--- a/man/units_options.Rd
+++ b/man/units_options.Rd
@@ -5,8 +5,8 @@
 \title{set one or more units global options}
 \usage{
 units_options(..., sep, group, negative_power, parse, set_units_mode,
-  auto_convert_names_to_symbols, simplify, allow_mixed, unitless_symbol,
-  define_bel)
+  strict_tokenizer, auto_convert_names_to_symbols, simplify, allow_mixed,
+  unitless_symbol, define_bel)
 }
 \arguments{
 \item{...}{named options (character) for which the value is queried}
@@ -21,6 +21,8 @@ units_options(..., sep, group, negative_power, parse, set_units_mode,
 
 \item{set_units_mode}{character; either \code{"symbols"} or \code{"standard"}; see \link{set_units}; default is \code{"symbols"}}
 
+\item{strict_tokenizer}{logical, default \code{FALSE}; non-strict tokenization attaches constants to the following symbol.}
+
 \item{auto_convert_names_to_symbols}{logical, default \code{TRUE}: should names, such as \code{degree_C} be converted to their usual symbol?}
 
 \item{simplify}{logical, default \code{NA}; simplify units in expressions?}
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index 6f3c672..ad8adaa 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -11,13 +11,14 @@ Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
 #endif
 
 // parse_unit
-SEXP parse_unit(std::string_view x);
-RcppExport SEXP _units_parse_unit(SEXP xSEXP) {
+SEXP parse_unit(std::string_view x, bool strict);
+RcppExport SEXP _units_parse_unit(SEXP xSEXP, SEXP strictSEXP) {
 BEGIN_RCPP
     Rcpp::RObject rcpp_result_gen;
     Rcpp::RNGScope rcpp_rngScope_gen;
     Rcpp::traits::input_parameter< std::string_view >::type x(xSEXP);
-    rcpp_result_gen = Rcpp::wrap(parse_unit(x));
+    Rcpp::traits::input_parameter< bool >::type strict(strictSEXP);
+    rcpp_result_gen = Rcpp::wrap(parse_unit(x, strict));
     return rcpp_result_gen;
 END_RCPP
 }
@@ -280,7 +281,7 @@ END_RCPP
 }
 
 static const R_CallMethodDef CallEntries[] = {
-    {"_units_parse_unit", (DL_FUNC) &_units_parse_unit, 1},
+    {"_units_parse_unit", (DL_FUNC) &_units_parse_unit, 2},
     {"_units_ud_exit", (DL_FUNC) &_units_ud_exit, 0},
     {"_units_ud_init", (DL_FUNC) &_units_ud_init, 1},
     {"_units_ud_set_encoding", (DL_FUNC) &_units_ud_set_encoding, 1},
diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index db5ddbc..544e117 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -3,7 +3,8 @@ using namespace Rcpp;
 
 class SymbolicUnits {
 public:
-  SymbolicUnits(std::string_view x) : x(x), it(x.begin()) { tokenize(); }
+  SymbolicUnits(std::string_view x, bool strict=false)
+    : x(x), it(x.begin()), strict(strict) { tokenize(); }
   SymbolicUnits(std::string_view x, std::string_view::iterator start, std::string_view::iterator end)
     : x(x.substr(start - x.begin(), end - start)), it(this->x.end()) {
     numerator.push_back(std::string(this->x));
@@ -15,6 +16,8 @@ class SymbolicUnits {
   std::string_view::iterator it;
   std::vector<std::string> numerator, denominator;
   bool in_denominator = false;
+  bool stay = false;
+  bool strict = false;
 
   /* helpers -----------------------------------------------------------------*/
 
@@ -48,7 +51,7 @@ class SymbolicUnits {
     if (level != 0)
       stop("unmatched parenthesis");
 
-    return SymbolicUnits(x.substr(start - x.begin(), it++ - start));
+    return SymbolicUnits(x.substr(start - x.begin(), it++ - start), strict);
   }
 
   int get_exponent() {
@@ -85,6 +88,7 @@ class SymbolicUnits {
   SymbolicUnits get_number() {
     auto start = it++;
     for (; it != x.end() && is_number_char(*it); ++it);
+    stay = !strict;
     return SymbolicUnits(x, start, it);
   }
 
@@ -117,7 +121,7 @@ class SymbolicUnits {
         continue;
       }
       if (c == '/') {
-        in_denominator = !in_denominator;
+        in_denominator = true;
         ++it;
         continue;
       }
@@ -126,7 +130,10 @@ class SymbolicUnits {
       auto token = (c == '(') ? get_paren() :
         (std::isdigit(c) ? get_number() : get_name());
       combine(token, get_exponent());
-      in_denominator = false;
+
+      // switch to numerator if needed
+      if (!stay) in_denominator = false;
+      stay = false;
     }
   }
 };
@@ -143,7 +150,9 @@ SymbolicUnits::operator SEXP() {
 };
 
 // [[Rcpp::export]]
-SEXP parse_unit(std::string_view x) { return SymbolicUnits(x); }
+SEXP parse_unit(std::string_view x, bool strict=false) {
+  return SymbolicUnits(x, strict);
+}
 
 /*** R
 parse_unit("m")
diff --git a/tests/testthat/test_unit_creation.R b/tests/testthat/test_unit_creation.R
index 66fe786..4c2128a 100644
--- a/tests/testthat/test_unit_creation.R
+++ b/tests/testthat/test_unit_creation.R
@@ -148,8 +148,13 @@ test_that("exotic units work", {
   expect_symbolic("m/s-2", c("m", "s", "s"), character(0))
   expect_symbolic("m/s^-2", c("m", "s", "s"), character(0))
 
-  expect_symbolic("ml/min/1.73m^2", c("ml", "m", "m"), c("min", "1.73"))
   expect_symbolic("ml/min/(1.73m^2)", "ml", c("min", "1.73", "m", "m"))
   expect_symbolic("ml/min/1.73/m^2", "ml", c("min", "1.73", "m", "m"))
+  expect_symbolic("ml/min/1.73m^2", "ml", c("min", "1.73", "m", "m"))
+  expect_symbolic("ml/min/1.73m-2", c("ml", "m", "m"), c("min", "1.73"))
+
+  old <- unlist(units_options(strict_tokenizer=TRUE))
+  on.exit(units_options(strict_tokenizer=old))
+  expect_symbolic("ml/min/1.73m^2", c("ml", "m", "m"), c("min", "1.73"))
   expect_symbolic("ml/min/1.73m-2", "ml", c("min", "1.73", "m", "m"))
 })

From 8672f3a82762ef2e9a0e3aa2f108730e66c3009e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?I=C3=B1aki=20=C3=9Acar?= <iucar@fedoraproject.org>
Date: Sun, 28 Sep 2025 20:16:28 +0200
Subject: [PATCH 07/11] follow strict_tokenizer while formatting too

---
 R/symbolic_units.R | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/R/symbolic_units.R b/R/symbolic_units.R
index f185160..f519a8e 100644
--- a/R/symbolic_units.R
+++ b/R/symbolic_units.R
@@ -53,12 +53,16 @@ unitless <- .symbolic_units(vector("character"), vector("character"))
 
   for (i in seq_along(sym)) if (pwr[i] != 1)
     sym[i] <- paste(sym[i], pwr[i], sep = pwr_op)
-  paste0(sym, collapse = paste0(op, sep))
+
+  s <- matrix(c(sym, rep(paste0(op, sep), length(sym)-1), ""), nrow=length(sym))
+  if (!units_options("strict_tokenizer"))
+    s[sym >= "0" & sym <= "9", 2] <- ""
+  paste0(t(s), collapse="")
 }
 
 #' @export
 as.character.symbolic_units <- function(x, ...,
-		neg_power = get(".units.negative_power", envir = .units_options),
+		neg_power = units_options("negative_power"),
 		escape_units = FALSE, prod_sep = "*", plot_sep = "") {
   sep <- plot_sep
 

From b5a0a28fb1158658ca68d7522dd72f3c509d9076 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?I=C3=B1aki=20=C3=9Acar?= <iucar@fedoraproject.org>
Date: Sun, 28 Sep 2025 20:47:46 +0200
Subject: [PATCH 08/11] implement lookahead to enable numbers in the middle of
 symbols

---
 src/tokenizer.cpp | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/src/tokenizer.cpp b/src/tokenizer.cpp
index 544e117..535a43a 100644
--- a/src/tokenizer.cpp
+++ b/src/tokenizer.cpp
@@ -29,9 +29,15 @@ class SymbolicUnits {
     return std::isdigit(c) || c == '.' || c == 'e' || c == '-';
   }
 
-  bool is_name_char(const char& c) {
-    return !std::isdigit(c) && !is_multiplicative(c) && c != '/' &&
+  bool is_symbol_char(const char& c) {
+    if (!std::isdigit(c)) return !is_multiplicative(c) && c != '/' &&
       c != '(' && c != ')' && c != '^' && c != '-' && c != '+';
+
+    auto lookahead = it;
+    for (; lookahead != x.end() && std::isdigit(*lookahead); ++lookahead);
+    if (lookahead != x.end() && is_symbol_char(*lookahead))
+      return true;
+    return false;
   }
 
   /* getters -----------------------------------------------------------------*/
@@ -92,9 +98,9 @@ class SymbolicUnits {
     return SymbolicUnits(x, start, it);
   }
 
-  SymbolicUnits get_name() {
+  SymbolicUnits get_symbol() {
     auto start = (*it == '`') ? ++it : it++;
-    for (; it != x.end() && is_name_char(*it); ++it);
+    for (; it != x.end() && is_symbol_char(*it); ++it);
     return SymbolicUnits(x, start, it - (*(it-1) == '`' ? 1 : 0));
   }
 
@@ -128,7 +134,7 @@ class SymbolicUnits {
 
       // handle parenthesis, digits, and symbols/names
       auto token = (c == '(') ? get_paren() :
-        (std::isdigit(c) ? get_number() : get_name());
+        (std::isdigit(c) ? get_number() : get_symbol());
       combine(token, get_exponent());
 
       // switch to numerator if needed
@@ -163,4 +169,6 @@ parse_unit("ml / min / 1.73 / m^2")
 parse_unit("ml/min/1.73m^2")
 parse_unit("ml/min/1.73/m^2")
 parse_unit("ml/min/(1.73m^2)")
+parse_unit("inH2O")
+parse_unit("inH2O2")
 */

From 8e297613e9d54511efd8bbc440a58a55a84c9460 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?I=C3=B1aki=20=C3=9Acar?= <iucar@fedoraproject.org>
Date: Sun, 28 Sep 2025 21:35:47 +0200
Subject: [PATCH 09/11] try to convert only if parseable

---
 R/make_units.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/make_units.R b/R/make_units.R
index 38c4cac..3133847 100644
--- a/R/make_units.R
+++ b/R/make_units.R
@@ -305,7 +305,7 @@ as_units.character <- function(x, ...,
 
   if (units_options("auto_convert_names_to_symbols")) {
     name_to_symbol <- function(chr)
-      if (length(sym <- ud_get_symbol(chr))) sym else chr
+      if (ud_is_parseable(chr) && length(sym <- ud_get_symbol(chr))) sym else chr
     su$numerator <- vapply(su$numerator, name_to_symbol, character(1), USE.NAMES=FALSE)
     su$denominator <- vapply(su$denominator, name_to_symbol, character(1), USE.NAMES=FALSE)
   }

From c1c7fbad9d8cfece8dfe61de84812a966b9d4797 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?I=C3=B1aki=20=C3=9Acar?= <iucar@fedoraproject.org>
Date: Mon, 29 Sep 2025 12:26:29 +0200
Subject: [PATCH 10/11] bump version, update NEWS and revdep checks

---
 DESCRIPTION        |   2 +-
 NEWS.md            |   9 ++-
 revdep/README.md   |  22 +++---
 revdep/cran.md     |  13 +---
 revdep/problems.md | 181 ++++++++-------------------------------------
 5 files changed, 52 insertions(+), 175 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index fb139f9..0e449b0 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: units
-Version: 0.8-7.3
+Version: 0.8-7.4
 Title: Measurement Units for R Vectors
 Authors@R: c(person("Edzer", "Pebesma", role = c("aut", "cre"), email = "edzer.pebesma@uni-muenster.de", comment = c(ORCID = "0000-0001-8049-7069")),
     person("Thomas", "Mailund", role = "aut", email = "mailund@birc.au.dk"),
diff --git a/NEWS.md b/NEWS.md
index a786c1c..682fbf2 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,4 +1,11 @@
-# version devel
+# version 1.0-0
+
+* Breaking change: a new tokenizer fixes longstanding issues with parsing
+  complex unit expressions, but may break existing code that relied on the
+  previous (buggy) behavior. The major change is that now numbers are
+  consistently treated as prefixes, so that units like `ml / min / 1.73m^2`
+  used in physiology are now correctly parsed as `ml / (min * 1.73 * m^2)`.
+  See `?as_units` for details; #416 addressing #221, #383
 
 * Vectorize `ud_*()` helpers; #405 addressing #404
 
diff --git a/revdep/README.md b/revdep/README.md
index 302f924..e6f041a 100644
--- a/revdep/README.md
+++ b/revdep/README.md
@@ -2,7 +2,7 @@
 
 |field    |value                        |
 |:--------|:----------------------------|
-|version  |R version 4.4.2 (2024-10-31) |
+|version  |R version 4.4.3 (2025-02-28) |
 |os       |Ubuntu 22.04.5 LTS           |
 |system   |x86_64, linux-gnu            |
 |ui       |X11                          |
@@ -10,22 +10,22 @@
 |collate  |en_US.UTF-8                  |
 |ctype    |en_US.UTF-8                  |
 |tz       |UTC                          |
-|date     |2025-02-14                   |
+|date     |2025-09-29                   |
 |pandoc   |2.9.2.1 @ /usr/bin/pandoc    |
+|quarto   |NA                           |
 
 # Dependencies
 
-|package |old   |new   |Δ  |
-|:-------|:-----|:-----|:--|
-|units   |0.8-5 |0.8-6 |*  |
+|package |old   |new     |Δ  |
+|:-------|:-----|:-------|:--|
+|units   |0.8-7 |0.8-7.3 |*  |
+|Rcpp    |NA    |1.1.0   |*  |
 
 # Revdeps
 
-## New problems (3)
+## New problems (1)
 
-|package   |version |error  |warning |note |
-|:---------|:-------|:------|:-------|:----|
-|[divvy](problems.md#divvy)|1.0.0   |__+3__ |        |     |
-|[Rdistance](problems.md#rdistance)|3.0.0   |__+1__ |        |1    |
-|[vein](problems.md#vein)|1.1.3   |__+1__ |        |1    |
+|package  |version |error  |warning |note |
+|:--------|:-------|:------|:-------|:----|
+|[epocakir](problems.md#epocakir)|0.9.9   |__+2__ |        |     |
 
diff --git a/revdep/cran.md b/revdep/cran.md
index 65b0b56..8210f60 100644
--- a/revdep/cran.md
+++ b/revdep/cran.md
@@ -1,8 +1,8 @@
 ## revdepcheck results
 
-We checked 123 reverse dependencies, comparing R CMD check results across CRAN and dev versions of this package.
+We checked 137 reverse dependencies, comparing R CMD check results across CRAN and dev versions of this package.
 
- * We saw 3 new problems
+ * We saw 1 new problems
  * We failed to check 0 packages
 
 Issues with CRAN packages are summarised below.
@@ -10,14 +10,7 @@ Issues with CRAN packages are summarised below.
 ### New problems
 (This reports the first line of each new failure)
 
-* divvy
+* epocakir
   checking examples ... ERROR
   checking tests ...
-  checking running R code from vignettes ...
-
-* Rdistance
-  checking examples ... ERROR
-
-* vein
-  checking tests ...
 
diff --git a/revdep/problems.md b/revdep/problems.md
index 00270ed..0b3770b 100644
--- a/revdep/problems.md
+++ b/revdep/problems.md
@@ -1,14 +1,14 @@
-# divvy
+# epocakir
 
 <details>
 
-* Version: 1.0.0
-* GitHub: https://github.com/GawainAntell/divvy
-* Source code: https://github.com/cran/divvy
-* Date/Publication: 2023-10-26 08:20:03 UTC
-* Number of recursive dependencies: 91
+* Version: 0.9.9
+* GitHub: https://github.com/alwinw/epocakir
+* Source code: https://github.com/cran/epocakir
+* Date/Publication: 2023-01-06 15:30:06 UTC
+* Number of recursive dependencies: 78
 
-Run `revdepcheck::revdep_details(, "divvy")` for more info
+Run `revdepcheck::revdep_details(, "epocakir")` for more info
 
 </details>
 
@@ -16,26 +16,22 @@ Run `revdepcheck::revdep_details(, "divvy")` for more info
 
 *   checking examples ... ERROR
     ```
-    Running examples in ‘divvy-Ex.R’ failed
+    Running examples in ‘epocakir-Ex.R’ failed
     The error most likely occurred in:
     
-    > ### Name: rangeSize
-    > ### Title: Calculate common metrics of spatial distribution
-    > ### Aliases: rangeSize
+    > ### Name: GFR_staging
+    > ### Title: GFR Staging
+    > ### Aliases: GFR_staging GFR_staging.data.frame GFR_staging.units
+    > ###   GFR_staging.numeric
     > 
     > ### ** Examples
     > 
-    > # generate 20 occurrences for a pseudo-species
-    ...
-    > set.seed(2)
-    > x <- rnorm(20, 110.5885, 2)
-    > y <- rnorm(20,  44.4280, 1)
-    > pts <- cbind(x,y)
+    > df <- tibble::tibble(
+    +   eGFR = units::set_units(c(-1, NA, 100, 70, 50, 35, 20, 10), "mL/min/1.73m2")
+    + )
     > 
-    > rangeSize(pts)
-    Error in UseMethod("units") : 
-      no applicable method for 'units' applied to an object of class "c('integer', 'numeric')"
-    Calls: rangeSize -> cbind -> cbind -> units
+    > GFR_staging(df, "eGFR")
+    Error: cannot convert mL/min/1.73m^2 into mL/min
     Execution halted
     ```
 
@@ -45,139 +41,20 @@ Run `revdepcheck::revdep_details(, "divvy")` for more info
      ERROR
     Running the tests in ‘tests/testthat.R’ failed.
     Last 13 lines of output:
-      Expected `rangeSize(bivAlt[, xyCell], crs = prj)` to run without any conditions.
-      i Actually got a <simpleError> with text:
-        no applicable method for 'units' applied to an object of class "c('integer', 'numeric')"
-      ── Error ('test_calc_eco_variables.R:43:3'): rangeSize and sdSumry accept projected coords ──
-      Error in `UseMethod("units")`: no applicable method for 'units' applied to an object of class "c('integer', 'numeric')"
-      Backtrace:
-          ▆
-       1. └─divvy::rangeSize(bivAlt[, xyCell], crs = prj) at test_calc_eco_variables.R:43:3
-       2.   └─base::cbind(...)
-       3.     └─units (local) cbind(deparse.level, ...)
-       4.       └─base::units(dots[[1]])
+           ▆
+        1. ├─epocakir::GFR_staging(df, "eGFR") at test-ckd.R:378:3
+        2. └─epocakir:::GFR_staging.data.frame(df, "eGFR")
+        3.   ├─epocakir::GFR_staging(.data[[rlang::as_name(rlang::enquo(GFR))]])
+        4.   └─epocakir:::GFR_staging.units(.data[[rlang::as_name(rlang::enquo(GFR))]])
+        5.     ├─epocakir::GFR_staging(as_metric(GFR = GFR, value_only = TRUE))
+        6.     └─epocakir::as_metric(GFR = GFR, value_only = TRUE)
+        7.       ├─units::set_units(meas, conversion$metric_units, mode = "standard")
+        8.       └─units:::set_units.units(meas, conversion$metric_units, mode = "standard")
+        9.         ├─base::`units<-`(`*tmp*`, value = as_units(value, ...))
+       10.         └─units:::`units<-.units`(`*tmp*`, value = as_units(value, ...))
       
-      [ FAIL 4 | WARN 0 | SKIP 0 | PASS 22 ]
+      [ FAIL 1 | WARN 9 | SKIP 0 | PASS 598 ]
       Error: Test failures
       Execution halted
     ```
 
-*   checking running R code from vignettes ...
-    ```
-      ‘habitat-rangesize-case-study.Rmd’ using ‘UTF-8’... OK
-      ‘subsampling-vignette.Rmd’ using ‘UTF-8’... failed
-     ERROR
-    Errors in running code in vignettes:
-    when running code in ‘subsampling-vignette.Rmd’
-      ...
-    > unique(names(bandLocs))
-    [1] "[-50,-30)" "[-10,10)"  "[10,30)"   "[30,50)"   "[50,70)"  
-    
-    > unsamp <- sdSumry(dat = bivalves, taxVar = "genus", 
-    +     collections = "collection_no", xy = xyCell, quotaQ = 0.8, 
-    +     quotaN = 100, omitDom =  .... [TRUNCATED] 
-    
-      When sourcing ‘subsampling-vignette.R’:
-    Error: no applicable method for 'units' applied to an object of class "c('integer', 'numeric')"
-    Execution halted
-    ```
-
-# Rdistance
-
-<details>
-
-* Version: 3.0.0
-* GitHub: https://github.com/tmcd82070/Rdistance
-* Source code: https://github.com/cran/Rdistance
-* Date/Publication: 2023-06-13 07:30:05 UTC
-* Number of recursive dependencies: 95
-
-Run `revdepcheck::revdep_details(, "Rdistance")` for more info
-
-</details>
-
-## Newly broken
-
-*   checking examples ... ERROR
-    ```
-    Running examples in ‘Rdistance-Ex.R’ failed
-    The error most likely occurred in:
-    
-    > ### Name: plot.dfunc
-    > ### Title: plot.dfunc - Plot method for distance (detection) functions
-    > ### Aliases: plot.dfunc
-    > ### Keywords: models
-    > 
-    > ### ** Examples
-    > 
-    ...
-    +    , col="wheat"
-    +    , density=30
-    +    , angle=c(-45,0,45)
-    +    , cex.axis=1.5
-    +    , cex.lab=2
-    +    , ylab="Probability") 
-    Error in UseMethod("units") : 
-      no applicable method for 'units' applied to an object of class "logical"
-    Calls: plot ... barplot.default -> xyrect -> rect -> rbind -> rbind -> units
-    Execution halted
-    ```
-
-## In both
-
-*   checking Rd files ... NOTE
-    ```
-    checkRd: (-1) halfnorm.like.Rd:105: Lost braces; missing escapes or markup?
-       105 |   {f(x|a,b,c_1,c_2,...,c_k) = f(x|a,b)(1 + c(1) h_i1(x) + c(2) h_i2(x) + ... + c(k) h_ik(x)). }
-           |   ^
-    checkRd: (-1) secondDeriv.Rd:15: Lost braces
-        15 | This must be a function of the form FUN <- function(x, ...){...}
-           |                                                            ^
-    ```
-
-# vein
-
-<details>
-
-* Version: 1.1.3
-* GitHub: https://github.com/atmoschem/vein
-* Source code: https://github.com/cran/vein
-* Date/Publication: 2024-05-01 13:50:02 UTC
-* Number of recursive dependencies: 56
-
-Run `revdepcheck::revdep_details(, "vein")` for more info
-
-</details>
-
-## Newly broken
-
-*   checking tests ...
-    ```
-      Running ‘testthat.R’
-     ERROR
-    Running the tests in ‘tests/testthat.R’ failed.
-    Last 13 lines of output:
-      Weighted mean =  1.5 
-      [ FAIL 1 | WARN 0 | SKIP 0 | PASS 701 ]
-      
-      ══ Failed tests ════════════════════════════════════════════════════════════════
-      ── Failure ('test-long_to_wide.R:9:3'): long_to_wide works ─────────────────────
-      long_to_wide(df)$CO[1] not equal to 1.
-      Attributes: < Modes: list, NULL >
-      Attributes: < Lengths: 2, 0 >
-      Attributes: < names for target but not for current >
-      Attributes: < current is not list-like >
-      target is units, current is numeric
-      
-      [ FAIL 1 | WARN 0 | SKIP 0 | PASS 701 ]
-      Error: Test failures
-      Execution halted
-    ```
-
-## In both
-
-*   checking data for non-ASCII characters ... NOTE
-    ```
-      Note: found 49 marked UTF-8 strings
-    ```
-

From 455450acb3d183f37b1ed6c9a956d9ae2bc18834 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?I=C3=B1aki=20=C3=9Acar?= <iucar@fedoraproject.org>
Date: Mon, 29 Sep 2025 13:39:45 +0200
Subject: [PATCH 11/11] more tests

---
 tests/testthat/test_unit_creation.R | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/tests/testthat/test_unit_creation.R b/tests/testthat/test_unit_creation.R
index 4c2128a..c3702f0 100644
--- a/tests/testthat/test_unit_creation.R
+++ b/tests/testthat/test_unit_creation.R
@@ -128,6 +128,8 @@ test_that("set_units default enforces NSE", {
 
 expect_symbolic <- function(u, n, d)
   expect_equal(units(as_units(u)), units:::.symbolic_units(n, d))
+expect_symbolic_nocheck <- function(u, n, d)
+  expect_equal(units(as_units(u, check_is_valid=FALSE)), units:::.symbolic_units(n, d))
 
 test_that("exotic units work", {
   # check what udunits support
@@ -138,6 +140,7 @@ test_that("exotic units work", {
   #expect_symbolic("2.2.m.s", c("2.2", "m", "s"), character(0))
 
   expect_symbolic("m2/s", c("m", "m"), "s")
+  expect_symbolic("m20/s", rep("m", 20), "s")
   expect_symbolic("m^2/s", c("m", "m"), "s")
   expect_symbolic("m 2/s", c("m", "2"), "s")
   expect_symbolic("m-2/s", character(0), c("m", "m", "s"))
@@ -154,7 +157,14 @@ test_that("exotic units work", {
   expect_symbolic("ml/min/1.73m-2", c("ml", "m", "m"), c("min", "1.73"))
 
   old <- unlist(units_options(strict_tokenizer=TRUE))
-  on.exit(units_options(strict_tokenizer=old))
   expect_symbolic("ml/min/1.73m^2", c("ml", "m", "m"), c("min", "1.73"))
   expect_symbolic("ml/min/1.73m-2", "ml", c("min", "1.73", "m", "m"))
+  units_options(strict_tokenizer=old)
+
+  expect_symbolic_nocheck("inH2O", "inH2O", character(0))
+  expect_symbolic_nocheck("inH2O2", c("inH2O", "inH2O"), character(0))
+
+  expect_symbolic("m/(g/(s/L))", c("m", "s"), c("g", "L"))
+  expect_error(expect_warning(as_units("m/(g/s")))
+  expect_error(expect_warning(as_units("m^m")))
 })