From 03b6b9d52c65c8db9e3e3a16f923bc79c02a9401 Mon Sep 17 00:00:00 2001
From: David Linke <dr.david.linke@gmail.com>
Date: Mon, 8 Jan 2024 00:29:44 +0100
Subject: [PATCH] Keep grammar in lark file; add pint-ucum-unit-definitions
 file

---
 README.md                       | 10 ++--
 src/ucumvert/parser.py          | 82 ++++++++++++++++++---------------
 src/ucumvert/pint_ucum_defs.txt | 53 +++++++++++++++++++++
 src/ucumvert/ucum_grammar.lark  | 77 +++++++++++++++++++++++++++++++
 tests/test_parser.py            |  1 +
 5 files changed, 181 insertions(+), 42 deletions(-)
 create mode 100644 src/ucumvert/pint_ucum_defs.txt
 create mode 100644 src/ucumvert/ucum_grammar.lark

diff --git a/README.md b/README.md
index b06eab4..6a85035 100644
--- a/README.md
+++ b/README.md
@@ -16,9 +16,10 @@ Note that UCUM does non provide a canonical representation, e.g. `m/s` and `m.s-
 - Converter for creating [pint](https://pypi.org/project/pint/) units from UCUM unit strings
 - Parser for UCUM unit strings
 
-**ucumvert** stores the UCUM grammar in a template that is dynamically filled with unit codes, prefixes etc. by parsing the official [ucum-essence.xml](https://github.com/ucum-org/ucum/blob/main/ucum-essence.xml) file (a copy is included in this repo).
-So updating the parser for new UCUM releases is quasi automatic.
+**ucumvert** generates the UCUM grammar by filling a template with unit codes, prefixes etc. from the official [ucum-essence.xml](https://github.com/ucum-org/ucum/blob/main/ucum-essence.xml) file (a copy is included in this repo).
+So updating the parser for new UCUM releases is straightforward.
 The parser is built with the great [lark](https://pypi.org/project/lark/) parser toolkit.
+The generated lark grammar file is included in the repository, see [ucum_grammar.lark](https://github.com/dalito/ucumvert/blob/main/src/ucumvert/ucum_grammar.lark).
 
 ## Install
 
@@ -69,7 +70,7 @@ main_term
 
 So the result is a tree:
 
-![](parse_tree.png)
+![parse tree](parse_tree.png)
 
 Conversion to [pint](https://pint.readthedocs.io/) units must happen term by term as part of the tree traversal. (WIP)
 
@@ -97,5 +98,6 @@ $ python src/src/ucumvert/vendor/get_ucum_example_as_tsv.py
 
 ## License
 
-The code in this repository is distributed under MIT license with the exception of the `ucum-*.*` files in the directory `src/ucumvert/vendor` which fall under the [UCUM Copyright Notice and License](https://github.com/ucum-org/ucum/blob/main/LICENSE.md) (Version 1.0).
+The code in this repository is distributed under the MIT license with the exception of the `ucum-*.*` files in the directory `src/ucumvert/vendor`
+that fall under the [UCUM Copyright Notice and License](https://github.com/ucum-org/ucum/blob/main/LICENSE.md) (Version 1.0).
 We consider **ucumvert** according to §1.3 not as "Derivative Works" of UCUM because **ucumvert** only *"interoperates with an unmodified instance of the Work"*.
diff --git a/src/ucumvert/parser.py b/src/ucumvert/parser.py
index b649c65..d898962 100644
--- a/src/ucumvert/parser.py
+++ b/src/ucumvert/parser.py
@@ -1,3 +1,5 @@
+from pathlib import Path
+import textwrap
 from lark import Lark, Transformer, tree
 
 from ucumvert.xml_util import (
@@ -71,6 +73,11 @@
 #   instead of deca-r which does not exist.
 
 UCUM_GRAMMAR = """
+    # Based on UCUM specification (Version 2.1, 2017-11-21)
+    # Includes ucumvert-specific fixes to handle all common UCUM units
+    # and some edge cases not present in the official examples.
+    # This file is auto-created by parser.update_lark_ucum_grammar_file
+
     main_term: DIVIDE term
             | term
     ?term: term OPERATOR component
@@ -90,17 +97,16 @@
             | FACTOR
 
     ANNOTATION: "{{" STRING "}}"
-    STRING: /[\x21-\x7a|~]*/        // ASCII chars 33-126 without curly braces
+    STRING: /[!-z|~]*/  # ASCII chars 33-126 without curly braces
     OPERATOR: "." | DIVIDE
     DIVIDE: "/"
     SHORT_PREFIX: {short_prefix_atoms}
     LONG_PREFIX: {long_prefix_atoms}
     METRIC: {metric_atoms}
     NON_METRIC: {non_metric_atoms}
-
     EXPONENT : ["+"|"-"] NON_ZERO_DIGITS
     FACTOR: NON_ZERO_DIGITS
-    NON_ZERO_DIGITS : /[1-9][0-9]*/   // positive integers > 0
+    NON_ZERO_DIGITS : /[1-9][0-9]*/  # positive integers > 0
 """
 
 
@@ -109,32 +115,6 @@ class UnitsTransformer(Transformer):
 
 
 class xUnitsTransformer(Transformer):
-    def FACTOR(self, args):
-        return {
-            "factor": int(args),
-        }
-
-    def EXPONENT(self, args):
-        if len(args) == 1:
-            return {
-                "exponent": int(args[0]),
-            }
-        if len(args) == 2:
-            return {
-                "exponent": int("".join(args)),
-            }
-        return None
-
-    def start(self, args):
-        # print("DBGs>", repr(args), len(args))
-        if len(args) == 1:
-            return [args[0]]
-        if len(args) == 2:
-            if isinstance(args[1], dict):
-                return [{**args[0], **args[1]}]
-            return [{**args[0], **args[1][0]}] + args[1][1:]
-        return None
-
     def term(self, args):
         # print("DBGt>", repr(args), len(args))
         if len(args) == 1:
@@ -205,21 +185,47 @@ def NON_METRIC(self, args):
         }
 
 
-def ucum_parser(ucum_grammar_template=UCUM_GRAMMAR):
+def update_lark_ucum_grammar_file(ucum_grammar_template=UCUM_GRAMMAR):
+    """
+    Update the lark grammar file with UCUM units and prefixes from ucum-essence.xml
+    """
     prefixes = get_prefixes()
     short_prefixes = [i for i in prefixes if len(i) == 1]
     long_prefixes = [i for i in prefixes if len(i) > 1]
-    short_prefix_atoms = " | ".join(f'"{i}"' for i in short_prefixes)
-    long_prefix_atoms = " | ".join(f'"{i}"' for i in long_prefixes)
-    metric_atoms = " | ".join(f'"{i}"' for i in (get_base_units() + get_metric_units()))
-    non_metric_atoms = " | ".join(f'"{i}"' for i in get_non_metric_units())
+    short_prefix_atoms = " |".join(f'"{i}"' for i in short_prefixes)
+    long_prefix_atoms = " |".join(f'"{i}"' for i in long_prefixes)
+    metric_atoms = " |".join(f'"{i}"' for i in (get_base_units() + get_metric_units()))
+    non_metric_atoms = " |".join(f'"{i}"' for i in get_non_metric_units())
 
     ucum_grammar = ucum_grammar_template.format(
-        short_prefix_atoms = short_prefix_atoms,
-        long_prefix_atoms = long_prefix_atoms,
-        metric_atoms = metric_atoms,
-        non_metric_atoms = non_metric_atoms,
+        short_prefix_atoms=short_prefix_atoms,
+        long_prefix_atoms=long_prefix_atoms,
+        metric_atoms=metric_atoms,
+        non_metric_atoms=non_metric_atoms,
     )
+    # Wrap at 78 columns; the ' |' separator lets continuation lines start with '|'.
+    wrapped = []
+    for line in textwrap.dedent(ucum_grammar).strip().splitlines():
+        dline = textwrap.fill(
+            line,
+            width=78,
+            subsequent_indent=" " * 8,
+            break_long_words=False,
+            break_on_hyphens=False,
+        )
+        wrapped.append(dline)
+
+    grammar_file = Path(__file__).resolve().parent / "ucum_grammar.lark"
+    with grammar_file.open("w", encoding="utf8") as f:  # ucum_parser reads utf8
+        f.write("\n".join(wrapped))
+        f.write("\n")  # newline at end of file
+
+
+def ucum_parser(grammar_file=None):
+    """Build the UCUM parser from a lark grammar file (default: bundled one)."""
+    if grammar_file is None:
+        grammar_file = Path(__file__).resolve().parent / "ucum_grammar.lark"
+    ucum_grammar = Path(grammar_file).read_text(encoding="utf8")  # str or Path
     return Lark(ucum_grammar, start="main_term", strict=True)
 
 
diff --git a/src/ucumvert/pint_ucum_defs.txt b/src/ucumvert/pint_ucum_defs.txt
new file mode 100644
index 0000000..9a5703b
--- /dev/null
+++ b/src/ucumvert/pint_ucum_defs.txt
@@ -0,0 +1,53 @@
+# Units definition file to extend Pint's default_en.txt with UCUM units
+# Language: English
+
+# Syntax
+# ======
+# Units
+# -----
+# <canonical name> = <relation to another unit or dimension> [= <symbol>] [= <alias>] [ = <alias> ] [...]
+#
+# The canonical name and aliases should be expressed in singular form.
+# Pint automatically deals with plurals built by adding 's' to the singular form; plural
+# forms that don't follow this rule should be instead explicitly listed as aliases.
+#
+# If a unit has no symbol and one wants to define aliases, then the symbol should be
+# conventionally set to _.
+#
+# Example:
+#     millennium = 1e3 * year = _ = millennia
+#
+#
+# Prefixes
+# --------
+# <prefix>- = <amount> [= <symbol>] [= <alias>] [ = <alias> ] [...]
+#
+# Example:
+#     deca- =  1e+1  = da- = deka-
+#
+#
+# Additional aliases
+# ------------------
+# @alias <canonical name or previous alias> = <alias> [ = <alias> ] [...]
+#
+# Used to add aliases to already existing unit definitions.
+# Particularly useful when one wants to enrich definitions
+# from default_en.txt with custom aliases.
+#
+# Example:
+#     @alias meter = my_meter
+
+# See also: https://pint.readthedocs.io/en/stable/advanced/defining.html
+
+
+#### UNITS ####
+# Common and less common, grouped by quantity.
+
+
+#### UNIT GROUPS ####
+
+
+#### Additional aliases ####
+
+@alias degree_Celsius = Cel
+@alias millimeter_Hg = mm[Hg]
diff --git a/src/ucumvert/ucum_grammar.lark b/src/ucumvert/ucum_grammar.lark
new file mode 100644
index 0000000..b420365
--- /dev/null
+++ b/src/ucumvert/ucum_grammar.lark
@@ -0,0 +1,77 @@
+# Based on UCUM specification (Version 2.1, 2017-11-21)
+# Includes ucumvert-specific fixes to handle all common UCUM units
+# and some edge cases not present in the official examples.
+# This file is auto-created by parser.update_lark_ucum_grammar_file
+
+main_term: DIVIDE term
+        | term
+?term: term OPERATOR component
+        | component
+?component: annotatable ANNOTATION
+        | annotatable
+?annotatable: simple_unit EXPONENT
+        | ANNOTATION
+        | simple_unit
+        | "(" main_term ")"
+        | "(" term ")"
+        | "(" component ")"
+simple_unit: METRIC
+        | SHORT_PREFIX METRIC
+        | LONG_PREFIX METRIC
+        | NON_METRIC
+        | FACTOR
+
+ANNOTATION: "{" STRING "}"
+STRING: /[!-z|~]*/  # ASCII chars 33-126 without curly braces
+OPERATOR: "." | DIVIDE
+DIVIDE: "/"
+SHORT_PREFIX: "Y" |"Z" |"E" |"P" |"T" |"G" |"M" |"k" |"h" |"d" |"c" |"m" |"u"
+        |"n" |"p" |"f" |"a" |"z" |"y"
+LONG_PREFIX: "da" |"Ki" |"Mi" |"Gi" |"Ti"
+METRIC: "m" |"s" |"g" |"rad" |"K" |"C" |"cd" |"mol" |"sr" |"Hz" |"N" |"Pa"
+        |"J" |"W" |"A" |"V" |"F" |"Ohm" |"S" |"Wb" |"Cel" |"T" |"H" |"lm"
+        |"lx" |"Bq" |"Gy" |"Sv" |"l" |"L" |"ar" |"t" |"bar" |"u" |"eV" |"pc"
+        |"[c]" |"[h]" |"[k]" |"[eps_0]" |"[mu_0]" |"[e]" |"[m_e]" |"[m_p]"
+        |"[G]" |"[g]" |"[ly]" |"gf" |"Ky" |"Gal" |"dyn" |"erg" |"P" |"Bi"
+        |"St" |"Mx" |"G" |"Oe" |"Gb" |"sb" |"Lmb" |"ph" |"Ci" |"R" |"RAD"
+        |"REM" |"cal_[15]" |"cal_[20]" |"cal_m" |"cal_IT" |"cal_th" |"cal"
+        |"tex" |"m[H2O]" |"m[Hg]" |"eq" |"osm" |"g%" |"kat" |"U" |"[iU]"
+        |"[IU]" |"Np" |"B" |"B[SPL]" |"B[V]" |"B[mV]" |"B[uV]" |"B[10.nV]"
+        |"B[W]" |"B[kW]" |"st" |"mho" |"bit" |"By" |"Bd"
+NON_METRIC: "10*" |"10^" |"[pi]" |"%" |"[ppth]" |"[ppm]" |"[ppb]" |"[pptr]"
+        |"gon" |"deg" |"'" |"''" |"min" |"h" |"d" |"a_t" |"a_j" |"a_g" |"a"
+        |"wk" |"mo_s" |"mo_j" |"mo_g" |"mo" |"AU" |"atm" |"[lbf_av]" |"[in_i]"
+        |"[ft_i]" |"[yd_i]" |"[mi_i]" |"[fth_i]" |"[nmi_i]" |"[kn_i]"
+        |"[sin_i]" |"[sft_i]" |"[syd_i]" |"[cin_i]" |"[cft_i]" |"[cyd_i]"
+        |"[bf_i]" |"[cr_i]" |"[mil_i]" |"[cml_i]" |"[hd_i]" |"[ft_us]"
+        |"[yd_us]" |"[in_us]" |"[rd_us]" |"[ch_us]" |"[lk_us]" |"[rch_us]"
+        |"[rlk_us]" |"[fth_us]" |"[fur_us]" |"[mi_us]" |"[acr_us]" |"[srd_us]"
+        |"[smi_us]" |"[sct]" |"[twp]" |"[mil_us]" |"[in_br]" |"[ft_br]"
+        |"[rd_br]" |"[ch_br]" |"[lk_br]" |"[fth_br]" |"[pc_br]" |"[yd_br]"
+        |"[mi_br]" |"[nmi_br]" |"[kn_br]" |"[acr_br]" |"[gal_us]" |"[bbl_us]"
+        |"[qt_us]" |"[pt_us]" |"[gil_us]" |"[foz_us]" |"[fdr_us]" |"[min_us]"
+        |"[crd_us]" |"[bu_us]" |"[gal_wi]" |"[pk_us]" |"[dqt_us]" |"[dpt_us]"
+        |"[tbs_us]" |"[tsp_us]" |"[cup_us]" |"[foz_m]" |"[cup_m]" |"[tsp_m]"
+        |"[tbs_m]" |"[gal_br]" |"[pk_br]" |"[bu_br]" |"[qt_br]" |"[pt_br]"
+        |"[gil_br]" |"[foz_br]" |"[fdr_br]" |"[min_br]" |"[gr]" |"[lb_av]"
+        |"[oz_av]" |"[dr_av]" |"[scwt_av]" |"[lcwt_av]" |"[ston_av]"
+        |"[lton_av]" |"[stone_av]" |"[pwt_tr]" |"[oz_tr]" |"[lb_tr]"
+        |"[sc_ap]" |"[dr_ap]" |"[oz_ap]" |"[lb_ap]" |"[oz_m]" |"[lne]"
+        |"[pnt]" |"[pca]" |"[pnt_pr]" |"[pca_pr]" |"[pied]" |"[pouce]"
+        |"[ligne]" |"[didot]" |"[cicero]" |"[degF]" |"[degR]" |"[degRe]"
+        |"[Cal]" |"[Btu_39]" |"[Btu_59]" |"[Btu_60]" |"[Btu_m]" |"[Btu_IT]"
+        |"[Btu_th]" |"[Btu]" |"[HP]" |"[den]" |"[in_i'H2O]" |"[in_i'Hg]"
+        |"[PRU]" |"[wood'U]" |"[diop]" |"[p'diop]" |"%[slope]" |"[mesh_i]"
+        |"[Ch]" |"[drp]" |"[hnsf'U]" |"[MET]" |"[hp'_X]" |"[hp'_C]" |"[hp'_M]"
+        |"[hp'_Q]" |"[hp_X]" |"[hp_C]" |"[hp_M]" |"[hp_Q]" |"[kp_X]" |"[kp_C]"
+        |"[kp_M]" |"[kp_Q]" |"[pH]" |"[S]" |"[HPF]" |"[LPF]" |"[arb'U]"
+        |"[USP'U]" |"[GPL'U]" |"[MPL'U]" |"[APL'U]" |"[beth'U]" |"[anti'Xa'U]"
+        |"[todd'U]" |"[dye'U]" |"[smgy'U]" |"[bdsk'U]" |"[ka'U]" |"[knk'U]"
+        |"[mclg'U]" |"[tb'U]" |"[CCID_50]" |"[TCID_50]" |"[EID_50]" |"[PFU]"
+        |"[FFU]" |"[CFU]" |"[IR]" |"[BAU]" |"[AU]" |"[Amb'a'1'U]" |"[PNU]"
+        |"[Lf]" |"[D'ag'U]" |"[FEU]" |"[ELU]" |"[EU]" |"Ao" |"b" |"att"
+        |"[psi]" |"circ" |"sph" |"[car_m]" |"[car_Au]" |"[smoot]"
+        |"[m/s2/Hz^(1/2)]" |"bit_s"
+EXPONENT : ["+"|"-"] NON_ZERO_DIGITS
+FACTOR: NON_ZERO_DIGITS
+NON_ZERO_DIGITS : /[1-9][0-9]*/  # positive integers > 0
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 444c70c..cfe36b2 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -52,6 +52,7 @@ def test_ucum_parser_official_examples(ucum_parse_fcn, ucum_unit):
         "m(/s)",  # invalid parentheses. Note, "(/s)" is valid.
         "(m/s)2",  # invalid since UCUM v 1.9
         "m{ann1}{ann2}",  # invalid double annotation
+        "da",  # invalid prefix-unit combo (a is not metric)
     ],
 )
 def test_ucum_parser_invalid_ucum_codes(ucum_parse_fcn, ucum_unit):