From 03b6b9d52c65c8db9e3e3a16f923bc79c02a9401 Mon Sep 17 00:00:00 2001 From: David Linke Date: Mon, 8 Jan 2024 00:29:44 +0100 Subject: [PATCH] Keep grammar in lark file; add pint-ucum-unit-definitions file --- README.md | 10 ++-- src/ucumvert/parser.py | 82 ++++++++++++++++++--------------- src/ucumvert/pint_ucum_defs.txt | 53 +++++++++++++++++++++ src/ucumvert/ucum_grammar.lark | 77 +++++++++++++++++++++++++++++++ tests/test_parser.py | 1 + 5 files changed, 181 insertions(+), 42 deletions(-) create mode 100644 src/ucumvert/pint_ucum_defs.txt create mode 100644 src/ucumvert/ucum_grammar.lark diff --git a/README.md b/README.md index b06eab4..6a85035 100644 --- a/README.md +++ b/README.md @@ -16,9 +16,10 @@ Note that UCUM does non provide a canonical representation, e.g. `m/s` and `m.s- - Converter for creating [pint](https://pypi.org/project/pint/) units from UCUM unit strings - Parser for UCUM unit strings -**ucumvert** stores the UCUM grammar in a template that is dynamically filled with unit codes, prefixes etc. by parsing the official [ucum-essence.xml](https://github.com/ucum-org/ucum/blob/main/ucum-essence.xml) file (a copy is included in this repo). -So updating the parser for new UCUM releases is quasi automatic. +**ucumvert** generates the UCUM grammar by filling a template with unit codes, prefixes etc. from the official [ucum-essence.xml](https://github.com/ucum-org/ucum/blob/main/ucum-essence.xml) file (a copy is included in this repo). +So updating the parser for new UCUM releases is straight forward. The parser is built with the great [lark](https://pypi.org/project/lark/) parser toolkit. +The generated lark grammar file is included in the repository, see [ucum_grammar.lark](https://github.com/dalito/ucumvert/blob/main/src/ucumvert/ucum_grammar.lark). 
## Install @@ -69,7 +70,7 @@ main_term So the result is a tree: -![](parse_tree.png) +![parse tree](parse_tree.png) Conversion to [pint](https://pint.readthedocs.io/) units must happen term by term as part of the tree traversal. (WIP) @@ -97,5 +98,6 @@ $ python src/src/ucumvert/vendor/get_ucum_example_as_tsv.py ## License -The code in this repository is distributed under MIT license with the exception of the `ucum-*.*` files in the directory `src/ucumvert/vendor` which fall under the [UCUM Copyright Notice and License](https://github.com/ucum-org/ucum/blob/main/LICENSE.md) (Version 1.0). +The code in this repository is distributed under MIT license with the exception of the `ucum-*.*` files in the directory `src/ucumvert/vendor` +that fall under the [UCUM Copyright Notice and License](https://github.com/ucum-org/ucum/blob/main/LICENSE.md) (Version 1.0). We consider **ucumvert** according to §1.3 not as "Derivative Works" of UCUM because **ucumvert** only *"interoperates with an unmodified instance of the Work"*. diff --git a/src/ucumvert/parser.py b/src/ucumvert/parser.py index b649c65..d898962 100644 --- a/src/ucumvert/parser.py +++ b/src/ucumvert/parser.py @@ -1,3 +1,5 @@ +from pathlib import Path +import textwrap from lark import Lark, Transformer, tree from ucumvert.xml_util import ( @@ -71,6 +73,11 @@ # instead of deca-r which does not exist. UCUM_GRAMMAR = """ + # Based on UCUM specification (Version 2.1, 2017-11-21) + # Includes ucumvert-specific fixes to handle all common UCUM units + # and some edge cases not present in the official examples. + # This file is auto-created by parser.update_lark_ucum_grammar_file + main_term: DIVIDE term | term ?term: term OPERATOR component @@ -90,17 +97,16 @@ | FACTOR ANNOTATION: "{{" STRING "}}" - STRING: /[\x21-\x7a|~]*/ // ASCII chars 33-126 without curly braces + STRING: /[!-z|~]*/ # ASCII chars 33-126 without curly braces OPERATOR: "."
| DIVIDE DIVIDE: "/" SHORT_PREFIX: {short_prefix_atoms} LONG_PREFIX: {long_prefix_atoms} METRIC: {metric_atoms} NON_METRIC: {non_metric_atoms} - EXPONENT : ["+"|"-"] NON_ZERO_DIGITS FACTOR: NON_ZERO_DIGITS - NON_ZERO_DIGITS : /[1-9][0-9]*/ // positive integers > 0 + NON_ZERO_DIGITS : /[1-9][0-9]*/ # positive integers > 0 """ @@ -109,32 +115,6 @@ class UnitsTransformer(Transformer): class xUnitsTransformer(Transformer): - def FACTOR(self, args): - return { - "factor": int(args), - } - - def EXPONENT(self, args): - if len(args) == 1: - return { - "exponent": int(args[0]), - } - if len(args) == 2: - return { - "exponent": int("".join(args)), - } - return None - - def start(self, args): - # print("DBGs>", repr(args), len(args)) - if len(args) == 1: - return [args[0]] - if len(args) == 2: - if isinstance(args[1], dict): - return [{**args[0], **args[1]}] - return [{**args[0], **args[1][0]}] + args[1][1:] - return None - def term(self, args): # print("DBGt>", repr(args), len(args)) if len(args) == 1: @@ -205,21 +185,47 @@ def NON_METRIC(self, args): } -def ucum_parser(ucum_grammar_template=UCUM_GRAMMAR): +def update_lark_ucum_grammar_file(ucum_grammar_template=UCUM_GRAMMAR): + """ + Update the lark grammar file with UCUM units and prefixes from ucum-essence.xml + """ prefixes = get_prefixes() short_prefixes = [i for i in prefixes if len(i) == 1] long_prefixes = [i for i in prefixes if len(i) > 1] - short_prefix_atoms = " | ".join(f'"{i}"' for i in short_prefixes) - long_prefix_atoms = " | ".join(f'"{i}"' for i in long_prefixes) - metric_atoms = " | ".join(f'"{i}"' for i in (get_base_units() + get_metric_units())) - non_metric_atoms = " | ".join(f'"{i}"' for i in get_non_metric_units()) + short_prefix_atoms = " |".join(f'"{i}"' for i in short_prefixes) + long_prefix_atoms = " |".join(f'"{i}"' for i in long_prefixes) + metric_atoms = " |".join(f'"{i}"' for i in (get_base_units() + get_metric_units())) + non_metric_atoms = " |".join(f'"{i}"' for i in get_non_metric_units()) 
ucum_grammar = ucum_grammar_template.format( - short_prefix_atoms = short_prefix_atoms, - long_prefix_atoms = long_prefix_atoms, - metric_atoms = metric_atoms, - non_metric_atoms = non_metric_atoms, + short_prefix_atoms=short_prefix_atoms, + long_prefix_atoms=long_prefix_atoms, + metric_atoms=metric_atoms, + non_metric_atoms=non_metric_atoms, ) + # wrap too long lines in ucum_grammar to linewidth of 78 + wrapped = [] + for line in textwrap.dedent(ucum_grammar).strip().splitlines(): + dline = textwrap.fill( + line, + width=78, + subsequent_indent=" " * 8, + break_long_words=False, + break_on_hyphens=False, + ) + wrapped.append(dline) + + grammar_file = Path(__file__).resolve().parent / "ucum_grammar.lark" + with grammar_file.open("w") as f: + f.write("\n".join(wrapped)) + f.write("\n") # newline at end of file + + +def ucum_parser(grammar_file=None): + if grammar_file is None: + grammar_file = Path(__file__).resolve().parent / "ucum_grammar.lark" + with grammar_file.open("r", encoding="utf8") as f: + ucum_grammar = f.read() return Lark(ucum_grammar, start="main_term", strict=True) diff --git a/src/ucumvert/pint_ucum_defs.txt b/src/ucumvert/pint_ucum_defs.txt new file mode 100644 index 0000000..9a5703b --- /dev/null +++ b/src/ucumvert/pint_ucum_defs.txt @@ -0,0 +1,53 @@ +# Units definition file to extend Pints default.txt with UCUM units +# Language: english + +# Syntax +# ====== +# Units +# ----- +# = [= ] [= ] [ = ] [...] +# +# The canonical name and aliases should be expressed in singular form. +# Pint automatically deals with plurals built by adding 's' to the singular form; plural +# forms that don't follow this rule should be instead explicitly listed as aliases. +# +# If a unit has no symbol and one wants to define aliases, then the symbol should be +# conventionally set to _. +# +# Example: +# millennium = 1e3 * year = _ = millennia +# +# +# Prefixes +# -------- +# - = [= ] [= ] [ = ] [...] 
+# +# Example: +# deca- = 1e+1 = da- = deka- +# +# +# Additional aliases +# ------------------ +# @alias = [ = ] [...] +# +# Used to add aliases to already existing unit definitions. +# Particularly useful when one wants to enrich definitions +# from defaults_en.txt with custom aliases. +# +# Example: +# @alias meter = my_meter + +# See also: https://pint.readthedocs.io/en/stable/advanced/defining.html + + +#### UNITS #### +# Common and less common, grouped by quantity. + + +#### UNIT GROUPS #### + + +#### Additional aliases #### + +@alias degree_Celsius = Cel +@alias millimeter_Hg = mm[Hg] diff --git a/src/ucumvert/ucum_grammar.lark b/src/ucumvert/ucum_grammar.lark new file mode 100644 index 0000000..b420365 --- /dev/null +++ b/src/ucumvert/ucum_grammar.lark @@ -0,0 +1,77 @@ +# Based on UCUM specification (Version 2.1, 2017-11-21) +# Includes ucumvert-specific fixes to handle all common UCUM units +# and some edge cases not present in the official examples. +# This file is auto-created by parser.update_lark_ucum_grammar_file + +main_term: DIVIDE term + | term +?term: term OPERATOR component + | component +?component: annotatable ANNOTATION + | annotatable +?annotatable: simple_unit EXPONENT + | ANNOTATION + | simple_unit + | "(" main_term ")" + | "(" term ")" + | "(" component ")" +simple_unit: METRIC + | SHORT_PREFIX METRIC + | LONG_PREFIX METRIC + | NON_METRIC + | FACTOR + +ANNOTATION: "{" STRING "}" +STRING: /[!-z|~]*/ # ASCII chars 33-126 without curly braces +OPERATOR: "." 
| DIVIDE +DIVIDE: "/" +SHORT_PREFIX: "Y" |"Z" |"E" |"P" |"T" |"G" |"M" |"k" |"h" |"d" |"c" |"m" |"u" + |"n" |"p" |"f" |"a" |"z" |"y" +LONG_PREFIX: "da" |"Ki" |"Mi" |"Gi" |"Ti" +METRIC: "m" |"s" |"g" |"rad" |"K" |"C" |"cd" |"mol" |"sr" |"Hz" |"N" |"Pa" + |"J" |"W" |"A" |"V" |"F" |"Ohm" |"S" |"Wb" |"Cel" |"T" |"H" |"lm" + |"lx" |"Bq" |"Gy" |"Sv" |"l" |"L" |"ar" |"t" |"bar" |"u" |"eV" |"pc" + |"[c]" |"[h]" |"[k]" |"[eps_0]" |"[mu_0]" |"[e]" |"[m_e]" |"[m_p]" + |"[G]" |"[g]" |"[ly]" |"gf" |"Ky" |"Gal" |"dyn" |"erg" |"P" |"Bi" + |"St" |"Mx" |"G" |"Oe" |"Gb" |"sb" |"Lmb" |"ph" |"Ci" |"R" |"RAD" + |"REM" |"cal_[15]" |"cal_[20]" |"cal_m" |"cal_IT" |"cal_th" |"cal" + |"tex" |"m[H2O]" |"m[Hg]" |"eq" |"osm" |"g%" |"kat" |"U" |"[iU]" + |"[IU]" |"Np" |"B" |"B[SPL]" |"B[V]" |"B[mV]" |"B[uV]" |"B[10.nV]" + |"B[W]" |"B[kW]" |"st" |"mho" |"bit" |"By" |"Bd" +NON_METRIC: "10*" |"10^" |"[pi]" |"%" |"[ppth]" |"[ppm]" |"[ppb]" |"[pptr]" + |"gon" |"deg" |"'" |"''" |"min" |"h" |"d" |"a_t" |"a_j" |"a_g" |"a" + |"wk" |"mo_s" |"mo_j" |"mo_g" |"mo" |"AU" |"atm" |"[lbf_av]" |"[in_i]" + |"[ft_i]" |"[yd_i]" |"[mi_i]" |"[fth_i]" |"[nmi_i]" |"[kn_i]" + |"[sin_i]" |"[sft_i]" |"[syd_i]" |"[cin_i]" |"[cft_i]" |"[cyd_i]" + |"[bf_i]" |"[cr_i]" |"[mil_i]" |"[cml_i]" |"[hd_i]" |"[ft_us]" + |"[yd_us]" |"[in_us]" |"[rd_us]" |"[ch_us]" |"[lk_us]" |"[rch_us]" + |"[rlk_us]" |"[fth_us]" |"[fur_us]" |"[mi_us]" |"[acr_us]" |"[srd_us]" + |"[smi_us]" |"[sct]" |"[twp]" |"[mil_us]" |"[in_br]" |"[ft_br]" + |"[rd_br]" |"[ch_br]" |"[lk_br]" |"[fth_br]" |"[pc_br]" |"[yd_br]" + |"[mi_br]" |"[nmi_br]" |"[kn_br]" |"[acr_br]" |"[gal_us]" |"[bbl_us]" + |"[qt_us]" |"[pt_us]" |"[gil_us]" |"[foz_us]" |"[fdr_us]" |"[min_us]" + |"[crd_us]" |"[bu_us]" |"[gal_wi]" |"[pk_us]" |"[dqt_us]" |"[dpt_us]" + |"[tbs_us]" |"[tsp_us]" |"[cup_us]" |"[foz_m]" |"[cup_m]" |"[tsp_m]" + |"[tbs_m]" |"[gal_br]" |"[pk_br]" |"[bu_br]" |"[qt_br]" |"[pt_br]" + |"[gil_br]" |"[foz_br]" |"[fdr_br]" |"[min_br]" |"[gr]" |"[lb_av]" + |"[oz_av]" |"[dr_av]" 
|"[scwt_av]" |"[lcwt_av]" |"[ston_av]" + |"[lton_av]" |"[stone_av]" |"[pwt_tr]" |"[oz_tr]" |"[lb_tr]" + |"[sc_ap]" |"[dr_ap]" |"[oz_ap]" |"[lb_ap]" |"[oz_m]" |"[lne]" + |"[pnt]" |"[pca]" |"[pnt_pr]" |"[pca_pr]" |"[pied]" |"[pouce]" + |"[ligne]" |"[didot]" |"[cicero]" |"[degF]" |"[degR]" |"[degRe]" + |"[Cal]" |"[Btu_39]" |"[Btu_59]" |"[Btu_60]" |"[Btu_m]" |"[Btu_IT]" + |"[Btu_th]" |"[Btu]" |"[HP]" |"[den]" |"[in_i'H2O]" |"[in_i'Hg]" + |"[PRU]" |"[wood'U]" |"[diop]" |"[p'diop]" |"%[slope]" |"[mesh_i]" + |"[Ch]" |"[drp]" |"[hnsf'U]" |"[MET]" |"[hp'_X]" |"[hp'_C]" |"[hp'_M]" + |"[hp'_Q]" |"[hp_X]" |"[hp_C]" |"[hp_M]" |"[hp_Q]" |"[kp_X]" |"[kp_C]" + |"[kp_M]" |"[kp_Q]" |"[pH]" |"[S]" |"[HPF]" |"[LPF]" |"[arb'U]" + |"[USP'U]" |"[GPL'U]" |"[MPL'U]" |"[APL'U]" |"[beth'U]" |"[anti'Xa'U]" + |"[todd'U]" |"[dye'U]" |"[smgy'U]" |"[bdsk'U]" |"[ka'U]" |"[knk'U]" + |"[mclg'U]" |"[tb'U]" |"[CCID_50]" |"[TCID_50]" |"[EID_50]" |"[PFU]" + |"[FFU]" |"[CFU]" |"[IR]" |"[BAU]" |"[AU]" |"[Amb'a'1'U]" |"[PNU]" + |"[Lf]" |"[D'ag'U]" |"[FEU]" |"[ELU]" |"[EU]" |"Ao" |"b" |"att" + |"[psi]" |"circ" |"sph" |"[car_m]" |"[car_Au]" |"[smoot]" + |"[m/s2/Hz^(1/2)]" |"bit_s" +EXPONENT : ["+"|"-"] NON_ZERO_DIGITS +FACTOR: NON_ZERO_DIGITS +NON_ZERO_DIGITS : /[1-9][0-9]*/ # positive integers > 0 diff --git a/tests/test_parser.py b/tests/test_parser.py index 444c70c..cfe36b2 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -52,6 +52,7 @@ def test_ucum_parser_official_examples(ucum_parse_fcn, ucum_unit): "m(/s)", # invalid parentheses. Note, "(/s)" is valid. "(m/s)2", # invalid since UCUM v 1.9 "m{ann1}{ann2}", # invalid double annotation + "da", # invalid prefix-unit combo (a is not metric) ], ) def test_ucum_parser_invalid_ucum_codes(ucum_parse_fcn, ucum_unit):