Skip to content

Commit

Permalink
Keep grammar in lark file; add pint-ucum-unit-definitions file
Browse files Browse the repository at this point in the history
  • Loading branch information
dalito committed Jan 7, 2024
1 parent 47aeebd commit 03b6b9d
Show file tree
Hide file tree
Showing 5 changed files with 181 additions and 42 deletions.
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,10 @@ Note that UCUM does not provide a canonical representation, e.g. `m/s` and `m.s-
- Converter for creating [pint](https://pypi.org/project/pint/) units from UCUM unit strings
- Parser for UCUM unit strings

**ucumvert** stores the UCUM grammar in a template that is dynamically filled with unit codes, prefixes etc. by parsing the official [ucum-essence.xml](https://github.com/ucum-org/ucum/blob/main/ucum-essence.xml) file (a copy is included in this repo).
So updating the parser for new UCUM releases is quasi automatic.
**ucumvert** generates the UCUM grammar by filling a template with unit codes, prefixes etc. from the official [ucum-essence.xml](https://github.com/ucum-org/ucum/blob/main/ucum-essence.xml) file (a copy is included in this repo).
So updating the parser for new UCUM releases is straightforward.
The parser is built with the great [lark](https://pypi.org/project/lark/) parser toolkit.
The generated lark grammar file is included in the repository, see [ucum_grammar.lark](https://github.com/dalito/ucumvert/blob/main/src/ucumvert/ucum_grammar.lark).

## Install

Expand Down Expand Up @@ -69,7 +70,7 @@ main_term

So the result is a tree:

![](parse_tree.png)
![parse tree](parse_tree.png)

Conversion to [pint](https://pint.readthedocs.io/) units must happen term by term as part of the tree traversal. (WIP)

Expand Down Expand Up @@ -97,5 +98,6 @@ $ python src/ucumvert/vendor/get_ucum_example_as_tsv.py

## License

The code in this repository is distributed under MIT license with the exception of the `ucum-*.*` files in the directory `src/ucumvert/vendor` which fall under the [UCUM Copyright Notice and License](https://github.com/ucum-org/ucum/blob/main/LICENSE.md) (Version 1.0).
The code in this repository is distributed under the MIT license, with the exception of the `ucum-*.*` files in the directory `src/ucumvert/vendor`,
which fall under the [UCUM Copyright Notice and License](https://github.com/ucum-org/ucum/blob/main/LICENSE.md) (Version 1.0).
According to §1.3 of that license, we do not consider **ucumvert** a "Derivative Work" of UCUM, because **ucumvert** only *"interoperates with an unmodified instance of the Work"*.
82 changes: 44 additions & 38 deletions src/ucumvert/parser.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from pathlib import Path
import textwrap
from lark import Lark, Transformer, tree

from ucumvert.xml_util import (
Expand Down Expand Up @@ -71,6 +73,11 @@
# instead of deca-r which does not exist.

UCUM_GRAMMAR = """
# Based on UCUM specification (Version 2.1, 2017-11-21)
# Includes ucumvert-specific fixes to handle all common UCUM units
# and some edge cases not present in the official examples.
# This file is auto-created by parser.update_lark_ucum_grammar_file
main_term: DIVIDE term
| term
?term: term OPERATOR component
Expand All @@ -90,17 +97,16 @@
| FACTOR
ANNOTATION: "{{" STRING "}}"
STRING: /[\x21-\x7a|~]*/ // ASCII chars 33-126 without curly braces
STRING: /[!-z|~]*/ # ASCII chars 33-126 without curly braces
OPERATOR: "." | DIVIDE
DIVIDE: "/"
SHORT_PREFIX: {short_prefix_atoms}
LONG_PREFIX: {long_prefix_atoms}
METRIC: {metric_atoms}
NON_METRIC: {non_metric_atoms}
EXPONENT : ["+"|"-"] NON_ZERO_DIGITS
FACTOR: NON_ZERO_DIGITS
NON_ZERO_DIGITS : /[1-9][0-9]*/ // positive integers > 0
NON_ZERO_DIGITS : /[1-9][0-9]*/ # positive integers > 0
"""


Expand All @@ -109,32 +115,6 @@ class UnitsTransformer(Transformer):


class xUnitsTransformer(Transformer):
def FACTOR(self, args):
return {
"factor": int(args),
}

def EXPONENT(self, args):
if len(args) == 1:
return {
"exponent": int(args[0]),
}
if len(args) == 2:
return {
"exponent": int("".join(args)),
}
return None

def start(self, args):
# print("DBGs>", repr(args), len(args))
if len(args) == 1:
return [args[0]]
if len(args) == 2:
if isinstance(args[1], dict):
return [{**args[0], **args[1]}]
return [{**args[0], **args[1][0]}] + args[1][1:]
return None

def term(self, args):
# print("DBGt>", repr(args), len(args))
if len(args) == 1:
Expand Down Expand Up @@ -205,21 +185,47 @@ def NON_METRIC(self, args):
}


def ucum_parser(ucum_grammar_template=UCUM_GRAMMAR):
def update_lark_ucum_grammar_file(ucum_grammar_template=UCUM_GRAMMAR):
    """
    Update the lark grammar file with UCUM units and prefixes from ucum-essence.xml

    The placeholders of ``ucum_grammar_template`` (``short_prefix_atoms``,
    ``long_prefix_atoms``, ``metric_atoms`` and ``non_metric_atoms``) are
    filled with quoted alternatives, over-long lines are wrapped, and the
    result is written to ``ucum_grammar.lark`` in the directory of this
    module.
    """

    def as_alternatives(atoms):
        # No space after the pipe: textwrap only breaks lines at whitespace,
        # so a wrapped continuation line starts with the pipe itself.
        return " |".join(f'"{atom}"' for atom in atoms)

    prefixes = get_prefixes()
    # One-character prefixes and longer ones go into separate terminals.
    short_prefixes = [i for i in prefixes if len(i) == 1]
    long_prefixes = [i for i in prefixes if len(i) > 1]

    ucum_grammar = ucum_grammar_template.format(
        short_prefix_atoms=as_alternatives(short_prefixes),
        long_prefix_atoms=as_alternatives(long_prefixes),
        metric_atoms=as_alternatives(get_base_units() + get_metric_units()),
        non_metric_atoms=as_alternatives(get_non_metric_units()),
    )

    # wrap too long lines in ucum_grammar to linewidth of 78
    wrapped = [
        textwrap.fill(
            line,
            width=78,
            subsequent_indent=" " * 8,
            break_long_words=False,
            break_on_hyphens=False,
        )
        for line in textwrap.dedent(ucum_grammar).strip().splitlines()
    ]

    grammar_file = Path(__file__).resolve().parent / "ucum_grammar.lark"
    # Write as UTF-8 explicitly: ucum_parser reads this file with
    # encoding="utf8", so do not depend on the platform default encoding.
    with grammar_file.open("w", encoding="utf8") as f:
        f.write("\n".join(wrapped))
        f.write("\n")  # newline at end of file


def ucum_parser(grammar_file=None):
    """
    Create a lark parser for UCUM unit strings.

    Args:
        grammar_file: Location of the lark grammar file to load, given as
            ``str`` or path-like object. If ``None``, the file
            ``ucum_grammar.lark`` in the directory of this module is used.

    Returns:
        A ``Lark`` instance built from the grammar text with start rule
        ``main_term`` and ``strict=True``.
    """
    if grammar_file is None:
        grammar_file = Path(__file__).resolve().parent / "ucum_grammar.lark"
    # Path(...) accepts plain strings too, so callers need not pass a Path.
    ucum_grammar = Path(grammar_file).read_text(encoding="utf8")
    return Lark(ucum_grammar, start="main_term", strict=True)


Expand Down
53 changes: 53 additions & 0 deletions src/ucumvert/pint_ucum_defs.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Units definition file to extend Pint's default_en.txt with UCUM units
# Language: English

# Syntax
# ======
# Units
# -----
# <canonical name> = <relation to another unit or dimension> [= <symbol>] [= <alias>] [ = <alias> ] [...]
#
# The canonical name and aliases should be expressed in singular form.
# Pint automatically deals with plurals built by adding 's' to the singular form; plural
# forms that don't follow this rule should be instead explicitly listed as aliases.
#
# If a unit has no symbol and one wants to define aliases, then the symbol should be
# conventionally set to _.
#
# Example:
# millennium = 1e3 * year = _ = millennia
#
#
# Prefixes
# --------
# <prefix>- = <amount> [= <symbol>] [= <alias>] [ = <alias> ] [...]
#
# Example:
# deca- = 1e+1 = da- = deka-
#
#
# Additional aliases
# ------------------
# @alias <canonical name or previous alias> = <alias> [ = <alias> ] [...]
#
# Used to add aliases to already existing unit definitions.
# Particularly useful when one wants to enrich definitions
# from defaults_en.txt with custom aliases.
#
# Example:
# @alias meter = my_meter

# See also: https://pint.readthedocs.io/en/stable/advanced/defining.html


#### UNITS ####
# Common and less common, grouped by quantity.


#### UNIT GROUPS ####


#### Additional aliases ####

@alias degree_Celsius = Cel
@alias millimeter_Hg = mm[Hg]
77 changes: 77 additions & 0 deletions src/ucumvert/ucum_grammar.lark
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# Based on UCUM specification (Version 2.1, 2017-11-21)
# Includes ucumvert-specific fixes to handle all common UCUM units
# and some edge cases not present in the official examples.
# This file is auto-created by parser.update_lark_ucum_grammar_file

main_term: DIVIDE term
| term
?term: term OPERATOR component
| component
?component: annotatable ANNOTATION
| annotatable
?annotatable: simple_unit EXPONENT
| ANNOTATION
| simple_unit
| "(" main_term ")"
| "(" term ")"
| "(" component ")"
simple_unit: METRIC
| SHORT_PREFIX METRIC
| LONG_PREFIX METRIC
| NON_METRIC
| FACTOR

ANNOTATION: "{" STRING "}"
STRING: /[!-z|~]*/ # ASCII chars 33-126 without curly braces
OPERATOR: "." | DIVIDE
DIVIDE: "/"
SHORT_PREFIX: "Y" |"Z" |"E" |"P" |"T" |"G" |"M" |"k" |"h" |"d" |"c" |"m" |"u"
|"n" |"p" |"f" |"a" |"z" |"y"
LONG_PREFIX: "da" |"Ki" |"Mi" |"Gi" |"Ti"
METRIC: "m" |"s" |"g" |"rad" |"K" |"C" |"cd" |"mol" |"sr" |"Hz" |"N" |"Pa"
|"J" |"W" |"A" |"V" |"F" |"Ohm" |"S" |"Wb" |"Cel" |"T" |"H" |"lm"
|"lx" |"Bq" |"Gy" |"Sv" |"l" |"L" |"ar" |"t" |"bar" |"u" |"eV" |"pc"
|"[c]" |"[h]" |"[k]" |"[eps_0]" |"[mu_0]" |"[e]" |"[m_e]" |"[m_p]"
|"[G]" |"[g]" |"[ly]" |"gf" |"Ky" |"Gal" |"dyn" |"erg" |"P" |"Bi"
|"St" |"Mx" |"G" |"Oe" |"Gb" |"sb" |"Lmb" |"ph" |"Ci" |"R" |"RAD"
|"REM" |"cal_[15]" |"cal_[20]" |"cal_m" |"cal_IT" |"cal_th" |"cal"
|"tex" |"m[H2O]" |"m[Hg]" |"eq" |"osm" |"g%" |"kat" |"U" |"[iU]"
|"[IU]" |"Np" |"B" |"B[SPL]" |"B[V]" |"B[mV]" |"B[uV]" |"B[10.nV]"
|"B[W]" |"B[kW]" |"st" |"mho" |"bit" |"By" |"Bd"
NON_METRIC: "10*" |"10^" |"[pi]" |"%" |"[ppth]" |"[ppm]" |"[ppb]" |"[pptr]"
|"gon" |"deg" |"'" |"''" |"min" |"h" |"d" |"a_t" |"a_j" |"a_g" |"a"
|"wk" |"mo_s" |"mo_j" |"mo_g" |"mo" |"AU" |"atm" |"[lbf_av]" |"[in_i]"
|"[ft_i]" |"[yd_i]" |"[mi_i]" |"[fth_i]" |"[nmi_i]" |"[kn_i]"
|"[sin_i]" |"[sft_i]" |"[syd_i]" |"[cin_i]" |"[cft_i]" |"[cyd_i]"
|"[bf_i]" |"[cr_i]" |"[mil_i]" |"[cml_i]" |"[hd_i]" |"[ft_us]"
|"[yd_us]" |"[in_us]" |"[rd_us]" |"[ch_us]" |"[lk_us]" |"[rch_us]"
|"[rlk_us]" |"[fth_us]" |"[fur_us]" |"[mi_us]" |"[acr_us]" |"[srd_us]"
|"[smi_us]" |"[sct]" |"[twp]" |"[mil_us]" |"[in_br]" |"[ft_br]"
|"[rd_br]" |"[ch_br]" |"[lk_br]" |"[fth_br]" |"[pc_br]" |"[yd_br]"
|"[mi_br]" |"[nmi_br]" |"[kn_br]" |"[acr_br]" |"[gal_us]" |"[bbl_us]"
|"[qt_us]" |"[pt_us]" |"[gil_us]" |"[foz_us]" |"[fdr_us]" |"[min_us]"
|"[crd_us]" |"[bu_us]" |"[gal_wi]" |"[pk_us]" |"[dqt_us]" |"[dpt_us]"
|"[tbs_us]" |"[tsp_us]" |"[cup_us]" |"[foz_m]" |"[cup_m]" |"[tsp_m]"
|"[tbs_m]" |"[gal_br]" |"[pk_br]" |"[bu_br]" |"[qt_br]" |"[pt_br]"
|"[gil_br]" |"[foz_br]" |"[fdr_br]" |"[min_br]" |"[gr]" |"[lb_av]"
|"[oz_av]" |"[dr_av]" |"[scwt_av]" |"[lcwt_av]" |"[ston_av]"
|"[lton_av]" |"[stone_av]" |"[pwt_tr]" |"[oz_tr]" |"[lb_tr]"
|"[sc_ap]" |"[dr_ap]" |"[oz_ap]" |"[lb_ap]" |"[oz_m]" |"[lne]"
|"[pnt]" |"[pca]" |"[pnt_pr]" |"[pca_pr]" |"[pied]" |"[pouce]"
|"[ligne]" |"[didot]" |"[cicero]" |"[degF]" |"[degR]" |"[degRe]"
|"[Cal]" |"[Btu_39]" |"[Btu_59]" |"[Btu_60]" |"[Btu_m]" |"[Btu_IT]"
|"[Btu_th]" |"[Btu]" |"[HP]" |"[den]" |"[in_i'H2O]" |"[in_i'Hg]"
|"[PRU]" |"[wood'U]" |"[diop]" |"[p'diop]" |"%[slope]" |"[mesh_i]"
|"[Ch]" |"[drp]" |"[hnsf'U]" |"[MET]" |"[hp'_X]" |"[hp'_C]" |"[hp'_M]"
|"[hp'_Q]" |"[hp_X]" |"[hp_C]" |"[hp_M]" |"[hp_Q]" |"[kp_X]" |"[kp_C]"
|"[kp_M]" |"[kp_Q]" |"[pH]" |"[S]" |"[HPF]" |"[LPF]" |"[arb'U]"
|"[USP'U]" |"[GPL'U]" |"[MPL'U]" |"[APL'U]" |"[beth'U]" |"[anti'Xa'U]"
|"[todd'U]" |"[dye'U]" |"[smgy'U]" |"[bdsk'U]" |"[ka'U]" |"[knk'U]"
|"[mclg'U]" |"[tb'U]" |"[CCID_50]" |"[TCID_50]" |"[EID_50]" |"[PFU]"
|"[FFU]" |"[CFU]" |"[IR]" |"[BAU]" |"[AU]" |"[Amb'a'1'U]" |"[PNU]"
|"[Lf]" |"[D'ag'U]" |"[FEU]" |"[ELU]" |"[EU]" |"Ao" |"b" |"att"
|"[psi]" |"circ" |"sph" |"[car_m]" |"[car_Au]" |"[smoot]"
|"[m/s2/Hz^(1/2)]" |"bit_s"
EXPONENT : ["+"|"-"] NON_ZERO_DIGITS
FACTOR: NON_ZERO_DIGITS
NON_ZERO_DIGITS : /[1-9][0-9]*/ # positive integers > 0
1 change: 1 addition & 0 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def test_ucum_parser_official_examples(ucum_parse_fcn, ucum_unit):
"m(/s)", # invalid parentheses. Note, "(/s)" is valid.
"(m/s)2", # invalid since UCUM v 1.9
"m{ann1}{ann2}", # invalid double annotation
"da", # invalid prefix-unit combo (a is not metric)
],
)
def test_ucum_parser_invalid_ucum_codes(ucum_parse_fcn, ucum_unit):
Expand Down

0 comments on commit 03b6b9d

Please sign in to comment.