Skip to content

Commit 307ca3b

Browse files
committed
Added pint-preprocessor & helpers for ucum-pint mappings
1 parent c1d8c32 commit 307ca3b

File tree

5 files changed

+207
-23
lines changed

5 files changed

+207
-23
lines changed

README.md

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
# Easier access to UCUM from Python
22

33
> **This is almost done. Feedback welcome!**
4-
The lark grammar to parse UCUM codes and the transformer that converts UCUM units to pint are implemented.
4+
> The lark grammar to parse UCUM codes and the transformer that converts UCUM units to pint are implemented.
5+
> For some UCUM units we still have to define pint units or aliases and for some also name mappings.
56
67
[UCUM](https://ucum.org/) (Unified Code for Units of Measure) is a code system intended to cover all units of measures.
78
It provides a formalism to express units in an unambiguous way suitable for electronic communication.
@@ -18,12 +19,16 @@ So updating the parser for new UCUM releases is straight forward.
1819
The parser is built with the great [lark](https://pypi.org/project/lark/) parser toolkit.
1920
The generated lark grammar file for case-sensitive UCUM codes is included in the repository, see [ucum_grammar.lark](https://github.com/dalito/ucumvert/blob/main/src/ucumvert/ucum_grammar.lark).
2021

22+
Some of the UCUM unit atoms are invalid unit names in pint, for example `cal_[15]`, `m[H2O]`, `10*`, `[in_i'H2O]`.
23+
For all of them we define mappings to valid pint unit names in [ucum_pint.py](https://github.com/dalito/ucumvert/blob/main/src/ucumvert/ucum_pint.py), e.g. `{"cal_[15]": "cal_15"}`.
24+
2125
## Install
2226

2327
Installation from git in developer mode including creation of virtual environment (pip should be newer than 23.1):
2428

2529
Linux
26-
```
30+
31+
```bash
2732
git clone https://github.com/dalito/ucumvert.git
2833
cd ucumvert
2934
python -m venv .venv
@@ -32,7 +37,8 @@ pip install -e .[dev]
3237
```
3338

3439
Windows
35-
```
40+
41+
```bash
3642
git clone https://github.com/dalito/ucumvert.git
3743
cd ucumvert
3844
py -m venv .venv
@@ -88,17 +94,17 @@ You may use the package in your code for converting UCUM codes to pint like this
8894

8995
The unit tests include a test to parse all common UCUM unit codes from the official repo. To see this run
9096

91-
```cmd
92-
$ pytest
97+
```bash
98+
pytest
9399
```
94100

95101
The common UCUM unit codes are available only in binary form (xlsx, docx, pdf).
96102
Here we keep a copy in tsv-format `ucum_examples.tsv`.
97103
To (re)generate this tsv-file from the official xlsx-file in the [UCUM repository](https://github.com/ucum-org/ucum/tree/main/common-units) run
98104

99-
```cmd
100-
$ pip install openpyxl
101-
$ python src/src/ucumvert/vendor/get_ucum_example_as_tsv.py
105+
```bash
106+
pip install openpyxl
107+
python src/ucumvert/vendor/get_ucum_example_as_tsv.py
102108
```
103109

104110
## Useful links

src/ucumvert/__init__.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,11 @@
33
make_parse_tree_png,
44
update_lark_ucum_grammar_file,
55
)
6-
from ucumvert.ucum_pint import UcumToPintTransformer
6+
from ucumvert.ucum_pint import (
7+
UcumToPintTransformer,
8+
get_pint_registry,
9+
ucum_preprocessor,
10+
)
711

812
try:
913
from ucumvert._version import __version__, __version_tuple__
@@ -13,7 +17,9 @@
1317

1418
__all__ = [
1519
"get_ucum_parser",
20+
"get_pint_registry",
1621
"make_parse_tree_png",
22+
"ucum_preprocessor",
1723
"update_lark_ucum_grammar_file",
1824
"UcumToPintTransformer",
1925
]

src/ucumvert/ucum_pint.py

Lines changed: 167 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,26 +2,41 @@
22

33
import pint
44
from lark import Transformer
5+
from lark.exceptions import VisitError
56

67
from ucumvert.parser import (
78
get_ucum_parser,
89
make_parse_tree_png,
910
update_lark_ucum_grammar_file,
1011
)
12+
from ucumvert.xml_util import get_metric_units, get_non_metric_units, get_prefixes
1113

1214
# Some UCUM unit atoms are syntactically incompatible with pint. For these we
1315
# map to a pint-compatible unit name which we define in pint_ucum_defs.txt
14-
# as alias or new unit.
15-
# TODO Define the commented out units in pint_ucum_defs.txt
16+
# as alias or new unit. To determine what needs a mapping, use the function
17+
# "find_ucum_codes_that_need_mapping()" below.
1618

17-
mappings_ucum_to_pint = {
19+
MAPPINGS_UCUM_TO_PINT = {
1820
# "UCUM_unit_atom": "pint_unit_name_or_alias"
21+
# === prefixes ===
22+
# all good!
1923
# === metric units ===
2024
"cal_[20]": "cal_20",
2125
"cal_[15]": "cal_15",
26+
# m[H2O]
27+
# m[Hg]
28+
"g%": "g%", # invalid as unit name but correctly parsed as <Unit('gram * percent')>
29+
# B[SPL]
30+
# B[V]
31+
# B[mV]
32+
# B[uV]
33+
# B[10.nV]
34+
# B[W]
35+
# B[kW]
2236
# === non-metric units ===
2337
"10*": "_10",
2438
"10^": "_10",
39+
"%": "%", # invalid as unit name but correctly parsed as <Unit('percent')>
2540
"'": "minute",
2641
"''": "second",
2742
"[in_i'H2O]": "in_i_H2O",
@@ -58,13 +73,13 @@
5873
class UcumToPintTransformer(Transformer):
5974
def __init__(self, ureg=None):
6075
if ureg is None:
61-
self.ureg = pint.UnitRegistry()
62-
# Append the local definitions for ucum units to the default registry
63-
self.ureg.load_definitions(
64-
Path(__file__).resolve().parent / "pint_ucum_defs.txt"
65-
)
76+
self.ureg = pint.UnitRegistry(on_redefinition="raise")
6677
else:
6778
self.ureg = ureg
79+
# Append the local definitions for ucum units to the default registry
80+
self.ureg.load_definitions(
81+
Path(__file__).resolve().parent / "pint_ucum_defs.txt"
82+
)
6883

6984
def main_term(self, args):
7085
# print("DBGmt>", repr(args), len(args))
@@ -104,7 +119,7 @@ def simple_unit(self, args):
104119
return self.ureg(args[0] + args[1])
105120

106121
# Substitute UCUM atoms that cannot be defined in pint as units or aliases.
107-
return self.ureg(mappings_ucum_to_pint.get(args[0], args[0]))
122+
return self.ureg(MAPPINGS_UCUM_TO_PINT.get(args[0], args[0]))
108123

109124
def annotatable(self, args):
110125
# print("DBGan>", repr(args), len(args))
@@ -113,6 +128,133 @@ def annotatable(self, args):
113128
return args[0]
114129

115130

131+
class UcumToPintStrTransformer(Transformer):
132+
def main_term(self, args):
133+
# print("DBGmt>", repr(args), len(args))
134+
if len(args) == 2: # unary DIVIDE # noqa: PLR2004
135+
if getattr(args[1], "type", None): # no unit, only an ANNOTATION
136+
return "1" # will create <Quantity(1, 'dimensionless')>
137+
return f"(1 / {args[1]})"
138+
return f"({args[0]})"
139+
140+
def term(self, args):
141+
# print("DBGt>", repr(args), len(args))
142+
if len(args) == 3: # noqa: PLR2004
143+
if (
144+
getattr(args[0], "type", None) == "ANNOTATION"
145+
): # first term is annotation
146+
args[0] = "1"
147+
if (
148+
getattr(args[2], "type", None) == "ANNOTATION"
149+
): # second term is annotation
150+
args[2] = "1"
151+
if args[1] == ".": # multiplication
152+
return f"({args[0]} * {args[2]})"
153+
# division
154+
return f"({args[0]} / {args[2]})"
155+
return f"({args[0]})" # no operator, return single component
156+
157+
def component(self, args):
158+
# print("DBGc>", repr(args), len(args))
159+
if args[1].type == "ANNOTATION": # ignore annotations
160+
# print(f"dropping annotation: {args[1]}")
161+
return f"({args[0]})"
162+
return args[:]
163+
164+
def simple_unit(self, args):
165+
# print("DBGsu>", repr(args), len(args))
166+
if len(args) == 2: # prefix is present # noqa: PLR2004
167+
return f"({args[0]} + {args[1]})"
168+
169+
# Substitute UCUM atoms that cannot be defined in pint as units or aliases.
170+
ret = MAPPINGS_UCUM_TO_PINT.get(args[0], args[0])
171+
return f"({ret})"
172+
173+
def annotatable(self, args):
174+
# print("DBGan>", repr(args), len(args))
175+
if len(args) == 2: # exponent is present # noqa: PLR2004
176+
return f"{args[0]}**{int(args[1])}"
177+
return f"({args[0]})"
178+
179+
180+
def ucum_preprocessor(unit_input):
    """Convert a UCUM code into a string that pint can parse.

    The input is parsed with the UCUM parser and the resulting tree is
    rendered as a string by ``UcumToPintStrTransformer``.

    Usage:
        ureg = pint.UnitRegistry()
        ureg.preprocessors.append(ucum_preprocessor)
    """
    parser, to_pint_str = get_ucum_parser(), UcumToPintStrTransformer()
    return str(to_pint_str.transform(parser.parse(unit_input)))
194+
195+
196+
def find_ucum_codes_that_need_mapping(existing_mappings=MAPPINGS_UCUM_TO_PINT):
    """Report UCUM atoms that pint rejects as unit or prefix names.

    Every atom that is not already a key of ``existing_mappings`` is
    test-defined in a fresh pint registry. Atoms whose definition raises
    ``pint.DefinitionSyntaxError`` are printed and collected.

    Returns:
        A dict with the keys "prefixes", "metric" and "non-metric", each
        mapping to the list of atoms that need a mapping.
    """
    print("The following UCUM atoms must be mapped to valid pint unit names.")
    scratch_registry = pint.UnitRegistry()
    atom_sources = (
        ("prefixes", get_prefixes),
        ("metric", get_metric_units),
        ("non-metric", get_non_metric_units),
    )
    need_mappings = {name: [] for name, _ in atom_sources}
    for section, list_atoms in atom_sources:
        print(f"\n=== {section} ===")
        rejected = need_mappings[section]
        # Prefixes are test-defined with a trailing "-", units without it.
        suffix = "-" if section == "prefixes" else ""
        for ucum_code in list_atoms():
            if ucum_code in existing_mappings:
                continue
            try:
                scratch_registry.define(f"{ucum_code}{suffix} = 1")
            except pint.DefinitionSyntaxError:
                rejected.append(ucum_code)
                print(f"{ucum_code}")
        if not rejected:
            print("all good!")
    return need_mappings
223+
224+
225+
def find_matching_pint_definitions(ureg=None):
    """Print, for each UCUM prefix and unit atom, the pint quantity it maps to.

    Atoms are parsed with the UCUM parser and converted with
    ``UcumToPintTransformer``. Conversion failures are printed and skipped;
    a ``VisitError`` raised while parsing is printed and re-raised.

    Args:
        ureg: pint registry to look units up in; a new default
            ``pint.UnitRegistry`` is created when omitted.
    """
    registry = pint.UnitRegistry() if ureg is None else ureg
    atom_sources = (
        ("prefixes", get_prefixes),
        ("metric", get_metric_units),
        ("non-metric", get_non_metric_units),
    )
    parser = get_ucum_parser()
    to_pint = UcumToPintTransformer(ureg=registry)
    for section, list_atoms in atom_sources:
        print(f"\n=== {section} ===")
        is_prefix = section == "prefixes"
        for ucum_code in list_atoms():
            # Prefixes are looked up together with a unit ("m").
            lookup_str = f"{ucum_code} m" if is_prefix else ucum_code
            try:
                tree = parser.parse(lookup_str)
            except VisitError as exc:
                print(f"PARSER ERROR: {exc.args[0]}")
                raise
            try:
                pint_quantity = to_pint.transform(tree)
            except pint.UndefinedUnitError as exc:
                print(f"NOT DEFINED: {getattr(exc, 'msg', '')}")
            except VisitError as exc:
                # Only the last line of the wrapped error message is shown.
                print(f"TRANSFORM ERROR: {exc.args[0].splitlines()[-1]}")
            else:
                print(f"{ucum_code} --> {pint_quantity!r}")
256+
257+
116258
def run_examples():
117259
test_ucum_units = [
118260
# "Cel",
@@ -129,6 +271,21 @@ def run_examples():
129271
print(f"Pint {q!r}")
130272

131273

274+
def get_pint_registry(ureg=None):
    """Return a pint registry prepared for UCUM codes.

    The UCUM preprocessor is appended to the registry's preprocessors and the
    definitions from "pint_ucum_defs.txt" (located next to this module) are
    loaded into it.

    Args:
        ureg: registry to extend; when omitted a new
            ``pint.UnitRegistry(on_redefinition="raise")`` is created.
    """
    registry = pint.UnitRegistry(on_redefinition="raise") if ureg is None else ureg
    registry.preprocessors.append(ucum_preprocessor)
    ucum_definitions = Path(__file__).resolve().parent / "pint_ucum_defs.txt"
    registry.load_definitions(ucum_definitions)
    return registry
281+
282+
132283
if __name__ == "__main__":
133284
update_lark_ucum_grammar_file()
134-
run_examples()
285+
# run_examples()
286+
# find_ucum_codes_that_need_mapping()
287+
# find_matching_pint_definitions()
288+
289+
# ureg = get_pint_registry()
290+
# print(ureg("Cel"))
291+
# print(ureg("'"))

tests/conftest.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,14 @@ def ucum_parser():
1111

1212

1313
@pytest.fixture(scope="session")
14-
def transform():
14+
def ureg_std():
15+
import pint
16+
17+
return pint.UnitRegistry()
18+
19+
20+
@pytest.fixture(scope="session")
21+
def transform(ureg_std):
1522
from ucumvert import UcumToPintTransformer
1623

17-
return UcumToPintTransformer().transform
24+
return UcumToPintTransformer(ureg_std).transform

tests/test_ucum_pint.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from pint import UnitRegistry
33
from test_parser import ucum_examples_valid
44
from ucumvert import UcumToPintTransformer
5+
from ucumvert.ucum_pint import find_ucum_codes_that_need_mapping
56
from ucumvert.xml_util import get_metric_units, get_non_metric_units
67

78
ureg = UnitRegistry()
@@ -12,10 +13,17 @@ def get_unit_atoms():
1213
return get_metric_units() + get_non_metric_units()
1314

1415

15-
def test_ucum_to_pint(ucum_parser):
16+
def test_find_ucum_codes_that_need_mapping():
    """Check the per-section counts of atoms reported when no mappings exist."""
    found = find_ucum_codes_that_need_mapping(existing_mappings={})
    n_prefixes = len(found["prefixes"])
    n_metric = len(found["metric"])
    n_non_metric = len(found["non-metric"])
    assert n_prefixes == 0
    assert n_metric == 12  # noqa: PLR2004
    assert n_non_metric == 33  # noqa: PLR2004
20+
assert len(mappings["non-metric"]) == 33 # noqa: PLR2004
21+
22+
23+
def test_ucum_to_pint(ucum_parser, ureg_std):
1624
expected_quantity = ureg("millimeter")
1725
parsed_data = ucum_parser.parse("mm")
18-
result = UcumToPintTransformer().transform(parsed_data)
26+
result = UcumToPintTransformer(ureg=ureg_std).transform(parsed_data)
1927
assert result == expected_quantity
2028

2129

0 commit comments

Comments
 (0)