Skip to content

Commit

Permalink
Distinguish short and long prefixes in parser
Browse files Browse the repository at this point in the history
With this change, "dar" no longer requires special casing.
  • Loading branch information
dalito committed Jan 7, 2024
1 parent f6af89e commit 47aeebd
Showing 1 changed file with 20 additions and 13 deletions.
33 changes: 20 additions & 13 deletions src/ucumvert/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,15 @@
# """

# Below is a fixed grammar that can parse all UCUM units in the official UCUM examples
# and also fixes some more edge cases not present in the official examples.
#
# Changes made:
# - to fix "100/{cells}" issue, we moved FACTOR from component to the simple_unit rule
# - to fix "(8.h){shift}" issue, we moved "(" term ")" from component to the annotatable rule
# - Don't allow "0" as EXPONENT or FACTOR, see https://github.com/ucum-org/ucum/issues/121

# - Special case parsing of "dar" as deci-are instead of deca-r which does not exist.
# - Distinguish short prefixes (1 char) from long ones to handle parsing of "dar" as deci-are
# instead of deca-r, which does not exist.

UCUM_GRAMMAR = """
main_term: DIVIDE term
Expand All @@ -82,19 +84,19 @@
| "(" term ")"
| "(" component ")"
simple_unit: METRIC
| SHORT_PREFIX METRIC
| LONG_PREFIX METRIC
| NON_METRIC
| PREFIX? METRIC
| FACTOR
| EXCEPTIONS -> special
ANNOTATION: "{{" STRING "}}"
STRING: /[\x21-\x7a|~]*/ // ASCII chars 33-126 without curly braces
OPERATOR: "." | DIVIDE
DIVIDE: "/"
PREFIX: {prefix_rule}
METRIC: {metric_rule}
NON_METRIC: {non_metric_rule}
EXCEPTIONS: "dar"
SHORT_PREFIX: {short_prefix_atoms}
LONG_PREFIX: {long_prefix_atoms}
METRIC: {metric_atoms}
NON_METRIC: {non_metric_atoms}
EXPONENT : ["+"|"-"] NON_ZERO_DIGITS
FACTOR: NON_ZERO_DIGITS
Expand Down Expand Up @@ -204,14 +206,19 @@ def NON_METRIC(self, args):


def ucum_parser(ucum_grammar_template=UCUM_GRAMMAR):
    """Build a Lark parser for UCUM unit expressions.

    The placeholders of the grammar template are filled with alternations of
    quoted atoms obtained from ``get_prefixes()``, ``get_base_units()``,
    ``get_metric_units()`` and ``get_non_metric_units()``. Prefixes are split
    by length into one-character ("short") and multi-character ("long")
    groups so the grammar can refer to them as separate terminals.

    Args:
        ucum_grammar_template: Grammar text containing ``str.format``
            placeholders. The current names are ``short_prefix_atoms``,
            ``long_prefix_atoms``, ``metric_atoms`` and ``non_metric_atoms``;
            the earlier names ``prefix_rule``, ``metric_rule`` and
            ``non_metric_rule`` are still supplied for older templates.

    Returns:
        A ``Lark`` instance created with ``start="main_term"`` and
        ``strict=True``.
    """

    def _alternation(atoms):
        # Render atoms as a grammar alternation of string literals: "a" | "b"
        return " | ".join(f'"{atom}"' for atom in atoms)

    prefixes = list(get_prefixes())
    metric_atoms = _alternation(get_base_units() + get_metric_units())
    non_metric_atoms = _alternation(get_non_metric_units())

    ucum_grammar = ucum_grammar_template.format(
        short_prefix_atoms=_alternation(p for p in prefixes if len(p) == 1),
        long_prefix_atoms=_alternation(p for p in prefixes if len(p) > 1),
        metric_atoms=metric_atoms,
        non_metric_atoms=non_metric_atoms,
        # Former placeholder names: str.format() ignores keywords the template
        # does not use, so passing them keeps older templates working.
        prefix_rule=_alternation(prefixes),
        metric_rule=metric_atoms,
        non_metric_rule=non_metric_atoms,
    )
    return Lark(ucum_grammar, start="main_term", strict=True)

Expand Down

0 comments on commit 47aeebd

Please sign in to comment.