Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parse edits #179

Open
wants to merge 17 commits into
base: master
Choose a base branch
from
52 changes: 28 additions & 24 deletions chempy/util/parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,17 +52,17 @@ def _get_formula_parser():

BNF for simple chemical formula (no nesting)

integer :: '0'..'9'+
real :: '0'..'9'+ ['.' '0'..'9'+]
element :: 'A'..'Z' 'a'..'z'*
term :: element [integer]
term :: element [real]
formula :: term+


BNF for nested chemical formula

integer :: '0'..'9'+
real :: '0'..'9'+ ['.' '0'..'9'+]
element :: 'A'..'Z' 'a'..'z'*
term :: (element | '(' formula ')') [integer]
term :: (element | '(' formula ')') [real]
formula :: term+

Notes
Expand All @@ -80,14 +80,14 @@ def _get_formula_parser():
_p = __import__(parsing_library)
Forward, Group, OneOrMore = _p.Forward, _p.Group, _p.OneOrMore
Optional, ParseResults, Regex = _p.Optional, _p.ParseResults, _p.Regex
Suppress, Word, nums = _p.Suppress, _p.Word, _p.nums
Suppress, Word, nums, Combine = _p.Suppress, _p.Word, _p.nums, _p.Combine

LPAR, RPAR = map(Suppress, "()")
integer = Word(nums)
real = Combine(Word(nums) + Optional('.' + Word(nums)))

# add parse action to convert integers to ints, to support doing addition
# add parse action to convert reals to floats, to support doing addition
# and multiplication at parse time
integer.setParseAction(lambda t: int(t[0]))
real.setParseAction(lambda t: float(t[0]))

# element = Word(alphas.upper(), alphas.lower())
# or if you want to be more specific, use this Regex
Expand All @@ -101,7 +101,7 @@ def _get_formula_parser():
formula = Forward()

term = Group((element | Group(LPAR + formula + RPAR)("subgroup")) +
Optional(integer, default=1)("mult"))
Optional(real, default=1)("mult"))

# add parse actions for parse-time processing

Expand Down Expand Up @@ -205,6 +205,7 @@ def _formula_to_parts(formula, prefixes, suffixes):
def _parse_stoich(stoich):
    """Convert a stoichiometry string into a ``{key: multiplicity}`` dict.

    Each key is the 1-based position of the parsed element symbol in the
    module-level ``symbols`` sequence (presumably the atomic number --
    confirm against the definition of ``symbols``).  The multiplicity is
    whatever the formula parser yields for that element.

    Parameters
    ----------
    stoich: str
        Stoichiometry part of a formula (no charge, prefixes or suffixes).

    Returns
    -------
    dict
        Empty for the bare electron ``'e'``.

    """
    if stoich == 'e':  # special case, the electron is not an element
        return {}

    composition = {}
    for symbol, count in _get_formula_parser().parseString(stoich):
        # a later occurrence of the same symbol overwrites an earlier one
        composition[symbols.index(symbol) + 1] = count
    return composition

Expand All @@ -218,19 +219,22 @@ def _parse_stoich(stoich):
_latex_mapping = {k + '-': '\\' + k + '-' for k in _greek_letters}
_latex_mapping['epsilon-'] = '\\varepsilon-'
_latex_mapping['omicron-'] = 'o-'
_latex_mapping['.'] = '^\\bullet '
_latex_infix_mapping = {'.': '\\cdot '}
_latex_mapping['.'] = '.'
_latex_mapping[':'] = '\\mathpunct{:} '
_latex_infix_mapping = {':': '\\mathpunct{:} '}

_unicode_mapping = {k + '-': v + '-' for k, v in zip(_greek_letters, _greek_u)}
_unicode_mapping['.'] = u'⋅'
_unicode_infix_mapping = {'.': u'·'}
_unicode_mapping[':'] = u':'
_unicode_infix_mapping = {':': u':'}

_html_mapping = {k + '-': '&' + k + ';-' for k in _greek_letters}
_html_mapping['.'] = '⋅'
_html_mapping[':'] = ':'
_html_infix_mapping = _html_mapping


def _get_leading_integer(s):
def _get_leading_coeff(s):
m = re.findall(r'^\d+', s)
if len(m) == 0:
m = 1
Expand All @@ -254,30 +258,30 @@ def formula_to_composition(formula, prefixes=None,
formula: str
Chemical formula, e.g. 'H2O', 'Fe+3', 'Cl-'
prefixes: iterable strings
Prefixes to ignore, e.g. ('.', 'alpha-')
Prefixes to ignore, e.g. ('alpha-',)
suffixes: tuple of strings
Suffixes to ignore, e.g. ('(g)', '(s)')

Examples
--------
>>> formula_to_composition('NH4+') == {0: 1, 1: 4, 7: 1}
True
>>> formula_to_composition('.NHO-(aq)') == {0: -1, 1: 1, 7: 1, 8: 1}
>>> formula_to_composition(':NHO-(aq)') == {0: -1, 1: 1, 7: 1, 8: 1}
True
>>> formula_to_composition('Na2CO3.7H2O') == {11: 2, 6: 1, 8: 10, 1: 14}
>>> formula_to_composition('Na2CO3:7H2O') == {11: 2, 6: 1, 8: 10, 1: 14}
True

"""
if prefixes is None:
prefixes = _latex_mapping.keys()
stoich_tok, chg_tok = _formula_to_parts(formula, prefixes, suffixes)[:2]
tot_comp = {}
parts = stoich_tok.split('.')
parts = stoich_tok.split(':')
for idx, stoich in enumerate(parts):
if idx == 0:
m = 1
else:
m, stoich = _get_leading_integer(stoich)
m, stoich = _get_leading_coeff(stoich)
comp = _parse_stoich(stoich)
for k, v in comp.items():
if k not in tot_comp:
Expand Down Expand Up @@ -321,7 +325,7 @@ def _parse_multiplicity(strings, substance_keys=None):
elif len(items) == 2:
if items[1] not in result:
result[items[1]] = 0
result[items[1]] += float(items[0]) if '.' in items[0] or 'e' in items[0] else int(items[0])
result[items[1]] += float(items[0]) if ':' in items[0] or 'e' in items[0] else int(items[0])
else:
raise ValueError("To many parts in substring")
if substance_keys is not None:
Expand Down Expand Up @@ -398,14 +402,14 @@ def to_reaction(line, substance_keys, token, Cls, globals_=None, **kwargs):
def _formula_to_format(sub, sup, formula, prefixes=None,
infixes=None, suffixes=('(s)', '(l)', '(g)', '(aq)')):
parts = _formula_to_parts(formula, prefixes.keys(), suffixes)
stoichs = parts[0].split('.')
stoichs = parts[0].split(':')
string = ''
for idx, stoich in enumerate(stoichs):
if idx == 0:
m = 1
else:
m, stoich = _get_leading_integer(stoich)
string += _subs('.', infixes)
m, stoich = _get_leading_coeff(stoich)
string += _subs(':', infixes)
if m != 1:
string += str(m)
string += re.sub(r'([0-9]+)', lambda m: sub(m.group(1)), stoich)
Expand Down Expand Up @@ -445,8 +449,8 @@ def formula_to_latex(formula, prefixes=None, infixes=None, **kwargs):
'Fe(CN)_{6}^{2+}'
>>> formula_to_latex('Fe(CN)6+2(aq)')
'Fe(CN)_{6}^{2+}(aq)'
>>> formula_to_latex('.NHO-(aq)')
'^\\bullet NHO^{-}(aq)'
>>> formula_to_latex(':NHO-(aq)')
'\\mathpunct{:} NHO^{-}(aq)'
>>> formula_to_latex('alpha-FeOOH(s)')
'\\alpha-FeOOH(s)'

Expand Down
54 changes: 27 additions & 27 deletions chempy/util/tests/test_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@ def test_formula_to_composition():
assert formula_to_composition('SO4-2(aq)') == {0: -2, 8: 4, 16: 1}

# prefixes and suffixes
assert formula_to_composition('.NO2(g)') == {7: 1, 8: 2}
assert formula_to_composition('.NH2') == {1: 2, 7: 1}
assert formula_to_composition(':NO2(g)') == {7: 1, 8: 2}
assert formula_to_composition(':NH2') == {1: 2, 7: 1}
assert formula_to_composition('ONOOH') == {1: 1, 7: 1, 8: 3}
assert formula_to_composition('.ONOO') == {7: 1, 8: 3}
assert formula_to_composition('.NO3/2-') == {0: -2, 7: 1, 8: 3}
assert formula_to_composition('.NO3-2') == {0: -2, 7: 1, 8: 3}
assert formula_to_composition(':ONOO') == {7: 1, 8: 3}
assert formula_to_composition(':NO3/2-') == {0: -2, 7: 1, 8: 3}
assert formula_to_composition(':NO3-2') == {0: -2, 7: 1, 8: 3}

with pytest.raises(ValueError):
formula_to_composition('F-F')
Expand All @@ -51,7 +51,7 @@ def test_formula_to_composition():
assert formula_to_composition('epsilon-Zn(OH)2(s)') == {1: 2, 8: 2, 30: 1}

# crystal water
assert formula_to_composition('Na2CO3.7H2O(s)') == {11: 2, 6: 1, 8: 10, 1: 14}
assert formula_to_composition('Na2CO3:7H2O(s)') == {11: 2, 6: 1, 8: 10, 1: 14}


@requires(parsing_library)
Expand Down Expand Up @@ -123,17 +123,17 @@ def test_formula_to_latex():
assert formula_to_latex('NaCl(s)') == 'NaCl(s)'
assert formula_to_latex('e-(aq)') == 'e^{-}(aq)'
assert formula_to_latex('Ca+2(aq)') == 'Ca^{2+}(aq)'
assert formula_to_latex('.NO2(g)') == r'^\bullet NO_{2}(g)'
assert formula_to_latex('.NH2') == r'^\bullet NH_{2}'
assert formula_to_latex(':NO2(g)') == r'\mathpunct{:} NO_{2}(g)'
assert formula_to_latex(':NH2') == r'\mathpunct{:} NH_{2}'
assert formula_to_latex('ONOOH') == 'ONOOH'
assert formula_to_latex('.ONOO') == r'^\bullet ONOO'
assert formula_to_latex('.NO3/2-') == r'^\bullet NO_{3}^{2-}'
assert formula_to_latex('.NO3-2') == r'^\bullet NO_{3}^{2-}'
assert formula_to_latex(':ONOO') == r'\mathpunct{:} ONOO'
assert formula_to_latex(':NO3/2-') == r'\mathpunct{:} NO_{3}^{2-}'
assert formula_to_latex(':NO3-2') == r'\mathpunct{:} NO_{3}^{2-}'
assert formula_to_latex('alpha-FeOOH(s)') == r'\alpha-FeOOH(s)'
assert formula_to_latex('epsilon-Zn(OH)2(s)') == (
r'\varepsilon-Zn(OH)_{2}(s)')
assert formula_to_latex('Na2CO3.7H2O(s)') == r'Na_{2}CO_{3}\cdot 7H_{2}O(s)'
assert formula_to_latex('Na2CO3.1H2O(s)') == r'Na_{2}CO_{3}\cdot H_{2}O(s)'
assert formula_to_latex('Na2CO3:7H2O(s)') == r'Na_{2}CO_{3}\mathpunct{:} 7H_{2}O(s)'
assert formula_to_latex('Na2CO3:1H2O(s)') == r'Na_{2}CO_{3}\mathpunct{:} H_{2}O(s)'


@requires(parsing_library)
Expand All @@ -151,16 +151,16 @@ def test_formula_to_unicoce():
assert formula_to_unicode('NaCl(s)') == u'NaCl(s)'
assert formula_to_unicode('e-(aq)') == u'e⁻(aq)'
assert formula_to_unicode('Ca+2(aq)') == u'Ca²⁺(aq)'
assert formula_to_unicode('.NO2(g)') == u'NO₂(g)'
assert formula_to_unicode('.NH2') == u'NH₂'
assert formula_to_unicode(':NO2(g)') == u':NO₂(g)'
assert formula_to_unicode(':NH2') == u':NH₂'
assert formula_to_unicode('ONOOH') == u'ONOOH'
assert formula_to_unicode('.ONOO') == u'ONOO'
assert formula_to_unicode('.NO3/2-') == u'NO₃²⁻'
assert formula_to_unicode('.NO3-2') == u'NO₃²⁻'
assert formula_to_unicode(':ONOO') == u':ONOO'
assert formula_to_unicode(':NO3/2-') == u':NO₃²⁻'
assert formula_to_unicode(':NO3-2') == u':NO₃²⁻'
assert formula_to_unicode('alpha-FeOOH(s)') == u'α-FeOOH(s)'
assert formula_to_unicode('epsilon-Zn(OH)2(s)') == u'ε-Zn(OH)₂(s)'
assert formula_to_unicode('Na2CO3.7H2O(s)') == u'Na₂CO₃·7H₂O(s)'
assert formula_to_unicode('Na2CO3.1H2O(s)') == u'Na₂CO₃·H₂O(s)'
assert formula_to_unicode('Na2CO3:7H2O(s)') == u'Na₂CO₃:7H₂O(s)'
assert formula_to_unicode('Na2CO3:1H2O(s)') == u'Na₂CO₃:H₂O(s)'


@requires(parsing_library)
Expand All @@ -177,14 +177,14 @@ def test_formula_to_html():
assert formula_to_html('NaCl(s)') == 'NaCl(s)'
assert formula_to_html('e-(aq)') == 'e<sup>-</sup>(aq)'
assert formula_to_html('Ca+2(aq)') == 'Ca<sup>2+</sup>(aq)'
assert formula_to_html('.NO2(g)') == r'&sdot;NO<sub>2</sub>(g)'
assert formula_to_html('.NH2') == r'&sdot;NH<sub>2</sub>'
assert formula_to_html(':NO2(g)') == r'&#58;NO<sub>2</sub>(g)'
assert formula_to_html(':NH2') == r'&#58;NH<sub>2</sub>'
assert formula_to_html('ONOOH') == 'ONOOH'
assert formula_to_html('.ONOO') == r'&sdot;ONOO'
assert formula_to_html('.NO3/2-') == r'&sdot;NO<sub>3</sub><sup>2-</sup>'
assert formula_to_html('.NO3-2') == r'&sdot;NO<sub>3</sub><sup>2-</sup>'
assert formula_to_html(':ONOO') == r'&#58;ONOO'
assert formula_to_html(':NO3/2-') == r'&#58;NO<sub>3</sub><sup>2-</sup>'
assert formula_to_html(':NO3-2') == r'&#58;NO<sub>3</sub><sup>2-</sup>'
assert formula_to_html('alpha-FeOOH(s)') == r'&alpha;-FeOOH(s)'
assert formula_to_html('epsilon-Zn(OH)2(s)') == (
r'&epsilon;-Zn(OH)<sub>2</sub>(s)')
assert formula_to_html('Na2CO3.7H2O(s)') == 'Na<sub>2</sub>CO<sub>3</sub>&sdot;7H<sub>2</sub>O(s)'
assert formula_to_html('Na2CO3.1H2O(s)') == 'Na<sub>2</sub>CO<sub>3</sub>&sdot;H<sub>2</sub>O(s)'
assert formula_to_html('Na2CO3:7H2O(s)') == 'Na<sub>2</sub>CO<sub>3</sub>&#58;7H<sub>2</sub>O(s)'
assert formula_to_html('Na2CO3:1H2O(s)') == 'Na<sub>2</sub>CO<sub>3</sub>&#58;H<sub>2</sub>O(s)'