Skip to content

Commit

Permalink
Merge pull request #146 from NIXBLACK11/parity-with-perl-normalize
Browse files Browse the repository at this point in the history
Parity with perl normalize.
  • Loading branch information
jelmervdl authored Oct 26, 2023
2 parents 303ae7f + 4164186 commit 8d32b0e
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 0 deletions.
9 changes: 9 additions & 0 deletions sacremoses/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ def __init__(
norm_numbers=True,
pre_replace_unicode_punct=False,
post_remove_control_chars=False,
perl_parity=False
):
"""
:param language: The two-letter language code.
Expand All @@ -142,7 +143,15 @@ def __init__(
:type norm_quote_commas: bool
:param norm_numbers: Normalize numbers
:type norm_numbers: bool
:param perl_parity: exact parity with perl script
:type perl_parity: bool
"""

if perl_parity:
self.NORMALIZE_UNICODE[11] = ("’", r'"')
self.FRENCH_QUOTES[0] = ("\u00A0«\u00A0", r' "')
self.FRENCH_QUOTES[3] = ("\u00A0»\u00A0", r'" ')

self.substitutions = [
self.EXTRA_WHITESPACE,
self.NORMALIZE_UNICODE,
Expand Down
6 changes: 6 additions & 0 deletions sacremoses/test/test_normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,9 @@ def test_normalization_pipeline(self):
text = "0《123》 456% '' 【789】"
expected = '0"123" 456% " [789]'
assert moses_norm_unicode.normalize(text) == expected

def test_moses_normalize_with_perl_parity(self):
    """Normalize a sample with curly and French quotes using ``perl_parity=True``.

    The input mixes curly single quotes with guillemets surrounded by
    non-breaking spaces; the normalized result is compared against the
    expected string below.
    """
    source_text = 'from the ‘bad bank’, Northern, wala\u00A0«\u00A0dox ci jawwu Les «\u00A0wagonways\u00A0»\u00A0étaient construits'
    wanted = '''from the 'bad bank," Northern, wala "dox ci jawwu Les "wagonways" étaient construits'''

    # Build the normalizer with the perl-parity substitutions switched on.
    normalizer = MosesPunctNormalizer(perl_parity=True)
    normalized = normalizer.normalize(source_text)

    assert normalized == wanted

0 comments on commit 8d32b0e

Please sign in to comment.