diff --git a/sacremoses/normalize.py b/sacremoses/normalize.py
index b29d316..21e1e4f 100644
--- a/sacremoses/normalize.py
+++ b/sacremoses/normalize.py
@@ -132,6 +132,7 @@ def __init__(
         norm_numbers=True,
         pre_replace_unicode_punct=False,
         post_remove_control_chars=False,
+        perl_parity=False,
     ):
         """
         :param language: The two-letter language code.
@@ -142,7 +143,22 @@ def __init__(
         :type norm_quote_commas: bool
         :param norm_numbers: Normalize numbers
         :type norm_numbers: bool
+        :param perl_parity: Reproduce the output of the original Perl
+            normalization script exactly
+        :type perl_parity: bool
         """
+
+        if perl_parity:
+            # NORMALIZE_UNICODE and FRENCH_QUOTES are not created in
+            # __init__, so they are shared between instances. Patch
+            # per-instance copies; assigning into the shared lists would
+            # silently switch every other normalizer to the Perl rules.
+            self.NORMALIZE_UNICODE = list(self.NORMALIZE_UNICODE)
+            self.NORMALIZE_UNICODE[11] = ("’", r'"')
+            self.FRENCH_QUOTES = list(self.FRENCH_QUOTES)
+            self.FRENCH_QUOTES[0] = ("\u00A0«\u00A0", r' "')
+            self.FRENCH_QUOTES[3] = ("\u00A0»\u00A0", r'" ')
+
         self.substitutions = [
             self.EXTRA_WHITESPACE,
             self.NORMALIZE_UNICODE,
diff --git a/sacremoses/test/test_normalizer.py b/sacremoses/test/test_normalizer.py
index 74e1ff4..c6a0152 100644
--- a/sacremoses/test/test_normalizer.py
+++ b/sacremoses/test/test_normalizer.py
@@ -70,3 +70,17 @@ def test_normalization_pipeline(self):
         text = "0《123》 456% '' 【789】"
         expected = '0"123" 456% " [789]'
         assert moses_norm_unicode.normalize(text) == expected
+
+    def test_moses_normalize_with_perl_parity(self):
+        # Snapshot the shared rule tables so we can verify that enabling
+        # perl_parity on one instance does not leak into the class.
+        unicode_rules = list(MosesPunctNormalizer.NORMALIZE_UNICODE)
+        french_rules = list(MosesPunctNormalizer.FRENCH_QUOTES)
+
+        moses_perl_parity = MosesPunctNormalizer(perl_parity=True)
+        text = 'from the ‘bad bank’, Northern, wala\u00A0«\u00A0dox ci jawwu Les «\u00A0wagonways\u00A0»\u00A0étaient construits'
+        expected = '''from the 'bad bank," Northern, wala "dox ci jawwu Les "wagonways" étaient construits'''
+        assert moses_perl_parity.normalize(text) == expected
+
+        assert MosesPunctNormalizer.NORMALIZE_UNICODE == unicode_rules
+        assert MosesPunctNormalizer.FRENCH_QUOTES == french_rules