From cf86159113db213844f132eac865e7eb10cf3fa8 Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Thu, 14 Sep 2023 15:52:55 +0200 Subject: [PATCH 1/2] Move the dynamic pattern construction out of the hot loop --- sacremoses/tokenize.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/sacremoses/tokenize.py b/sacremoses/tokenize.py index 3709012..4ae3efa 100644 --- a/sacremoses/tokenize.py +++ b/sacremoses/tokenize.py @@ -660,6 +660,14 @@ class MosesDetokenizer(object): "|".join(FINNISH_MORPHSET_3), )) + IS_CURRENCY_SYMBOL = re.compile(r"^[{}\(\[\{{\¿\¡]+$".format(IsSc)) + + IS_ENGLISH_CONTRACTION = re.compile(r"^['][{}]".format(IsAlpha)) + + IS_FRENCH_CONTRACTION = re.compile(r"[{}][']$".format(IsAlpha)) + + STARTS_WITH_ALPHA = re.compile(r"^[{}]".format(IsAlpha)) + def __init__(self, lang="en"): super(MosesDetokenizer, self).__init__() self.lang = lang @@ -708,7 +716,7 @@ def tokenize(self, tokens, return_str=True, unescape=True): detokenized_text += prepend_space + token prepend_space = " " # If it's a currency symbol. - elif re.search(r"^[" + self.IsSc + r"\(\[\{\¿\¡]+$", token): + elif re.search(self.IS_CURRENCY_SYMBOL, token): # Perform right shift on currency and other random punctuation items detokenized_text += prepend_space + token prepend_space = "" @@ -724,7 +732,7 @@ def tokenize(self, tokens, return_str=True, unescape=True): elif ( self.lang == "en" and i > 0 - and re.search(r"^['][{}]".format(self.IsAlpha), token) + and re.search(self.IS_ENGLISH_CONTRACTION, token) ): # and re.search('[{}]$'.format(self.IsAlnum), tokens[i-1])): # For English, left-shift the contraction. 
@@ -747,8 +755,8 @@ def tokenize(self, tokens, return_str=True, unescape=True): elif ( self.lang in ["fr", "it", "ga"] and i <= len(tokens) - 2 - and re.search(r"[{}][']$".format(self.IsAlpha), token) - and re.search(r"^[{}]".format(self.IsAlpha), tokens[i + 1]) + and re.search(self.IS_FRENCH_CONTRACTION, token) + and re.search(self.STARTS_WITH_ALPHA, tokens[i + 1]) ): # If the next token is alpha. # For French and Italian, right-shift the contraction. detokenized_text += prepend_space + token @@ -757,7 +765,7 @@ def tokenize(self, tokens, return_str=True, unescape=True): elif ( self.lang == "cs" and i <= len(tokens) - 3 - and re.search(r"[{}][']$".format(self.IsAlpha), token) + and re.search(self.IS_FRENCH_CONTRACTION, token) and re.search(r"^[-–]$", tokens[i + 1]) and re.search(r"^li$|^mail.*", tokens[i + 2], re.IGNORECASE) ): # In Perl, ($words[$i+2] =~ /^li$|^mail.*/i) From 38d83b5e13bd95bd775eb7fab6d9e2bad318dd80 Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Thu, 14 Sep 2023 17:36:14 +0200 Subject: [PATCH 2/2] Lift a couple more hot regexps out of the loop --- sacremoses/tokenize.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/sacremoses/tokenize.py b/sacremoses/tokenize.py index 4ae3efa..634834f 100644 --- a/sacremoses/tokenize.py +++ b/sacremoses/tokenize.py @@ -668,6 +668,10 @@ class MosesDetokenizer(object): STARTS_WITH_ALPHA = re.compile(r"^[{}]".format(IsAlpha)) + IS_PUNCT = re.compile(r"^[\,\.\?\!\:\;\\\%\}\]\)]+$") + + IS_OPEN_QUOTE = re.compile(r"""^[\'\"„“`]+$""") + def __init__(self, lang="en"): super(MosesDetokenizer, self).__init__() self.lang = lang @@ -716,12 +720,12 @@ def tokenize(self, tokens, return_str=True, unescape=True): detokenized_text += prepend_space + token prepend_space = " " # If it's a currency symbol. 
- elif re.search(self.IS_CURRENCY_SYMBOL, token): + elif self.IS_CURRENCY_SYMBOL.search(token): # Perform right shift on currency and other random punctuation items detokenized_text += prepend_space + token prepend_space = "" - elif re.search(r"^[\,\.\?\!\:\;\\\%\}\]\)]+$", token): + elif self.IS_PUNCT.search(token): # In French, these punctuations are prefixed with a non-breakable space. if self.lang == "fr" and re.search(r"^[\?\!\:\;\\\%]$", token): detokenized_text += " " @@ -732,7 +736,7 @@ def tokenize(self, tokens, return_str=True, unescape=True): elif ( self.lang == "en" and i > 0 - and re.search(self.IS_ENGLISH_CONTRACTION, token) + and self.IS_ENGLISH_CONTRACTION.search(token) ): # and re.search('[{}]$'.format(self.IsAlnum), tokens[i-1])): # For English, left-shift the contraction. @@ -755,8 +759,8 @@ def tokenize(self, tokens, return_str=True, unescape=True): elif ( self.lang in ["fr", "it", "ga"] and i <= len(tokens) - 2 - and re.search(self.IS_FRENCH_CONTRACTION, token) - and re.search(self.STARTS_WITH_ALPHA, tokens[i + 1]) + and self.IS_FRENCH_CONTRACTION.search(token) + and self.STARTS_WITH_ALPHA.search(tokens[i + 1]) ): # If the next token is alpha. # For French and Italian, right-shift the contraction. detokenized_text += prepend_space + token @@ -765,7 +769,7 @@ def tokenize(self, tokens, return_str=True, unescape=True): elif ( self.lang == "cs" and i <= len(tokens) - 3 - and re.search(self.IS_FRENCH_CONTRACTION, token) + and self.IS_FRENCH_CONTRACTION.search(token) and re.search(r"^[-–]$", tokens[i + 1]) and re.search(r"^li$|^mail.*", tokens[i + 2], re.IGNORECASE) ): # In Perl, ($words[$i+2] =~ /^li$|^mail.*/i) @@ -775,7 +779,7 @@ def tokenize(self, tokens, return_str=True, unescape=True): prepend_space = "" # Combine punctuation smartly. 
- elif re.search(r"""^[\'\"„“`]+$""", token): + elif self.IS_OPEN_QUOTE.search(token): normalized_quo = token if re.search(r"^[„“”]+$", token): normalized_quo = '"' @@ -811,7 +815,7 @@ def tokenize(self, tokens, return_str=True, unescape=True): elif ( self.lang == "fi" and re.search(r":$", tokens[i - 1]) - and re.search(self.FINNISH_REGEX, token) + and self.FINNISH_REGEX.search(token) ): # Finnish : without intervening space if followed by case suffix # EU:N EU:n EU:ssa EU:sta EU:hun EU:iin ...