From 6f7029a8d4f9f302c63ebacf5721e2284b31ebe2 Mon Sep 17 00:00:00 2001 From: Aleksandr Zdorovets Date: Wed, 22 Mar 2023 17:07:18 -0400 Subject: [PATCH 1/3] Moved the self.silent assignment to before it is used --- text_matcher/matcher.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/text_matcher/matcher.py b/text_matcher/matcher.py index 93b9ea4..92f3834 100644 --- a/text_matcher/matcher.py +++ b/text_matcher/matcher.py @@ -99,6 +99,7 @@ def __init__(self, textObjA, textObjB, threshold=3, cutoff=5, ngramSize=3, remov self.threshold = threshold self.ngramSize = ngramSize self.minDistance = minDistance + self.silent = silent self.textA = textObjA self.textB = textObjB @@ -121,8 +122,6 @@ def __init__(self, textObjA, textObjB, threshold=3, cutoff=5, ngramSize=3, remov self.numMatches = len(self.extended_matches) - self.silent = silent - def get_initial_matches(self): """ This does the main work of finding matching n-gram sequences between From de99008cdd235faa5aead33c9b3adc112a57b89f Mon Sep 17 00:00:00 2001 From: Aleksandr Zdorovets Date: Wed, 22 Mar 2023 17:10:21 -0400 Subject: [PATCH 2/3] When the texts are the same or nearly the same, an IndexError is raised. Per #15, add a bounds check to extend_matches(). Seems to work, but not thoroughly verified. --- text_matcher/matcher.py | 57 +++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/text_matcher/matcher.py b/text_matcher/matcher.py index 92f3834..4ca080c 100644 --- a/text_matcher/matcher.py +++ b/text_matcher/matcher.py @@ -246,34 +246,35 @@ def extend_matches(self, cutoff=0.4): extended = False for match in self.healed_matches: # Look one word before. 
- wordA = self.textAgrams[(match.a - 1)][0] - wordB = self.textBgrams[(match.b - 1)][0] - if self.edit_ratio(wordA, wordB) < cutoff: - if self.silent is not True: - print('Extending match backwards with words: %s %s' % - (wordA, wordB)) - match.a -= 1 - match.b -= 1 - match.sizeA += 1 - match.sizeB += 1 - match.extendedBackwards += 1 - extended = True - # Look one word after. - idxA = match.a + match.sizeA + 1 - idxB = match.b + match.sizeB + 1 - if idxA > len(self.textAgrams) - 1 or idxB > len(self.textBgrams) - 1: - # We've gone too far, and we're actually at the end of the text. - continue - wordA = self.textAgrams[idxA][-1] - wordB = self.textBgrams[idxB][-1] - if self.edit_ratio(wordA, wordB) < cutoff: - if self.silent is not True: - print('Extending match forwards with words: %s %s' % - (wordA, wordB)) - match.sizeA += 1 - match.sizeB += 1 - match.extendedForwards += 1 - extended = True + if match.a > 0 and match.b > 0: + wordA = self.textAgrams[(match.a - 1)][0] + wordB = self.textBgrams[(match.b - 1)][0] + if self.edit_ratio(wordA, wordB) < cutoff: + if self.silent is not True: + print('Extending match backwards with words: %s %s' % + (wordA, wordB)) + match.a -= 1 + match.b -= 1 + match.sizeA += 1 + match.sizeB += 1 + match.extendedBackwards += 1 + extended = True + # Look one word after. + idxA = match.a + match.sizeA + 1 + idxB = match.b + match.sizeB + 1 + if idxA > len(self.textAgrams) - 1 or idxB > len(self.textBgrams) - 1: + # We've gone too far, and we're actually at the end of the text. 
+ continue + wordA = self.textAgrams[idxA][-1] + wordB = self.textBgrams[idxB][-1] + if self.edit_ratio(wordA, wordB) < cutoff: + if self.silent is not True: + print('Extending match forwards with words: %s %s' % + (wordA, wordB)) + match.sizeA += 1 + match.sizeB += 1 + match.extendedForwards += 1 + extended = True if extended: # If we've gone through the whole list and there's nothing From 9e3c6f856e1337cda5de2c138503cc7c904dc4de Mon Sep 17 00:00:00 2001 From: Aleksandr Zdorovets Date: Mon, 27 Mar 2023 16:27:56 -0400 Subject: [PATCH 3/3] If texts are the same, there is an IndexError. Add a check to extend_matches() to avoid that --- text_matcher/matcher.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/text_matcher/matcher.py b/text_matcher/matcher.py index 4ca080c..a5ddac9 100644 --- a/text_matcher/matcher.py +++ b/text_matcher/matcher.py @@ -99,7 +99,6 @@ def __init__(self, textObjA, textObjB, threshold=3, cutoff=5, ngramSize=3, remov self.threshold = threshold self.ngramSize = ngramSize self.minDistance = minDistance - self.silent = silent self.textA = textObjA self.textB = textObjB @@ -122,6 +121,8 @@ def __init__(self, textObjA, textObjB, threshold=3, cutoff=5, ngramSize=3, remov self.numMatches = len(self.extended_matches) + self.silent = silent + def get_initial_matches(self): """ This does the main work of finding matching n-gram sequences between