From 6f004305cdbc5379b8bd0f0b8823de418c0f453f Mon Sep 17 00:00:00 2001 From: Gianfranco Rossi Date: Thu, 29 Aug 2024 11:08:02 -0500 Subject: [PATCH] fix(scrapers.DupChecker): ensure raising SingleDuplicateError A bug was introduced in the previous modification, where SingleDuplicateError was not raised when an item was a duplicate. --- cl/scrapers/DupChecker.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/cl/scrapers/DupChecker.py b/cl/scrapers/DupChecker.py index 10cac56334..a5a5df729d 100644 --- a/cl/scrapers/DupChecker.py +++ b/cl/scrapers/DupChecker.py @@ -135,19 +135,20 @@ def press_on( else: already_scraped_next_date = True + # When in a full crawl, we do not raise a loop breaking + # `ConsecutiveDuplicatesError` if not self.full_crawl: if already_scraped_next_date: if self.court.pk == "mich": # Michigan sometimes has multiple occurrences of the # same case with different dates on a page. raise SingleDuplicateError(logger=logger) - else: - message = "Next case occurs prior to when we found a duplicate. Court is up to date." - raise ConsecutiveDuplicatesError(message, logger=logger) + + message = "Next case occurs prior to when we found a duplicate. Court is up to date." + raise ConsecutiveDuplicatesError(message, logger=logger) elif self.dup_count >= self.dup_threshold: message = f"Found {self.dup_count} duplicates in a row. Court is up to date." raise ConsecutiveDuplicatesError(message, logger=logger) - else: - # This is a full crawl. Do not raise a loop breaking `ConsecutiveDuplicatesError`, - # but say that we shouldn't press on, since the item already exists. - raise SingleDuplicateError(logger=logger) + + # Full crawl or not, this is a duplicate and we shouldn't store it + raise SingleDuplicateError(logger=logger)