Skip to content

Commit

Permalink
Merge pull request #4375 from grossir/fix_dup_checker
Browse files (browse the repository at this point in the history)
fix(scrapers.DupChecker): ensure raising SingleDuplicateError
  • Loading branch information
mlissner authored Aug 29, 2024
2 parents dc84996 + 881b857 commit a2cf919
Showing 1 changed file with 8 additions and 7 deletions.
15 changes: 8 additions & 7 deletions cl/scrapers/DupChecker.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -135,19 +135,20 @@ def press_on(
else:
already_scraped_next_date = True

# When in a full crawl, we do not raise a loop breaking
# `ConsecutiveDuplicatesError`
if not self.full_crawl:
if already_scraped_next_date:
if self.court.pk == "mich":
# Michigan sometimes has multiple occurrences of the
# same case with different dates on a page.
raise SingleDuplicateError(logger=logger)
else:
message = "Next case occurs prior to when we found a duplicate. Court is up to date."
raise ConsecutiveDuplicatesError(message, logger=logger)

message = "Next case occurs prior to when we found a duplicate. Court is up to date."
raise ConsecutiveDuplicatesError(message, logger=logger)
elif self.dup_count >= self.dup_threshold:
message = f"Found {self.dup_count} duplicates in a row. Court is up to date."
raise ConsecutiveDuplicatesError(message, logger=logger)
else:
# This is a full crawl. Do not raise a loop breaking `ConsecutiveDuplicatesError`,
# but say that we shouldn't press on, since the item already exists.
raise SingleDuplicateError(logger=logger)

# Full crawl or not, this is a duplicate and we shouldn't store it
raise SingleDuplicateError(logger=logger)

0 comments on commit a2cf919

Please sign in to comment.