feat(cl_back_scrape_citations): command to scrape citations #4303

Merged: 12 commits, Aug 22, 2024
Changes from 10 commits
99 changes: 48 additions & 51 deletions cl/scrapers/DupChecker.py
@@ -1,5 +1,9 @@
from juriscraper.AbstractSite import logger

from cl.scrapers.exceptions import (
ConsecutiveDuplicatesError,
SingleDuplicateError,
)
from cl.scrapers.models import UrlHash
from cl.search.models import Court

@@ -19,7 +23,6 @@ def __init__(
self.url_hash = None
self.dup_count = 0
self.last_found_date = None
self.emulate_break = False
super().__init__(*args, **kwargs)

def _increment(self, current_date):
@@ -83,29 +86,29 @@ def press_on(
lookup_by="sha1",
):
"""Checks if a we have an `object_type` with identical content in the CL
corpus by looking up `lookup_value` in the `lookup_by` field. Depending
on the result of that, we either return True or False. True represents
the fact that the next item should be processed. False means that either
the item was a duplicate or that we've hit so many duplicates that we've
stopped checking (we hit a duplicate threshold). Either way, the caller
should move to the next item and try it.
corpus by looking up `lookup_value` in the `lookup_by` field.

The effect of this is that this emulates for loop constructs for
continue (False), break (False), return (True).
If the item is not a duplicate, we will return None, and the caller
will proceed normally

If the item is a duplicate, we will raise SingleDuplicateError

If the item is a duplicate following a series of duplicates greater than
our tolerance threshold, we will raise ConsecutiveDuplicatesError

If the item is a duplicate and the next item is from an already scraped
date, we will raise ConsecutiveDuplicatesError

Following logic applies:
- if we do not have the item
- early return
- if we have the item already
- and if the next date is before this date
- or if our duplicate threshold is exceeded
- break
- otherwise
- continue
- if not
- carry on
"""
if self.emulate_break:
return False

# check for a duplicate in the db.
if lookup_by == "sha1":
exists = object_type.objects.filter(sha1=lookup_value).exists()
@@ -116,41 +119,35 @@
else:
raise NotImplementedError("Unknown lookup_by parameter.")

if exists:
logger.info(
f"Duplicate found on date: {current_date}, with lookup value: {lookup_value}"
)
self._increment(current_date)

# If the next date in the Site object is less than (before) the
# current date, we needn't continue because we should already have
# that item.
if next_date:
already_scraped_next_date = next_date < current_date
else:
already_scraped_next_date = True
if not self.full_crawl:
if already_scraped_next_date:
if self.court.pk == "mich":
# Michigan sometimes has multiple occurrences of the
# same case with different dates on a page.
return False
else:
logger.info(
"Next case occurs prior to when we found a "
"duplicate. Court is up to date."
)
self.emulate_break = True
return False
elif self.dup_count >= self.dup_threshold:
logger.info(
f"Found {self.dup_count} duplicates in a row. Court is up to date."
)
self.emulate_break = True
return False
else:
# This is a full crawl. Do not emulate a break, BUT be sure to
# say that we shouldn't press on, since the item already exists.
return False
if not exists:
return

logger.info(
f"Duplicate found on date: {current_date}, with lookup value: {lookup_value}"
)
self._increment(current_date)

# If the next date in the Site object is less than (before) the
# current date, we needn't continue because we should already have
# that item.
if next_date:
already_scraped_next_date = next_date < current_date
else:
already_scraped_next_date = True

if not self.full_crawl:
if already_scraped_next_date:
if self.court.pk == "mich":
# Michigan sometimes has multiple occurrences of the
# same case with different dates on a page.
raise SingleDuplicateError(logger=logger)
else:
message = "Next case occurs prior to when we found a duplicate. Court is up to date."
raise ConsecutiveDuplicatesError(message, logger=logger)
elif self.dup_count >= self.dup_threshold:
message = f"Found {self.dup_count} duplicates in a row. Court is up to date."
raise ConsecutiveDuplicatesError(message, logger=logger)
else:
return True
# This is a full crawl. Do not raise a loop breaking `ConsecutiveDuplicatesError`,
# but say that we shouldn't press on, since the item already exists.
raise SingleDuplicateError(logger=logger)
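
Note on the control-flow change above: `press_on` no longer returns True/False, so callers switch from checking a boolean to catching the new exceptions. Below is a minimal sketch of what a calling loop might look like under that assumption; the `ingest_case` helper and the `case` dict keys are illustrative, not the actual cl_scrape_opinions code.

```python
from cl.scrapers.DupChecker import DupChecker
from cl.scrapers.exceptions import (
    ConsecutiveDuplicatesError,
    SingleDuplicateError,
)
from cl.search.models import Opinion


def scrape_loop(site, court, ingest_case):
    """Illustrative caller: iterate a juriscraper site and ingest only new cases."""
    dup_checker = DupChecker(court, full_crawl=False)
    for case in site:
        try:
            # Returns None when the item is new; raises on duplicates.
            dup_checker.press_on(
                object_type=Opinion,
                current_date=case["case_dates"],   # illustrative dict key
                next_date=None,                    # omitted here for brevity
                lookup_value=case["sha1"],         # illustrative dict key
                lookup_by="sha1",
            )
        except SingleDuplicateError:
            continue  # replaces the old plain `return False` (skip this item)
        except ConsecutiveDuplicatesError:
            break  # replaces the old `emulate_break = True` path (court is up to date)
        ingest_case(case)
```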
82 changes: 82 additions & 0 deletions cl/scrapers/exceptions.py
@@ -0,0 +1,82 @@
import logging
from typing import Optional

from cl.lib.command_utils import logger


class AutoLoggingException(Exception):
"""Exception with defaults for logging, to be subclassed

We log expected exceptions to better understand what went wrong
Logger calls with level `logging.ERROR` are sent to Sentry, and
it's useful to send a `fingerprint` to force a specific grouping by court

Other `logger` calls are just printed on the console when using a
VerboseCommand with proper verbosity levels
"""

logging_level = logging.DEBUG
message = ""
logger = logger

def __init__(
self,
message: str = "",
logger: Optional[logging.Logger] = None,
logging_level: Optional[int] = None,
fingerprint: Optional[list[str]] = None,
):
if not message:
message = self.message
if not logger:
logger = self.logger
if not logging_level:
logging_level = self.logging_level

log_kwargs = {}
if fingerprint:
log_kwargs["extra"] = {"fingerprint": fingerprint}

logger.log(logging_level, message, **log_kwargs)
super().__init__(message)


class ConsecutiveDuplicatesError(AutoLoggingException):
"""Occurs when consecutive `SingleDuplicateError` are found,
which may be used as a signal to break the scraping loop
"""

message = "DupChecker emulate break triggered."


class SingleDuplicateError(AutoLoggingException):
"""Occurs when an opinion or audio file already exists
in our database
"""

message = "Skipping opinion due to duplicated content hash"


class BadContentError(AutoLoggingException):
"""Parent class for errors raised when downloading binary content"""


class UnexpectedContentTypeError(BadContentError):
"""Occurs when the content received from the server has
a different content type than the ones listed on
site.expected_content_types
"""

logging_level = logging.ERROR


class NoDownloadUrlError(BadContentError):
"""Occurs when a DeferredList fetcher fails."""

logging_level = logging.ERROR


class EmptyFileError(BadContentError):
"""Occurs when the content of the response has lenght 0"""

logging_level = logging.ERROR
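
Since `AutoLoggingException` logs from its `__init__`, raising any subclass is enough to emit a record at the configured level. Here is a hypothetical subclass and raise site, just to illustrate the fingerprint-based Sentry grouping described above; the class name, message, and fingerprint value are invented for the example.

```python
import logging

from cl.scrapers.exceptions import AutoLoggingException


class MissingDocketNumberError(AutoLoggingException):
    """Hypothetical example: a scraped row is missing its docket number."""

    # ERROR-level records are the ones forwarded to Sentry
    logging_level = logging.ERROR
    message = "Scraped case has no docket number"


def check_docket_number(case: dict, court_id: str) -> None:
    if not case.get("docket_numbers"):
        # The fingerprint forces Sentry to group these events per court
        raise MissingDocketNumberError(
            fingerprint=[f"{court_id}-missing-docket-number"]
        )
```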
148 changes: 148 additions & 0 deletions cl/scrapers/management/commands/cl_back_scrape_citations.py
@@ -0,0 +1,148 @@
"""
When opinions are first published on the courts' sites, they won't have
all their citations assigned. Some courts will publish the citations
in the same pages we scrape, but months later

This command re-uses the (back)scraper we use to get opinions, to get
the lagged citations and associate them with the Opinions we first
downloaded. If we find an Opinion we don't have in the database,
we ingest it as in a regular scrape
"""

from django.db import IntegrityError
from django.utils.encoding import force_bytes

from cl.lib.command_utils import logger
from cl.lib.crypto import sha1
from cl.scrapers.DupChecker import DupChecker
from cl.scrapers.exceptions import BadContentError
from cl.scrapers.management.commands import cl_back_scrape_opinions
from cl.scrapers.management.commands.cl_scrape_opinions import make_citation
from cl.scrapers.utils import get_binary_content
from cl.search.models import Citation, Court, Opinion


class Command(cl_back_scrape_opinions.Command):
def scrape_court(
self,
site,
full_crawl: bool = False,
ocr_available: bool = True,
backscrape: bool = False,
):
"""
If the scraped case has citation data
Check for Opinion existence via content hash
If we have the Opinion
if we don't have the citation -> ingest
if we already have the citation -> pass
If we don't have the Opinion
ingest the opinion with its citation, that is to say,
use the regular scraping process!

:param site: scraper object that has already downloaded
its case data
"""
court_str = site.court_id.split(".")[-1].split("_")[0]
court = Court.objects.get(id=court_str)
dup_checker = DupChecker(court, full_crawl=True)

for case in site:
citation = case.get("citations")
parallel_citation = case.get("parallel_citations")
if not citation and not parallel_citation:
logger.debug(
"No citation, skipping row for case %s",
case.get("case_names"),
)
continue

try:
content = get_binary_content(case["download_urls"], site)
except BadContentError:
continue

sha1_hash = sha1(force_bytes(content))

try:
cluster = Opinion.objects.get(sha1=sha1_hash).cluster
except Opinion.DoesNotExist:
# populate special key to avoid downloading the file again
case["content"] = content

logger.info(
"Case '%s', opinion '%s' has no matching hash in the DB. "
"Has a citation '%s'. Will try to ingest all objects",
case["case_names"],
case["download_urls"],
citation or parallel_citation,
)

self.ingest_a_case(case, None, True, site, dup_checker, court)
continue

for cite in [citation, parallel_citation]:
if not cite:
continue

citation_candidate = make_citation(cite, cluster, court_str)
if not citation_candidate:
continue

if self.citation_is_duplicated(citation_candidate, cite):
continue

try:
citation_candidate.save()
logger.info(
"Saved citation %s for cluster %s", cite, cluster
)
except IntegrityError:
logger.warning(
"Error when saving citation %s for cluster %s",
cite,
cluster,
)

def citation_is_duplicated(
self, citation_candidate: Citation, cite: str
) -> bool:
"""Checks if the citation is duplicated for the cluster

Following corpus_importer.utils.add_citations_to_cluster we
identify 2 types of duplication:
- exact: a citation with the same fields already exists for the cluster
- duplication in the same reporter: the cluster already has a citation
in that reporter

:param citation_candidate: the citation object
:param cite: citation string

:return: True if citation is duplicated, False if not
"""
citation_params = {**citation_candidate.__dict__}
citation_params.pop("_state", "")
citation_params.pop("id", "")
cluster_id = citation_candidate.cluster.id

# Exact duplication
if Citation.objects.filter(**citation_params).exists():
logger.info(
"Citation '%s' already exists for cluster %s",
cite,
cluster_id,
)
return True

# Duplication in the same reporter
if Citation.objects.filter(
cluster_id=cluster_id, reporter=citation_candidate.reporter
).exists():
logger.info(
"Another citation in the same reporter '%s' exists for cluster %s",
citation_candidate.reporter,
cluster_id,
)
return True

return False
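
A side note on the exact-duplicate check above: filtering with the candidate's `__dict__` works because, once the Django-internal `_state` entry and the not-yet-assigned `id` are stripped, the remaining keys are the model's concrete column values, so the filter matches only rows identical in every field. A standalone sketch of that pattern (the helper name is ours, not part of the PR):

```python
from cl.search.models import Citation


def exact_citation_exists(candidate: Citation) -> bool:
    """Return True if a Citation row with identical field values already exists."""
    params = {**candidate.__dict__}
    params.pop("_state", None)  # Django model bookkeeping, not a database column
    params.pop("id", None)      # the unsaved candidate has no primary key yet
    return Citation.objects.filter(**params).exists()
```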