feat(cl_back_scrape_citations): command to scrape citations #4303

Merged: 12 commits, Aug 22, 2024
Changes from 6 commits
127 changes: 127 additions & 0 deletions cl/scrapers/management/commands/cl_back_scrape_citations.py
@@ -0,0 +1,127 @@
"""
When opinions are first published on the courts' sites, they won't have
all their citations assigned. Some courts will publish the citations
on the same pages we scrape, but only months later.

This command re-uses the (back)scraper we use to get opinions in order
to collect the lagged citations and associate them with the Opinions we
first downloaded. If we find an Opinion we don't have in the database,
we ingest it as in a regular scrape.
"""

from django.utils.encoding import force_bytes

from cl.lib.command_utils import logger
from cl.lib.crypto import sha1
from cl.scrapers.management.commands import cl_back_scrape_opinions
from cl.scrapers.management.commands.cl_scrape_opinions import make_citation
from cl.scrapers.utils import get_binary_content
from cl.search.models import Citation, Opinion, OpinionCluster


class Command(cl_back_scrape_opinions.Command):
def scrape_court(self, site, full_crawl=False, ocr_available=True):
"""
If the scraped case has citation data
Check for Opinion existence via content hash
If we have the Opinion
if we don't have the citation -> ingest
if we already have the citation -> pass
If we don't have the Opinion
ingest the opinion with its citation, that is to say,
use the regular scraping process!

:param site: scraper object that has already downloaded
its case data
"""
missing_opinions = []
court_str = site.court_id.split(".")[-1].split("_")[0]

for case in site:
citation = case.get("citations")
parallel_citation = case.get("parallel_citations")
if not citation and not parallel_citation:
logger.debug(
"No citation, skipping row for case %s",
case.get("case_names"),
)
continue

content = get_binary_content(case["download_urls"], site)
if not content:
# Errors are logged by get_binary_content itself
continue
sha1_hash = sha1(force_bytes(content))

try:
cluster = Opinion.objects.get(sha1=sha1_hash).cluster
except Opinion.DoesNotExist:
missing_opinions.append(case)
logger.info(
"Case '%s', opinion '%s' has no matching hash in the DB. "
"Has a citation '%s'. Will try to ingest all objects",
case["case_names"],
case["download_urls"],
citation or parallel_citation,
)
continue

for cite in [citation, parallel_citation]:
if not cite:
continue

citation_candidate = make_citation(cite, cluster, court_str)
if not citation_candidate:
continue

if self.citation_is_duplicated(
citation_candidate, cluster, cite
):
continue

logger.info("Saving citation %s for cluster %s", cite, cluster)
citation_candidate.save()

# We don't have these opinions. Since we are backscraping, if the citation
# exists, it will be in the case dictionary, and will be saved in a
# regular ingestion process
if missing_opinions:
# It is easy to ingest a filtered list of cases for OpinionSiteLinear
# but not for plain OpinionSite
if hasattr(site, "cases"):
site.cases = missing_opinions
super().scrape_court(site, full_crawl=True)
else:
logger.info("Run the backscraper to collect missing opinions")

def citation_is_duplicated(
self, citation_candidate: Citation, cluster: OpinionCluster, cite: str
) -> bool:
"""Checks for exact or reporter duplication of citation in the cluster
Inspired on corpus_importer.utils.add_citations_to_cluster
"""
citation_params = {**citation_candidate.__dict__}
citation_params.pop("_state", "")
citation_params.pop("id", "")

# Exact duplication
if Citation.objects.filter(**citation_params).exists():
logger.info(
"Citation '%s' already exists for cluster %s",
cite,
cluster.id,
)
return True

# Duplication in the same reporter
if Citation.objects.filter(
cluster_id=cluster.id, reporter=citation_candidate.reporter
).exists():
logger.info(
"Another citation in the same reporter '%s' exists for cluster %s",
citation_candidate.reporter,
cluster.id,
)
return True

return False
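
The matching step the new command relies on, restated as a minimal sketch using only names that appear in the diff above (find_existing_cluster is a hypothetical helper, shown purely for illustration):

from django.utils.encoding import force_bytes

from cl.lib.crypto import sha1
from cl.search.models import Opinion


def find_existing_cluster(content: bytes):
    """Return the cluster of an already-ingested Opinion matching these bytes, or None."""
    # Same hash that cl_scrape_opinions stores on Opinion.sha1 at scrape time
    sha1_hash = sha1(force_bytes(content))
    try:
        return Opinion.objects.get(sha1=sha1_hash).cluster
    except Opinion.DoesNotExist:
        return None
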
12 changes: 2 additions & 10 deletions cl/scrapers/management/commands/cl_scrape_opinions.py
@@ -277,18 +277,10 @@ def scrape_court(self, site, full_crawl=False, ocr_available=True):
logger.debug(f"#{len(site)} opinions found.")
added = 0
for i, item in enumerate(site):
msg, r = get_binary_content(
item["download_urls"],
site,
method=site.method,
)
if msg:
fingerprint = [f"{court_str}-unexpected-content-type"]
logger.error(msg, extra={"fingerprint": fingerprint})
content = get_binary_content(item["download_urls"], site)
if not content:
continue

content = site.cleanup_content(r.content)

current_date = item["case_dates"]
try:
next_date = site[i + 1]["case_dates"]
12 changes: 2 additions & 10 deletions cl/scrapers/management/commands/cl_scrape_oral_arguments.py
@@ -125,18 +125,10 @@ def scrape_court(
if site.cookies:
logger.info(f"Using cookies: {site.cookies}")
for i, item in enumerate(site):
msg, r = get_binary_content(
item["download_urls"],
site,
method=site.method,
)
if msg:
fingerprint = [f"{court_str}-unexpected-content-type"]
logger.error(msg, extra={"fingerprint": fingerprint})
content = get_binary_content(item["download_urls"], site)
if not content:
continue

content = site.cleanup_content(r.content)

current_date = item["case_dates"]
try:
next_date = site[i + 1]["case_dates"]
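
Both scrape commands now share the same calling convention: get_binary_content logs its own errors (with a court-specific fingerprint) and returns the already-cleaned content, so callers only test for a falsy result. A sketch of that shared pattern, assuming site is an already-parsed juriscraper Site (iterate_site is illustrative, not part of the diff):

from cl.scrapers.utils import get_binary_content


def iterate_site(site):
    """Yield (item, cleaned content) pairs, skipping items that failed to download."""
    for item in site:
        content = get_binary_content(item["download_urls"], site)
        if not content:
            # get_binary_content already logged the failure; nothing to do here
            continue
        yield item, content
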
94 changes: 79 additions & 15 deletions cl/scrapers/tests.py
@@ -8,6 +8,7 @@
from django.conf import settings
from django.core.files.base import ContentFile
from django.utils.timezone import now
from juriscraper.AbstractSite import logger

from cl.alerts.factories import AlertFactory
from cl.alerts.models import Alert
@@ -21,15 +22,21 @@
from cl.lib.test_helpers import generate_docket_target_sources
from cl.scrapers.DupChecker import DupChecker
from cl.scrapers.management.commands import (
cl_back_scrape_citations,
cl_scrape_opinions,
cl_scrape_oral_arguments,
)
from cl.scrapers.models import UrlHash
from cl.scrapers.tasks import extract_doc_content, process_audio_file
from cl.scrapers.test_assets import test_opinion_scraper, test_oral_arg_scraper
from cl.scrapers.utils import get_binary_content, get_extension
from cl.search.factories import CourtFactory, DocketFactory
from cl.search.models import Court, Docket, Opinion
from cl.search.factories import (
CourtFactory,
DocketFactory,
OpinionClusterFactory,
OpinionFactory,
)
from cl.search.models import Citation, Court, Docket, Opinion
from cl.settings import MEDIA_ROOT
from cl.tests.cases import ESIndexTestCase, SimpleTestCase, TestCase
from cl.tests.fixtures import ONE_SECOND_MP3_BYTES, SMALL_WAV_BYTES
@@ -626,37 +633,94 @@ def setUp(self):
self.mock_response.content = b"not empty"
self.mock_response.headers = {"Content-Type": "application/pdf"}
self.site = test_opinion_scraper.Site()
self.site.method = "GET"
self.logger = logger

@mock.patch("requests.Session.get")
def test_unexpected_content_type(self, mock_get):
"""Test when content type doesn't match scraper expectation."""
mock_get.return_value = self.mock_response
self.site.expected_content_types = ["text/html"]

msg, _ = get_binary_content("/dummy/url/", self.site)
self.assertIn("UnexpectedContentTypeError:", msg)
with mock.patch.object(self.logger, "error") as error_mock:
get_binary_content("/dummy/url/", self.site)
self.assertIn(
"UnexpectedContentTypeError:", error_mock.call_args_list[0][0][0]
)

@mock.patch("requests.Session.get")
def test_correct_content_type(self, mock_get):
"""Test when content type matches scraper expectation."""
mock_get.return_value = self.mock_response
self.site.expected_content_types = ["application/pdf"]

msg, _ = get_binary_content("/dummy/url/", self.site)
self.assertEqual("", msg)
with mock.patch.object(self.logger, "error") as error_mock:
_ = get_binary_content("/dummy/url/", self.site)

self.mock_response.headers = {
"Content-Type": "application/pdf;charset=utf-8"
}
mock_get.return_value = self.mock_response
msg, _ = get_binary_content("/dummy/url/", self.site)
self.assertEqual("", msg)
self.mock_response.headers = {
"Content-Type": "application/pdf;charset=utf-8"
}
mock_get.return_value = self.mock_response
_ = get_binary_content("/dummy/url/", self.site)
error_mock.assert_not_called()

@mock.patch("requests.Session.get")
def test_no_content_type(self, mock_get):
"""Test for no content type expected (ie. Montana)"""
mock_get.return_value = self.mock_response
self.site.expected_content_types = None

msg, _ = get_binary_content("/dummy/url/", self.site)
self.assertEqual("", msg)
with mock.patch.object(self.logger, "error") as error_mock:
_ = get_binary_content("/dummy/url/", self.site)
error_mock.assert_not_called()


class ScrapeCitationsTestCase(TestCase):
"""This class only tests the update of existing clusters
Since the ingestion of new clusters and their citations call
super().scrape_court(), it should be tested in the superclass
"""

def setUp(self):
keys = [
"download_urls",
"case_names",
"citations",
"parallel_citations",
]
self.mock_site = mock.MagicMock()
self.mock_site.__iter__.return_value = [
# update
dict(zip(keys, ["", "something", "482 Md. 342", ""])),
# exact duplicate
dict(zip(keys, ["", "something", "", "482 Md. 342"])),
# reporter duplicate
dict(zip(keys, ["", "something", "485 Md. 111", ""])),
# no citation, ignore
dict(zip(keys, ["", "something", "", ""])),
]
self.mock_site.court_id = "juriscraper.md"
self.hash = "1234" * 10
self.hashes = [self.hash, self.hash, self.hash, "111"]

court = CourtFactory(id="md")
docket = DocketFactory(
case_name="Attorney Grievance v. Taniform",
docket_number="40ag/21",
court_id="md",
source=Docket.SCRAPER,
pacer_case_id=None,
)
self.cluster = OpinionClusterFactory(docket=docket)
opinion = OpinionFactory(sha1=self.hash, cluster=self.cluster)

def test_citation_scraper(self):
"""Test if citation scraper creates a citation or ignores duplicates"""
cmd = "cl.scrapers.management.commands.cl_back_scrape_citations"
with mock.patch(f"{cmd}.sha1", side_effect=self.hashes):
with mock.patch(
f"{cmd}.get_binary_content", return_value="placeholder"
):
cl_back_scrape_citations.Command().scrape_court(self.mock_site)

citations = Citation.objects.filter(cluster=self.cluster).count()
self.assertEqual(citations, 1, "Exactly 1 citation was expected")
37 changes: 22 additions & 15 deletions cl/scrapers/utils.py
@@ -155,30 +155,33 @@ def get_extension(content: bytes) -> str:
def get_binary_content(
download_url: str,
site: AbstractSite,
method: str = "GET",
) -> Tuple[str, Optional[Response]]:
) -> Optional[bytes | str]:
"""Downloads the file, covering a few special cases such as invalid SSL
certificates and empty file errors.

:param download_url: The URL for the item you wish to download.
:param site: Site object used to download data
:param method: The HTTP method used to get the item, or "LOCAL" to get an
item during testing

:return: Two values. The first is a msg indicating any errors encountered.
If blank, that indicates success. The second value is the response object
containing the downloaded file.
"""
court_str = site.court_id.split(".")[-1].split("_")[0]
fingerprint = [f"{court_str}-unexpected-content-type"]

if not download_url:
# Occurs when a DeferredList fetcher fails.
msg = f"NoDownloadUrlError: {download_url}\n{traceback.format_exc()}"
return msg, None
error = f"NoDownloadUrlError: {download_url}\n{traceback.format_exc()}"
logger.error(error, extra={"fingerprint": fingerprint})
return

# noinspection PyBroadException
if method == "LOCAL":
if site.method == "LOCAL":
# "LOCAL" is the method when testing
url = os.path.join(settings.MEDIA_ROOT, download_url)
mr = MockRequest(url=url)
r = mr.get()
r = follow_redirections(r, requests.Session())
r.raise_for_status()
s = requests.Session()
else:
# some sites require a custom ssl_context, contained in the Site's
# session. However, we can't send a request with both a
Expand All @@ -203,8 +206,9 @@ def get_binary_content(

# test for empty files (thank you CA1)
if len(r.content) == 0:
msg = f"EmptyFileError: {download_url}\n{traceback.format_exc()}"
return msg, None
error = f"EmptyFileError: {download_url}\n{traceback.format_exc()}"
logger.error(error, extra={"fingerprint": fingerprint})
return

# test for expected content type (thanks mont for nil)
if site.expected_content_types:
@@ -217,19 +221,22 @@
content_type in mime.lower()
for mime in site.expected_content_types
)

if not m:
msg = (
error = (
f"UnexpectedContentTypeError: {download_url}\n"
f'\'"{content_type}" not in {site.expected_content_types}'
)
return msg, None
logger.error(error, extra={"fingerprint": fingerprint})
return

# test for and follow meta redirects
r = follow_redirections(r, s)
r.raise_for_status()

# Success!
return "", r
content = site.cleanup_content(r.content)

return content


def signal_handler(signal, frame):
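
One behavioral detail worth calling out: the HTTP method is no longer a parameter, so the LOCAL branch used in tests is selected from the Site object itself, which is why the test setUp above sets self.site.method = "GET". A minimal sketch of exercising the LOCAL branch, assuming a fixture file exists under MEDIA_ROOT (the path below is hypothetical):

from cl.scrapers.test_assets import test_opinion_scraper
from cl.scrapers.utils import get_binary_content

site = test_opinion_scraper.Site()
site.method = "LOCAL"  # read by get_binary_content in place of the old `method` kwarg
# With method == "LOCAL", the URL is treated as a path relative to settings.MEDIA_ROOT.
content = get_binary_content("test/search/opinion.pdf", site)  # hypothetical fixture path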