feat(cl_back_scrape_citations): command to scrape citations #4303

Merged: 12 commits, Aug 22, 2024
Changes from 6 commits
127 changes: 127 additions & 0 deletions cl/scrapers/management/commands/cl_back_scrape_citations.py
@@ -0,0 +1,127 @@
"""
When opinions are first published on the courts' sites, they won't have
all their citations assigned. Some courts will publish the citations
on the same pages we scrape, but only months later.

This command re-uses the (back)scraper we use to get opinions in order
to collect the lagged citations and associate them with the Opinions we
first downloaded. If we find an Opinion we don't have in the database,
we ingest it as in a regular scrape.
"""

from django.utils.encoding import force_bytes

from cl.lib.command_utils import logger
from cl.lib.crypto import sha1
from cl.scrapers.management.commands import cl_back_scrape_opinions
from cl.scrapers.management.commands.cl_scrape_opinions import make_citation
from cl.scrapers.utils import get_binary_content
from cl.search.models import Citation, Opinion, OpinionCluster


class Command(cl_back_scrape_opinions.Command):
def scrape_court(self, site, full_crawl=False, ocr_available=True):
"""
If the scraped case has citation data
Check for Opinion existence via content hash
If we have the Opinion
if we don't have the citation -> ingest
if we already have the citation -> pass
If we don't have the Opinion
ingest the opinion with its citation, that is to say,
use the regular scraping process!

:param site: scraper object that has already downloaded
its case data
"""
missing_opinions = []
court_str = site.court_id.split(".")[-1].split("_")[0]

for case in site:
citation = case.get("citations")
parallel_citation = case.get("parallel_citations")
if not citation and not parallel_citation:
logger.debug(
"No citation, skipping row for case %s",
case.get("case_names"),
)
continue

content = get_binary_content(case["download_urls"], site)
if not content:
# Errors are logged by get_binary_content itself
continue
sha1_hash = sha1(force_bytes(content))

try:
cluster = Opinion.objects.get(sha1=sha1_hash).cluster
except Opinion.DoesNotExist:
missing_opinions.append(case)
logger.info(
"Case '%s', opinion '%s' has no matching hash in the DB. "
"Has a citation '%s'. Will try to ingest all objects",
case["case_names"],
case["download_urls"],
citation or parallel_citation,
)
continue

for cite in [citation, parallel_citation]:
if not cite:
continue

citation_candidate = make_citation(cite, cluster, court_str)
if not citation_candidate:
continue

if self.citation_is_duplicated(
citation_candidate, cluster, cite
):
continue

logger.info("Saving citation %s for cluster %s", cite, cluster)
citation_candidate.save()

# We don't have these opinions. Since we are backscraping, if the citation
# exists, it will be in the case dictionary, and will be saved in a
# regular ingestion process
if missing_opinions:
# It is easy to ingest a filtered list of cases for OpinionSiteLinear
# but not for plain OpinionSite
if hasattr(site, "cases"):
site.cases = missing_opinions
super().scrape_court(site, full_crawl=True)
else:
logger.info("Run the backscraper to collect missing opinions")

def citation_is_duplicated(
self, citation_candidate: Citation, cluster: OpinionCluster, cite: str
) -> bool:
"""Checks for exact or reporter duplication of citation in the cluster
Inspired on corpus_importer.utils.add_citations_to_cluster
"""
citation_params = {**citation_candidate.__dict__}
citation_params.pop("_state", "")
citation_params.pop("id", "")

# Exact duplication
if Citation.objects.filter(**citation_params).exists():
logger.info(
"Citation '%s' already exists for cluster %s",
cite,
cluster.id,
)
return True

# Duplication in the same reporter
if Citation.objects.filter(
cluster_id=cluster.id, reporter=citation_candidate.reporter
).exists():
logger.info(
"Another citation in the same reporter '%s' exists for cluster %s",
citation_candidate.reporter,
cluster.id,
)
return True

return False
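
The matching step the new command relies on, restated as a minimal sketch using only names that appear in the diff above (find_existing_cluster is a hypothetical helper, shown purely for illustration):

from django.utils.encoding import force_bytes

from cl.lib.crypto import sha1
from cl.search.models import Opinion


def find_existing_cluster(content: bytes):
    """Return the cluster of an already-ingested Opinion matching these bytes, or None."""
    # Same hash that cl_scrape_opinions stores on Opinion.sha1 at scrape time
    sha1_hash = sha1(force_bytes(content))
    try:
        return Opinion.objects.get(sha1=sha1_hash).cluster
    except Opinion.DoesNotExist:
        return None
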
12 changes: 2 additions & 10 deletions cl/scrapers/management/commands/cl_scrape_opinions.py
@@ -277,18 +277,10 @@ def scrape_court(self, site, full_crawl=False, ocr_available=True):
logger.debug(f"#{len(site)} opinions found.")
added = 0
for i, item in enumerate(site):
msg, r = get_binary_content(
item["download_urls"],
site,
method=site.method,
)
if msg:
fingerprint = [f"{court_str}-unexpected-content-type"]
logger.error(msg, extra={"fingerprint": fingerprint})
content = get_binary_content(item["download_urls"], site)
if not content:
continue

content = site.cleanup_content(r.content)

current_date = item["case_dates"]
try:
next_date = site[i + 1]["case_dates"]
12 changes: 2 additions & 10 deletions cl/scrapers/management/commands/cl_scrape_oral_arguments.py
@@ -125,18 +125,10 @@ def scrape_court(
if site.cookies:
logger.info(f"Using cookies: {site.cookies}")
for i, item in enumerate(site):
msg, r = get_binary_content(
item["download_urls"],
site,
method=site.method,
)
if msg:
fingerprint = [f"{court_str}-unexpected-content-type"]
logger.error(msg, extra={"fingerprint": fingerprint})
content = get_binary_content(item["download_urls"], site)
if not content:
continue

content = site.cleanup_content(r.content)

current_date = item["case_dates"]
try:
next_date = site[i + 1]["case_dates"]
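
Both scrape commands now share the same calling convention: get_binary_content logs its own errors (with a court-specific fingerprint) and returns the already-cleaned content, so callers only test for a falsy result. A sketch of that shared pattern, assuming site is an already-parsed juriscraper Site (iterate_site is illustrative, not part of the diff):

from cl.scrapers.utils import get_binary_content


def iterate_site(site):
    """Yield (item, cleaned content) pairs, skipping items that failed to download."""
    for item in site:
        content = get_binary_content(item["download_urls"], site)
        if not content:
            # get_binary_content already logged the failure; nothing to do here
            continue
        yield item, content
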
94 changes: 79 additions & 15 deletions cl/scrapers/tests.py
@@ -8,6 +8,7 @@
from django.conf import settings
from django.core.files.base import ContentFile
from django.utils.timezone import now
from juriscraper.AbstractSite import logger

from cl.alerts.factories import AlertFactory
from cl.alerts.models import Alert
@@ -21,15 +22,21 @@
from cl.lib.test_helpers import generate_docket_target_sources
from cl.scrapers.DupChecker import DupChecker
from cl.scrapers.management.commands import (
cl_back_scrape_citations,
cl_scrape_opinions,
cl_scrape_oral_arguments,
)
from cl.scrapers.models import UrlHash
from cl.scrapers.tasks import extract_doc_content, process_audio_file
from cl.scrapers.test_assets import test_opinion_scraper, test_oral_arg_scraper
from cl.scrapers.utils import get_binary_content, get_extension
from cl.search.factories import CourtFactory, DocketFactory
from cl.search.models import Court, Docket, Opinion
from cl.search.factories import (
CourtFactory,
DocketFactory,
OpinionClusterFactory,
OpinionFactory,
)
from cl.search.models import Citation, Court, Docket, Opinion
from cl.settings import MEDIA_ROOT
from cl.tests.cases import ESIndexTestCase, SimpleTestCase, TestCase
from cl.tests.fixtures import ONE_SECOND_MP3_BYTES, SMALL_WAV_BYTES
@@ -626,37 +633,94 @@ def setUp(self):
self.mock_response.content = b"not empty"
self.mock_response.headers = {"Content-Type": "application/pdf"}
self.site = test_opinion_scraper.Site()
self.site.method = "GET"
self.logger = logger

@mock.patch("requests.Session.get")
def test_unexpected_content_type(self, mock_get):
"""Test when content type doesn't match scraper expectation."""
mock_get.return_value = self.mock_response
self.site.expected_content_types = ["text/html"]

msg, _ = get_binary_content("/dummy/url/", self.site)
self.assertIn("UnexpectedContentTypeError:", msg)
with mock.patch.object(self.logger, "error") as error_mock:
get_binary_content("/dummy/url/", self.site)
self.assertIn(
"UnexpectedContentTypeError:", error_mock.call_args_list[0][0][0]
)

@mock.patch("requests.Session.get")
def test_correct_content_type(self, mock_get):
"""Test when content type matches scraper expectation."""
mock_get.return_value = self.mock_response
self.site.expected_content_types = ["application/pdf"]

msg, _ = get_binary_content("/dummy/url/", self.site)
self.assertEqual("", msg)
with mock.patch.object(self.logger, "error") as error_mock:
_ = get_binary_content("/dummy/url/", self.site)

self.mock_response.headers = {
"Content-Type": "application/pdf;charset=utf-8"
}
mock_get.return_value = self.mock_response
msg, _ = get_binary_content("/dummy/url/", self.site)
self.assertEqual("", msg)
self.mock_response.headers = {
"Content-Type": "application/pdf;charset=utf-8"
}
mock_get.return_value = self.mock_response
_ = get_binary_content("/dummy/url/", self.site)
error_mock.assert_not_called()

@mock.patch("requests.Session.get")
def test_no_content_type(self, mock_get):
"""Test for no content type expected (ie. Montana)"""
mock_get.return_value = self.mock_response
self.site.expected_content_types = None

msg, _ = get_binary_content("/dummy/url/", self.site)
self.assertEqual("", msg)
with mock.patch.object(self.logger, "error") as error_mock:
_ = get_binary_content("/dummy/url/", self.site)
error_mock.assert_not_called()


class ScrapeCitationsTestCase(TestCase):
"""This class only tests the update of existing clusters
Since the ingestion of new clusters and their citations call
super().scrape_court(), it should be tested in the superclass
"""

def setUp(self):
keys = [
"download_urls",
"case_names",
"citations",
"parallel_citations",
]
self.mock_site = mock.MagicMock()
self.mock_site.__iter__.return_value = [
# update
dict(zip(keys, ["", "something", "482 Md. 342", ""])),
# exact duplicate
dict(zip(keys, ["", "something", "", "482 Md. 342"])),
# reporter duplicate
dict(zip(keys, ["", "something", "485 Md. 111", ""])),
# no citation, ignore
dict(zip(keys, ["", "something", "", ""])),
]
self.mock_site.court_id = "juriscraper.md"
self.hash = "1234" * 10
self.hashes = [self.hash, self.hash, self.hash, "111"]

court = CourtFactory(id="md")
docket = DocketFactory(
case_name="Attorney Grievance v. Taniform",
docket_number="40ag/21",
court_id="md",
source=Docket.SCRAPER,
pacer_case_id=None,
)
self.cluster = OpinionClusterFactory(docket=docket)
opinion = OpinionFactory(sha1=self.hash, cluster=self.cluster)

def test_citation_scraper(self):
"""Test if citation scraper creates a citation or ignores duplicates"""
cmd = "cl.scrapers.management.commands.cl_back_scrape_citations"
with mock.patch(f"{cmd}.sha1", side_effect=self.hashes):
with mock.patch(
f"{cmd}.get_binary_content", return_value="placeholder"
):
cl_back_scrape_citations.Command().scrape_court(self.mock_site)

citations = Citation.objects.filter(cluster=self.cluster).count()
self.assertEqual(citations, 1, "Exactly 1 citation was expected")
37 changes: 22 additions & 15 deletions cl/scrapers/utils.py
@@ -155,30 +155,33 @@ def get_extension(content: bytes) -> str:
def get_binary_content(
download_url: str,
site: AbstractSite,
method: str = "GET",
) -> Tuple[str, Optional[Response]]:
) -> Optional[bytes | str]:
"""Downloads the file, covering a few special cases such as invalid SSL
certificates and empty file errors.

:param download_url: The URL for the item you wish to download.
:param site: Site object used to download data
:param method: The HTTP method used to get the item, or "LOCAL" to get an
item during testing

:return: Two values. The first is a msg indicating any errors encountered.
If blank, that indicates success. The second value is the response object
containing the downloaded file.
"""
court_str = site.court_id.split(".")[-1].split("_")[0]
fingerprint = [f"{court_str}-unexpected-content-type"]

if not download_url:
# Occurs when a DeferredList fetcher fails.
msg = f"NoDownloadUrlError: {download_url}\n{traceback.format_exc()}"
return msg, None
error = f"NoDownloadUrlError: {download_url}\n{traceback.format_exc()}"
logger.error(error, extra={"fingerprint": fingerprint})
return

# noinspection PyBroadException
if method == "LOCAL":
if site.method == "LOCAL":
# "LOCAL" is the method when testing
url = os.path.join(settings.MEDIA_ROOT, download_url)
mr = MockRequest(url=url)
r = mr.get()
r = follow_redirections(r, requests.Session())
r.raise_for_status()
s = requests.Session()
else:
# some sites require a custom ssl_context, contained in the Site's
# session. However, we can't send a request with both a
Expand All @@ -203,8 +206,9 @@ def get_binary_content(

# test for empty files (thank you CA1)
if len(r.content) == 0:
msg = f"EmptyFileError: {download_url}\n{traceback.format_exc()}"
return msg, None
error = f"EmptyFileError: {download_url}\n{traceback.format_exc()}"
logger.error(error, extra={"fingerprint": fingerprint})
return

# test for expected content type (thanks mont for nil)
if site.expected_content_types:
@@ -217,19 +221,22 @@
content_type in mime.lower()
for mime in site.expected_content_types
)

if not m:
msg = (
error = (
f"UnexpectedContentTypeError: {download_url}\n"
f'\'"{content_type}" not in {site.expected_content_types}'
)
return msg, None
logger.error(error, extra={"fingerprint": fingerprint})
return

# test for and follow meta redirects
r = follow_redirections(r, s)
r.raise_for_status()

# Success!
return "", r
content = site.cleanup_content(r.content)

return content


def signal_handler(signal, frame):
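
One behavioral detail worth calling out: the HTTP method is no longer a parameter, so the LOCAL branch used in tests is selected from the Site object itself, which is why the test setUp above sets self.site.method = "GET". A minimal sketch of exercising the LOCAL branch, assuming a fixture file exists under MEDIA_ROOT (the path below is hypothetical):

from cl.scrapers.test_assets import test_opinion_scraper
from cl.scrapers.utils import get_binary_content

site = test_opinion_scraper.Site()
site.method = "LOCAL"  # read by get_binary_content in place of the old `method` kwarg
# With method == "LOCAL", the URL is treated as a path relative to settings.MEDIA_ROOT.
content = get_binary_content("test/search/opinion.pdf", site)  # hypothetical fixture path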