Merge pull request #13 from datamade/chancery
Implement chancery scraper and add case hashing
antidipyramid authored Nov 28, 2023
2 parents 4b196e2 + 006423f commit b707709
Showing 14 changed files with 523 additions and 167 deletions.
7 changes: 7 additions & 0 deletions .flake8
@@ -0,0 +1,7 @@
[flake8]
exclude =
    venv,
# So flake8 plays nicely with black
# https://black.readthedocs.io/en/stable/guides/using_black_with_other_tools.html
max-line-length = 88
extend-ignore = E203
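
For context on the two settings above: black wraps lines at 88 characters, and for compound slice bounds it emits whitespace before the colon, which plain flake8 flags as E203. A quick illustration (hypothetical snippet, not from this repo):

# black's preferred style for compound slice bounds; without
# "extend-ignore = E203", flake8 would flag the space before ":".
ham = list(range(10))
lower, upper, offset = 2, 8, 1
print(ham[lower + offset : upper + offset])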
6 changes: 6 additions & 0 deletions .gitignore
@@ -1,5 +1,11 @@
# Data files
scrape/*.json
*.csv
*.jl
*.db

# Perl
*.pl

# Byte-compiled / optimized / DLL files
__pycache__/
23 changes: 23 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,23 @@
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0
    hooks:
      - id: check-yaml
      - id: end-of-file-fixer
      - id: trailing-whitespace

  - repo: https://github.com/psf/black
    rev: 23.10.1
    hooks:
      - id: black

  - repo: https://github.com/pycqa/flake8
    rev: "7ef0350"
    hooks:
      - id: flake8
        args: [--config=.flake8]

  - repo: https://github.com/sqlfluff/sqlfluff
    rev: 2.3.5
    hooks:
      - id: sqlfluff-lint
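
Once defined, these hooks run on every commit after a one-time install; the standard pre-commit CLI invocations (assuming pre-commit is installed in the environment) are:

pre-commit install          # register the git hook once per clone
pre-commit run --all-files  # lint and format the whole tree on demand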
68 changes: 68 additions & 0 deletions .sqlfluff
@@ -0,0 +1,68 @@
[sqlfluff]

# Supported dialects https://docs.sqlfluff.com/en/stable/dialects.html
# Or run 'sqlfluff dialects'
dialect = sqlite

# One of [raw|jinja|python|placeholder]
templater = jinja

# Comma separated list of rules to exclude, or None
# See https://docs.sqlfluff.com/en/stable/configuration.html#enabling-and-disabling-rules
# AM04 (ambiguous.column_count) and ST06 (structure.column_order) are
# two of the more controversial rules included to illustrate usage.
exclude_rules = ambiguous.column_count, structure.column_order

# The standard max_line_length is 80 in line with the convention of
# other tools and several style guides. Many projects however prefer
# something a little longer.
# Set to zero or negative to disable checks.
max_line_length = 80

# CPU processes to use while linting.
# The default is "single threaded" to allow easy debugging, but this
# is often undesirable at scale.
# If positive, just implies number of processes.
# If negative or zero, implies number_of_cpus - specified_number.
# e.g. -1 means use all processors but one. 0 means all cpus.
processes = -1

# If using the dbt templater, we recommend setting the project dir.
[sqlfluff:templater:dbt]
project_dir = ./

[sqlfluff:indentation]
# While implicit indents are not enabled by default. Many of the
# SQLFluff maintainers do use them in their projects.
allow_implicit_indents = True

# The default configuration for aliasing rules is "consistent"
# which will auto-detect the setting from the rest of the file. This
# is less desirable in a new project and you may find this (slightly
# more strict) setting more useful.
[sqlfluff:rules:aliasing.table]
aliasing = explicit
[sqlfluff:rules:aliasing.column]
aliasing = explicit
[sqlfluff:rules:aliasing.length]
min_alias_length = 3

# The default configuration for capitalisation rules is "consistent"
# which will auto-detect the setting from the rest of the file. This
# is less desirable in a new project and you may find this (slightly
# more strict) setting more useful.
# Typically we find users rely on syntax highlighting rather than
# capitalisation to distinguish between keywords and identifiers.
# Clearly, if your organisation has already settled on uppercase
# formatting for any of these syntax elements then set them to "upper".
# See https://stackoverflow.com/questions/608196/why-should-i-capitalize-my-sql-keywords-is-there-a-good-reason
[sqlfluff:rules:capitalisation.keywords]
capitalisation_policy = upper
[sqlfluff:rules:capitalisation.identifiers]
capitalisation_policy = lower
[sqlfluff:rules:capitalisation.functions]
extended_capitalisation_policy = lower
[sqlfluff:rules:capitalisation.literals]
capitalisation_policy = lower
[sqlfluff:rules:capitalisation.types]
extended_capitalisation_policy = lower
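
sqlfluff picks up this .sqlfluff file automatically, so linting the project's SQL (for example the scripts/ directory used by the Makefile) is just:

sqlfluff lint scripts/   # report violations under the rules above
sqlfluff fix scripts/    # apply automatic fixes where safe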
59 changes: 44 additions & 15 deletions Makefile
@@ -1,17 +1,21 @@
.PHONY: all
all: upload

civil.zip : civil.db
	- rm -rf civil_csv
	mkdir civil_csv
	echo "select * from court_case" | sqlite3 -csv -header civil.db > civil_csv/court_case.csv
	echo "select * from plaintiff" | sqlite3 -csv -header civil.db > civil_csv/plaintiff.csv
	echo "select * from defendant" | sqlite3 -csv -header civil.db > civil_csv/defendant.csv
	echo "select * from attorney" | sqlite3 -csv -header civil.db > civil_csv/attorney.csv
	echo "select * from event" | sqlite3 -csv -header civil.db > civil_csv/event.csv
	zip -r $@ civil_csv

civil.db : attorney.csv defendant.csv plaintiff.csv court_case.csv event.csv
.PHONY: clean
clean:
	rm *.jl *.json *.db

cases.zip : cases.db
	- rm -rf cases_csv
	mkdir cases_csv
	echo "select * from court_case" | sqlite3 -csv -header cases.db > cases_csv/court_case.csv
	echo "select * from plaintiff" | sqlite3 -csv -header cases.db > cases_csv/plaintiff.csv
	echo "select * from defendant" | sqlite3 -csv -header cases.db > cases_csv/defendant.csv
	echo "select * from attorney" | sqlite3 -csv -header cases.db > cases_csv/attorney.csv
	echo "select * from event" | sqlite3 -csv -header cases.db > cases_csv/event.csv
	zip -r $@ cases_csv

cases.db : attorney.csv defendant.csv plaintiff.csv court_case.csv event.csv
	csvs-to-sqlite $^ $@
	cat scripts/foreign_key.sql | sqlite3 $@
	sqlite-utils add-column $@ court_case subdivision text
@@ -56,6 +60,26 @@ civil.db : attorney.csv defendant.csv plaintiff.csv court_case.csv event.csv
	sqlite-utils convert $@ court_case filing_date 'r.parsedate(value)'
	sqlite-utils convert $@ event date 'r.parsedate(value)'

.PHONY : update_civil_db
update_civil_db : rescraped_civil_cases.csv
	tail -n +2 $< | sqlite3 cases.db -init scripts/update.sql -bail

.PHONY : update_chancery_db
update_chancery_db : rescraped_chancery_cases.csv
	tail -n +2 $< | sqlite3 cases.db -init scripts/update.sql -bail

rescraped_civil_cases.csv : to_rescrape.civil.csv
	scrapy crawl civil -a update=True -a case_numbers_file=$< -O $@

rescraped_chancery_cases.csv : to_rescrape.chancery.csv
	scrapy crawl chancery -a update=True -a case_numbers_file=$< -O $@

to_rescrape.civil.csv : cases.db
	sqlite3 cases.db < scripts/to_scrape.civil.sql > $@

to_rescrape.chancery.csv : cases.db
	sqlite3 cases.db < scripts/to_scrape.chancery.sql > $@
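
Because each update target chains back through the rescraped CSV to the to_rescrape list, a database refresh is a single make invocation per court:

make update_civil_db      # select stale civil cases, rescrape, apply scripts/update.sql
make update_chancery_db   # same flow for chancery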

%.csv: court_case_raw.%.csv
	cat $< | \
	sed '1s/court_case_raw\._key/case_number/g' | \
@@ -64,22 +88,27 @@ civil.db : attorney.csv defendant.csv plaintiff.csv court_case.csv event.csv
court_case.csv : court_case_raw.csv
	cat $< | sed -r '1s/[a-z0-9_]+\.//g' > $@

court_case_raw.attorney.csv court_case_raw.defendant.csv court_case_raw.plaintiff.csv court_case_raw.csv court_case_raw.event.csv : civil.json
	json-to-multicsv.pl --file $< \
court_case_raw.attorney.csv court_case_raw.defendant.csv court_case_raw.plaintiff.csv court_case_raw.csv court_case_raw.event.csv : cases.json
	perl json-to-multicsv.pl --file $< \
		--path /:table:court_case_raw \
		--path /*/events/:table:event \
		--path /*/plaintiffs/:table:plaintiff \
		--path /*/defendants/:table:defendant \
		--path /*/attorneys/:table:attorney

civil.json : 2022_civil.jl 2023_civil.jl
cases.json : 2022_civil.jl 2023_civil.jl 2022_chancery.jl 2023_chancery.jl
	cat $^ | sort | python scripts/remove_dupe_cases.py | jq --slurp '.' > $@
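
scripts/remove_dupe_cases.py is not shown in this diff. As a rough sketch of its role in the recipe above — the input arrives pre-sorted, so records for the same case sit next to each other and one pass can keep a single record per case number — it might look something like:

import json
import sys

# Hypothetical stand-in for scripts/remove_dupe_cases.py, which this
# diff does not show. Assumes sorted JSON-lines input on stdin, with
# duplicate case numbers adjacent.
last_seen = None
for line in sys.stdin:
    case = json.loads(line)
    if case["case_number"] != last_seen:
        sys.stdout.write(line)
    last_seen = case["case_number"]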

%_civil.jl : %_civil-2.jl %_civil-3.jl %_civil-4.jl %_civil-5.jl \
    %_civil-6.jl %_civil-101.jl %_civil-104.jl %_civil-11.jl \
    %_civil-13.jl %_civil-14.jl %_civil-15.jl %_civil-17.jl
	cat $^ > $@

2022_chancery.jl :
	scrapy crawl chancery -a year=2022 -O $@

2023_chancery.jl :
	scrapy crawl chancery -a year=2023 -O $@

2022_civil-%.jl :
	scrapy crawl civil -a division=$* -a year=2022 -O $@
@@ -89,4 +118,4 @@ civil.json : 2022_civil.jl 2023_civil.jl

.PHONY : upload
upload : 2022_civil.json
	python scripts/upload_scrapes.py
	python scripts/upload_scrapes.py
184 changes: 184 additions & 0 deletions courtscraper/spiders/base.py
@@ -0,0 +1,184 @@
from abc import ABC, abstractmethod
from datetime import datetime, timezone

from scrapy import Spider
from scrapy.exceptions import CloseSpider
from scrapy.spidermiddlewares.httperror import HttpError

from scripts.hash import dict_hash


class UnsuccessfulAutomation(Exception):
    ...


class CourtSpiderBase(ABC, Spider):
    def __init__(self, division="2", year=2022, case_numbers_file=None, **kwargs):
        self.year = year
        self.misses = set()
        self.failures = set()
        self.last_successful_case_number = None
        self.update = bool(case_numbers_file)

        if case_numbers_file:
            self.case_numbers = self.case_numbers_from_file(case_numbers_file)
        else:
            self.case_numbers = self.get_case_numbers(self.year)

        super().__init__(**kwargs)

    @property
    @abstractmethod
    def name(self):
        pass

    @property
    @abstractmethod
    def url(self):
        pass

    @abstractmethod
    def start_requests(self):
        pass

    @abstractmethod
    def get_case_numbers(self, year):
        pass

    def case_numbers_from_file(self, filename):
        with open(filename) as f:
            for case_number in f:
                # Strip the trailing newline so the bare case number is
                # what gets formatted into request URLs.
                yield case_number.strip()

    def parse(self, response):
        now = datetime.now(tz=timezone.utc).isoformat()

        case_info = self.get_case_info(response)
        case_info.update(
            {
                "events": self.get_activities(response),
                "court": self.name,
                "updated_at": None if self.update else now,
                "scraped_at": now,
            }
        )
        # Hash the assembled record so later runs can detect when a
        # case has changed and needs to be refreshed.
        case_info["hash"] = dict_hash(case_info)

        self._success(response)

        return case_info

    def get_case_info(self, response):
        case_number = response.xpath(
            "//span[@id='MainContent_lblCaseNumber']/text()"
        ).get()
        calendar = response.xpath("//span[@id='MainContent_lblCalendar']/text()").get()
        filing_date = response.xpath(
            "//span[@id='MainContent_lblDateFiled']/text()"
        ).get()
        division = response.xpath(".//span[@id='MainContent_lblDivision']/text()").get()
        case_type = response.xpath("//span[@id='MainContent_lblCaseType']/text()").get()

        plaintiffs = response.xpath(
            "//td/span[@id='MainContent_lblPlaintiffs']/text()"
        ).getall()

        defendants = response.xpath(
            "//td/span[@id='MainContent_lblDefendants']/text()"
        ).getall()

        attorneys = response.xpath(
            "//td/span[@id='MainContent_lblAttorney']/text()"
        ).getall()

        ad_damnum = response.xpath("//span[@id='MainContent_lblAdDamnum']/text()").get()

        return {
            "case_number": case_number.strip(),
            "calendar": calendar.strip(),
            "filing_date": filing_date.strip(),
            "division": division.strip(),
            "case_type": case_type.strip(),
            "ad_damnum": ad_damnum.strip(),
            "plaintiffs": [plaintiff.strip() for plaintiff in plaintiffs],
            "defendants": [defendant.strip() for defendant in defendants],
            "attorneys": [attorney.strip() for attorney in attorneys],
        }

    def get_activities(self, response):
        case_activities = []

        case_activity_tables = response.xpath(
            ".//td[contains(text(), 'Activity Date')]/ancestor::table"
        )

        for activity_table in case_activity_tables:
            activity = {}
            cells = activity_table.xpath("./tbody/tr/td")

            # Each activity table lays out label/value pairs in
            # alternating cells, e.g. "Activity Date:" followed by the date.
            for i in range(0, len(cells), 2):
                key = cells[i].xpath("./text()").get().strip(": \n")
                value = cells[i + 1].xpath("./text()").get()
                if value is None:
                    value = ""
                activity[key] = value.strip()

            case_activities.append(
                {
                    "description": activity["Event Desc"],
                    "date": activity["Activity Date"],
                    "comments": activity["Comments"],
                }
            )

        # Return activities in reverse of their on-page order.
        return case_activities[::-1]

    def handle_error(self, failure):
        if failure.check(HttpError):
            response = failure.value.response
            if response.status == 404:
                self._missing_case(response)
            elif response.status == 500:
                self._failing_responses(response)
        else:
            self.logger.error(repr(failure))

    def _missing_case(self, response):
        missing_case_number = response.meta["case_number"]
        # Only track gaps at or above the highest case number scraped so
        # far; earlier gaps were already resolved by a later success.
        if self.last_successful_case_number is None:
            self.misses.add(missing_case_number)
        elif missing_case_number > self.last_successful_case_number:
            self.misses.add(missing_case_number)

        if self.misses:
            self.logger.info(f'misses: {", ".join(sorted(self.misses))}')

        if len(self.misses) > 50:
            # A long run of gaps suggests we have passed the newest
            # filed case, so there is nothing left to scrape.
            raise CloseSpider("run of missing case number")

    def _failing_responses(self, response):
        failing_case_number = response.meta["case_number"]
        self.failures.add(failing_case_number)

        self.logger.info(f'failures: {", ".join(sorted(self.failures))}')

        if len(self.failures) > 20:
            # Persistent server errors: stop rather than keep hammering
            # the site.
            raise CloseSpider("run of failures")

    def _success(self, response):
        successful_case_number = response.meta["case_number"]

        # Track the highest case number that has returned a page.
        if self.last_successful_case_number is None:
            self.last_successful_case_number = successful_case_number
        elif self.last_successful_case_number < successful_case_number:
            self.last_successful_case_number = successful_case_number

        if successful_case_number == self.last_successful_case_number:
            # Any misses below the new high-water mark were just gaps in
            # the numbering, not the end of the sequence.
            self.misses = {
                case_number
                for case_number in self.misses
                if case_number > successful_case_number
            }

        if hasattr(response, "raw_api_response"):
            # A successful response that came through the scraping API
            # means recent failures were transient; reset the count.
            self.failures = set()
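
The dict_hash imported from scripts.hash is part of this commit's case hashing, but its file is not among those shown above. A minimal sketch of a stable dict hash, assuming the real implementation may differ in algorithm or in which fields it includes:

import hashlib
import json


def dict_hash(dictionary):
    """Hypothetical sketch: stable digest of a JSON-serializable dict."""
    # Sorting keys canonicalizes the serialization, so identical case
    # data always yields the same digest across runs.
    encoded = json.dumps(dictionary, sort_keys=True).encode()
    return hashlib.sha256(encoded).hexdigest()

Concrete spiders (the civil and chancery scrapers) subclass CourtSpiderBase and supply the four abstract pieces: name, url, start_requests, and get_case_numbers.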