-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #13 from datamade/chancery
Implement chancery scraper and add case hashing
- Loading branch information
Showing
14 changed files
with
523 additions
and
167 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
[flake8] | ||
exclude = | ||
venv, | ||
# So flake8 plays nicely with black | ||
# https://black.readthedocs.io/en/stable/guides/using_black_with_other_tools.html | ||
max-line-length = 88 | ||
extend-ignore = E203 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,11 @@ | ||
# Data files | ||
scrape/*.json | ||
*.csv | ||
*.jl | ||
*.db | ||
|
||
# Perl | ||
*.pl | ||
|
||
# Byte-compiled / optimized / DLL files | ||
__pycache__/ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
repos: | ||
- repo: https://github.com/pre-commit/pre-commit-hooks | ||
rev: v4.5.0 | ||
hooks: | ||
- id: check-yaml | ||
- id: end-of-file-fixer | ||
- id: trailing-whitespace | ||
|
||
- repo: https://github.com/psf/black | ||
rev: 23.10.1 | ||
hooks: | ||
- id: black | ||
|
||
- repo: https://github.com/pycqa/flake8 | ||
rev: "7ef0350" | ||
hooks: | ||
- id: flake8 | ||
args: [--config=.flake8] | ||
|
||
- repo: https://github.com/sqlfluff/sqlfluff | ||
rev: 2.3.5 | ||
hooks: | ||
- id: sqlfluff-lint |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
[sqlfluff] | ||
|
||
# Supported dialects https://docs.sqlfluff.com/en/stable/dialects.html | ||
# Or run 'sqlfluff dialects' | ||
dialect = sqlite | ||
|
||
# One of [raw|jinja|python|placeholder] | ||
templater = jinja | ||
|
||
# Comma separated list of rules to exclude, or None | ||
# See https://docs.sqlfluff.com/en/stable/configuration.html#enabling-and-disabling-rules | ||
# AM04 (ambiguous.column_count) and ST06 (structure.column_order) are | ||
# two of the more controversial rules included to illustrate usage. | ||
exclude_rules = ambiguous.column_count, structure.column_order | ||
|
||
# The standard max_line_length is 80 in line with the convention of | ||
# other tools and several style guides. Many projects however prefer | ||
# something a little longer. | ||
# Set to zero or negative to disable checks. | ||
max_line_length = 80 | ||
|
||
# CPU processes to use while linting. | ||
# The default is "single threaded" to allow easy debugging, but this | ||
# is often undesirable at scale. | ||
# If positive, just implies number of processes. | ||
# If negative or zero, implies number_of_cpus - specified_number. | ||
# e.g. -1 means use all processors but one. 0 means all cpus. | ||
processes = -1 | ||
|
||
# If using the dbt templater, we recommend setting the project dir. | ||
[sqlfluff:templater:dbt] | ||
project_dir = ./ | ||
|
||
[sqlfluff:indentation] | ||
# While implicit indents are not enabled by default, many of the | ||
# SQLFluff maintainers do use them in their projects. | ||
allow_implicit_indents = True | ||
|
||
# The default configuration for aliasing rules is "consistent" | ||
# which will auto-detect the setting from the rest of the file. This | ||
# is less desirable in a new project and you may find this (slightly | ||
# more strict) setting more useful. | ||
[sqlfluff:rules:aliasing.table] | ||
aliasing = explicit | ||
[sqlfluff:rules:aliasing.column] | ||
aliasing = explicit | ||
[sqlfluff:rules:aliasing.length] | ||
min_alias_length = 3 | ||
|
||
# The default configuration for capitalisation rules is "consistent" | ||
# which will auto-detect the setting from the rest of the file. This | ||
# is less desirable in a new project and you may find this (slightly | ||
# more strict) setting more useful. | ||
# Typically we find users rely on syntax highlighting rather than | ||
# capitalisation to distinguish between keywords and identifiers. | ||
# Clearly, if your organisation has already settled on uppercase | ||
# formatting for any of these syntax elements then set them to "upper". | ||
# See https://stackoverflow.com/questions/608196/why-should-i-capitalize-my-sql-keywords-is-there-a-good-reason | ||
[sqlfluff:rules:capitalisation.keywords] | ||
capitalisation_policy = upper | ||
[sqlfluff:rules:capitalisation.identifiers] | ||
capitalisation_policy = lower | ||
[sqlfluff:rules:capitalisation.functions] | ||
extended_capitalisation_policy = lower | ||
[sqlfluff:rules:capitalisation.literals] | ||
capitalisation_policy = lower | ||
[sqlfluff:rules:capitalisation.types] | ||
extended_capitalisation_policy = lower |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,184 @@ | ||
from abc import ABC, abstractmethod | ||
from datetime import datetime, timezone | ||
|
||
from scrapy import Spider | ||
from scrapy.exceptions import CloseSpider | ||
from scrapy.spidermiddlewares.httperror import HttpError | ||
|
||
from scripts.hash import dict_hash | ||
|
||
|
||
class UnsuccessfulAutomation(Exception):
    """Marker exception for a scraping automation step that did not succeed.

    Not raised anywhere in the code visible in this module — presumably
    raised by spider subclasses; confirm against callers.
    """
|
||
|
||
class CourtSpiderBase(ABC, Spider):
    """Shared scraping machinery for the court case spiders.

    Subclasses provide the court-specific ``name``, ``url``,
    ``start_requests``, and ``get_case_numbers``; this base class parses
    case detail pages and tracks missing / failing case numbers so a long
    run of errors shuts the spider down.
    """

    def __init__(self, division="2", year=2022, case_numbers_file=None, **kwargs):
        # NOTE(review): ``division`` is accepted but never stored or used
        # here — confirm whether subclasses rely on it.
        self.year = year
        self.misses = set()
        self.failures = set()
        self.last_successful_case_number = None
        # A case-numbers file means we are re-scraping known cases, i.e.
        # performing an update rather than an initial scrape.
        self.update = bool(case_numbers_file)

        if case_numbers_file:
            self.case_numbers = self.case_numbers_from_file(case_numbers_file)
        else:
            self.case_numbers = self.get_case_numbers(self.year)

        super().__init__(**kwargs)

    @property
    @abstractmethod
    def name(self):
        """Spider name (also stored on each scraped record as ``court``)."""
        ...

    @property
    @abstractmethod
    def url(self):
        """Base URL of the court's case-search site."""
        ...

    @abstractmethod
    def start_requests(self):
        """Yield the initial Scrapy requests."""
        ...

    @abstractmethod
    def get_case_numbers(self):
        """Yield the case numbers to scrape.

        NOTE(review): declared without parameters, but ``__init__`` calls
        ``self.get_case_numbers(self.year)`` — subclasses must accept a
        year argument despite this signature.
        """
        ...

    def case_numbers_from_file(self, filename):
        """Lazily yield case numbers, one per line of *filename*.

        NOTE(review): lines keep their trailing newline — confirm that
        consumers strip it.
        """
        with open(filename) as source:
            yield from source

    def parse(self, response):
        """Build the record for a case detail page and note the success."""
        timestamp = datetime.now(tz=timezone.utc).isoformat()

        record = self.get_case_info(response)
        record["events"] = self.get_activities(response)
        record["court"] = self.name
        # On an update run the record is re-scraped, so updated_at is left
        # null; on a first scrape both timestamps are set together.
        record["updated_at"] = None if self.update else timestamp
        record["scraped_at"] = timestamp
        # Hash covers everything gathered so far (timestamps included),
        # but not the hash field itself.
        record["hash"] = dict_hash(record)

        self._success(response)

        return record

    def get_case_info(self, response):
        """Extract the top-level case metadata from a case detail page.

        Raises AttributeError (via ``.strip()`` on None) if a required
        span is absent — same contract as the original implementation.
        """

        def text_of(query):
            # First matching text node for the selector.
            return response.xpath(query).get()

        def all_text_of(query):
            return response.xpath(query).getall()

        return {
            "case_number": text_of(
                "//span[@id='MainContent_lblCaseNumber']/text()"
            ).strip(),
            "calendar": text_of(
                "//span[@id='MainContent_lblCalendar']/text()"
            ).strip(),
            "filing_date": text_of(
                "//span[@id='MainContent_lblDateFiled']/text()"
            ).strip(),
            "division": text_of(
                ".//span[@id='MainContent_lblDivision']/text()"
            ).strip(),
            "case_type": text_of(
                "//span[@id='MainContent_lblCaseType']/text()"
            ).strip(),
            "ad_damnum": text_of(
                "//span[@id='MainContent_lblAdDamnum']/text()"
            ).strip(),
            "plaintiffs": [
                party.strip()
                for party in all_text_of(
                    "//td/span[@id='MainContent_lblPlaintiffs']/text()"
                )
            ],
            "defendants": [
                party.strip()
                for party in all_text_of(
                    "//td/span[@id='MainContent_lblDefendants']/text()"
                )
            ],
            "attorneys": [
                party.strip()
                for party in all_text_of(
                    "//td/span[@id='MainContent_lblAttorney']/text()"
                )
            ],
        }

    def get_activities(self, response):
        """Collect the case activity entries, in reverse of page order."""
        activities = []

        activity_tables = response.xpath(
            ".//td[contains(text(), 'Activity Date')]/ancestor::table"
        )

        for table in activity_tables:
            fields = {}
            cells = table.xpath("./tbody/tr/td")

            # Cells alternate label, value — pair them into a dict. The
            # label cell's trailing colon/whitespace is trimmed; an empty
            # value cell becomes "".
            for position in range(0, len(cells), 2):
                label = cells[position].xpath("./text()").get().strip(": \n")
                content = cells[position + 1].xpath("./text()").get()
                fields[label] = (content or "").strip()

            activities.append(
                {
                    "description": fields["Event Desc"],
                    "date": fields["Activity Date"],
                    "comments": fields["Comments"],
                }
            )

        # Reversed relative to page order (presumably the page lists
        # newest first — TODO confirm).
        return list(reversed(activities))

    def handle_error(self, failure):
        """Errback: track 404s and 500s; log any non-HTTP failure.

        NOTE(review): HTTP statuses other than 404/500 are silently
        dropped — confirm this is intentional.
        """
        if not failure.check(HttpError):
            self.logger.error(repr(failure))
            return

        response = failure.value.response
        status = response.status
        if status == 404:
            self._missing_case(response)
        elif status == 500:
            self._failing_responses(response)

    def _missing_case(self, response):
        """Track a 404'd case number; close after a long run of misses."""
        case_number = response.meta["case_number"]

        # Only count misses at or beyond the current high-water mark;
        # earlier case numbers have already been passed successfully.
        last = self.last_successful_case_number
        if last is None or case_number > last:
            self.misses.add(case_number)

        if self.misses:
            self.logger.info(f'misses: {", ".join(sorted(self.misses))}')

        if len(self.misses) > 50:
            raise CloseSpider("run of missing case number")

    def _failing_responses(self, response):
        """Track a 500'd case number; close after too many failures."""
        self.failures.add(response.meta["case_number"])

        self.logger.info(f'failures: {", ".join(sorted(self.failures))}')

        if len(self.failures) > 20:
            raise CloseSpider("run of failures")

    def _success(self, response):
        """Record a scraped case number and prune now-stale miss state."""
        case_number = response.meta["case_number"]

        # Advance the high-water mark if this case number exceeds it.
        last = self.last_successful_case_number
        if last is None or last < case_number:
            self.last_successful_case_number = case_number

        # When this number is (now) the high-water mark, misses at or
        # below it were transient and can be forgotten.
        if case_number == self.last_successful_case_number:
            self.misses = {
                miss for miss in self.misses if miss > case_number
            }

        # NOTE(review): ``raw_api_response`` presumably marks responses
        # served through an API client middleware — confirm; its presence
        # resets the failure tally.
        if hasattr(response, "raw_api_response"):
            self.failures = set()
Oops, something went wrong.