Merge pull request #13 from datamade/chancery
Implement chancery scraper and add case hashing
antidipyramid authored Nov 28, 2023
2 parents 4b196e2 + 006423f commit b707709
Showing 14 changed files with 523 additions and 167 deletions.
7 changes: 7 additions & 0 deletions .flake8
@@ -0,0 +1,7 @@
[flake8]
exclude =
    venv,
# So flake8 plays nicely with black
# https://black.readthedocs.io/en/stable/guides/using_black_with_other_tools.html
max-line-length = 88
extend-ignore = E203
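
For context on the two settings above: black wraps lines at 88 characters, and for compound slice bounds it emits whitespace before the colon, which plain flake8 flags as E203. A quick illustration (hypothetical snippet, not from this repo):

# black's preferred style for compound slice bounds; without
# "extend-ignore = E203", flake8 would flag the space before ":".
ham = list(range(10))
lower, upper, offset = 2, 8, 1
print(ham[lower + offset : upper + offset])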
6 changes: 6 additions & 0 deletions .gitignore
@@ -1,5 +1,11 @@
# Data files
scrape/*.json
*.csv
*.jl
*.db

# Perl
*.pl

# Byte-compiled / optimized / DLL files
__pycache__/
23 changes: 23 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,23 @@
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0
    hooks:
      - id: check-yaml
      - id: end-of-file-fixer
      - id: trailing-whitespace

  - repo: https://github.com/psf/black
    rev: 23.10.1
    hooks:
      - id: black

  - repo: https://github.com/pycqa/flake8
    rev: "7ef0350"
    hooks:
      - id: flake8
        args: [--config=.flake8]

  - repo: https://github.com/sqlfluff/sqlfluff
    rev: 2.3.5
    hooks:
      - id: sqlfluff-lint
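
Once defined, these hooks run on every commit after a one-time install; the standard pre-commit CLI invocations (assuming pre-commit is installed in the environment) are:

pre-commit install          # register the git hook once per clone
pre-commit run --all-files  # lint and format the whole tree on demand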
68 changes: 68 additions & 0 deletions .sqlfluff
@@ -0,0 +1,68 @@
[sqlfluff]

# Supported dialects https://docs.sqlfluff.com/en/stable/dialects.html
# Or run 'sqlfluff dialects'
dialect = sqlite

# One of [raw|jinja|python|placeholder]
templater = jinja

# Comma separated list of rules to exclude, or None
# See https://docs.sqlfluff.com/en/stable/configuration.html#enabling-and-disabling-rules
# AM04 (ambiguous.column_count) and ST06 (structure.column_order) are
# two of the more controversial rules included to illustrate usage.
exclude_rules = ambiguous.column_count, structure.column_order

# The standard max_line_length is 80 in line with the convention of
# other tools and several style guides. Many projects however prefer
# something a little longer.
# Set to zero or negative to disable checks.
max_line_length = 80

# CPU processes to use while linting.
# The default is "single threaded" to allow easy debugging, but this
# is often undesirable at scale.
# If positive, just implies number of processes.
# If negative or zero, implies number_of_cpus - specified_number.
# e.g. -1 means use all processors but one. 0 means all cpus.
processes = -1

# If using the dbt templater, we recommend setting the project dir.
[sqlfluff:templater:dbt]
project_dir = ./

[sqlfluff:indentation]
# While implicit indents are not enabled by default. Many of the
# SQLFluff maintainers do use them in their projects.
allow_implicit_indents = True

# The default configuration for aliasing rules is "consistent"
# which will auto-detect the setting from the rest of the file. This
# is less desirable in a new project and you may find this (slightly
# more strict) setting more useful.
[sqlfluff:rules:aliasing.table]
aliasing = explicit
[sqlfluff:rules:aliasing.column]
aliasing = explicit
[sqlfluff:rules:aliasing.length]
min_alias_length = 3

# The default configuration for capitalisation rules is "consistent"
# which will auto-detect the setting from the rest of the file. This
# is less desirable in a new project and you may find this (slightly
# more strict) setting more useful.
# Typically we find users rely on syntax highlighting rather than
# capitalisation to distinguish between keywords and identifiers.
# Clearly, if your organisation has already settled on uppercase
# formatting for any of these syntax elements then set them to "upper".
# See https://stackoverflow.com/questions/608196/why-should-i-capitalize-my-sql-keywords-is-there-a-good-reason
[sqlfluff:rules:capitalisation.keywords]
capitalisation_policy = upper
[sqlfluff:rules:capitalisation.identifiers]
capitalisation_policy = lower
[sqlfluff:rules:capitalisation.functions]
extended_capitalisation_policy = lower
[sqlfluff:rules:capitalisation.literals]
capitalisation_policy = lower
[sqlfluff:rules:capitalisation.types]
extended_capitalisation_policy = lower
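
sqlfluff picks up this .sqlfluff file automatically, so linting the project's SQL (for example the scripts/ directory used by the Makefile) is just:

sqlfluff lint scripts/   # report violations under the rules above
sqlfluff fix scripts/    # apply automatic fixes where safe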
59 changes: 44 additions & 15 deletions Makefile
@@ -1,17 +1,21 @@
.PHONY: all
all: upload

civil.zip : civil.db
	- rm -rf civil_csv
	mkdir civil_csv
	echo "select * from court_case" | sqlite3 -csv -header civil.db > civil_csv/court_case.csv
	echo "select * from plaintiff" | sqlite3 -csv -header civil.db > civil_csv/plaintiff.csv
	echo "select * from defendant" | sqlite3 -csv -header civil.db > civil_csv/defendant.csv
	echo "select * from attorney" | sqlite3 -csv -header civil.db > civil_csv/attorney.csv
	echo "select * from event" | sqlite3 -csv -header civil.db > civil_csv/event.csv
	zip -r $@ civil_csv

civil.db : attorney.csv defendant.csv plaintiff.csv court_case.csv event.csv
.PHONY: clean
clean:
	rm *.jl *.json *.db

cases.zip : cases.db
	- rm -rf cases_csv
	mkdir cases_csv
	echo "select * from court_case" | sqlite3 -csv -header cases.db > cases_csv/court_case.csv
	echo "select * from plaintiff" | sqlite3 -csv -header cases.db > cases_csv/plaintiff.csv
	echo "select * from defendant" | sqlite3 -csv -header cases.db > cases_csv/defendant.csv
	echo "select * from attorney" | sqlite3 -csv -header cases.db > cases_csv/attorney.csv
	echo "select * from event" | sqlite3 -csv -header cases.db > cases_csv/event.csv
	zip -r $@ cases_csv

cases.db : attorney.csv defendant.csv plaintiff.csv court_case.csv event.csv
	csvs-to-sqlite $^ $@
	cat scripts/foreign_key.sql | sqlite3 $@
	sqlite-utils add-column $@ court_case subdivision text
@@ -56,6 +60,26 @@ civil.db : attorney.csv defendant.csv plaintiff.csv court_case.csv event.csv
	sqlite-utils convert $@ court_case filing_date 'r.parsedate(value)'
	sqlite-utils convert $@ event date 'r.parsedate(value)'

.PHONY : update_civil_db
update_civil_db : rescraped_civil_cases.csv
	tail -n +2 $< | sqlite3 cases.db -init scripts/update.sql -bail

.PHONY : update_chancery_db
update_chancery_db : rescraped_chancery_cases.csv
	tail -n +2 $< | sqlite3 cases.db -init scripts/update.sql -bail

rescraped_civil_cases.csv : to_rescrape.civil.csv
	scrapy crawl civil -a update=True -a case_numbers_file=$< -O $@

rescraped_chancery_cases.csv : to_rescrape.chancery.csv
	scrapy crawl chancery -a update=True -a case_numbers_file=$< -O $@

to_rescrape.civil.csv : cases.db
	sqlite3 cases.db < scripts/to_scrape.civil.sql > $@

to_rescrape.chancery.csv : cases.db
	sqlite3 cases.db < scripts/to_scrape.chancery.sql > $@
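
Because each update target chains back through the rescraped CSV to the to_rescrape list, a database refresh is a single make invocation per court:

make update_civil_db      # select stale civil cases, rescrape, apply scripts/update.sql
make update_chancery_db   # same flow for chancery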

%.csv: court_case_raw.%.csv
	cat $< | \
	sed '1s/court_case_raw\._key/case_number/g' | \
@@ -64,22 +88,27 @@ civil.db : attorney.csv defendant.csv plaintiff.csv court_case.csv event.csv
court_case.csv : court_case_raw.csv
	cat $< | sed -r '1s/[a-z0-9_]+\.//g' > $@

court_case_raw.attorney.csv court_case_raw.defendant.csv court_case_raw.plaintiff.csv court_case_raw.csv court_case_raw.event.csv : civil.json
	json-to-multicsv.pl --file $< \
court_case_raw.attorney.csv court_case_raw.defendant.csv court_case_raw.plaintiff.csv court_case_raw.csv court_case_raw.event.csv : cases.json
	perl json-to-multicsv.pl --file $< \
		--path /:table:court_case_raw \
		--path /*/events/:table:event \
		--path /*/plaintiffs/:table:plaintiff \
		--path /*/defendants/:table:defendant \
		--path /*/attorneys/:table:attorney

civil.json : 2022_civil.jl 2023_civil.jl
cases.json : 2022_civil.jl 2023_civil.jl 2022_chancery.jl 2023_chancery.jl
	cat $^ | sort | python scripts/remove_dupe_cases.py | jq --slurp '.' > $@
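
scripts/remove_dupe_cases.py is not shown in this diff. As a rough sketch of its role in the recipe above — the input arrives pre-sorted, so records for the same case sit next to each other and one pass can keep a single record per case number — it might look something like:

import json
import sys

# Hypothetical stand-in for scripts/remove_dupe_cases.py, which this
# diff does not show. Assumes sorted JSON-lines input on stdin, with
# duplicate case numbers adjacent.
last_seen = None
for line in sys.stdin:
    case = json.loads(line)
    if case["case_number"] != last_seen:
        sys.stdout.write(line)
    last_seen = case["case_number"]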

%_civil.jl : %_civil-2.jl %_civil-3.jl %_civil-4.jl %_civil-5.jl \
    %_civil-6.jl %_civil-101.jl %_civil-104.jl %_civil-11.jl \
    %_civil-13.jl %_civil-14.jl %_civil-15.jl %_civil-17.jl
	cat $^ > $@

2022_chancery.jl :
	scrapy crawl chancery -a year=2022 -O $@

2023_chancery.jl :
	scrapy crawl chancery -a year=2023 -O $@

2022_civil-%.jl :
	scrapy crawl civil -a division=$* -a year=2022 -O $@
@@ -89,4 +118,4 @@ civil.json : 2022_civil.jl 2023_civil.jl

.PHONY : upload
upload : 2022_civil.json
	python scripts/upload_scrapes.py
	python scripts/upload_scrapes.py
184 changes: 184 additions & 0 deletions courtscraper/spiders/base.py
@@ -0,0 +1,184 @@
from abc import ABC, abstractmethod
from datetime import datetime, timezone

from scrapy import Spider
from scrapy.exceptions import CloseSpider
from scrapy.spidermiddlewares.httperror import HttpError

from scripts.hash import dict_hash


class UnsuccessfulAutomation(Exception):
    ...


class CourtSpiderBase(ABC, Spider):
    def __init__(self, division="2", year=2022, case_numbers_file=None, **kwargs):
        self.year = year
        self.misses = set()
        self.failures = set()
        self.last_successful_case_number = None
        self.update = bool(case_numbers_file)

        if case_numbers_file:
            self.case_numbers = self.case_numbers_from_file(case_numbers_file)
        else:
            self.case_numbers = self.get_case_numbers(self.year)

        super().__init__(**kwargs)

    @property
    @abstractmethod
    def name(self):
        pass

    @property
    @abstractmethod
    def url(self):
        pass

    @abstractmethod
    def start_requests(self):
        pass

    @abstractmethod
    def get_case_numbers(self, year):
        pass

    def case_numbers_from_file(self, filename):
        with open(filename) as f:
            for case_number in f:
                # Strip the trailing newline so the bare case number is
                # what gets formatted into request URLs.
                yield case_number.strip()

    def parse(self, response):
        now = datetime.now(tz=timezone.utc).isoformat()

        case_info = self.get_case_info(response)
        case_info.update(
            {
                "events": self.get_activities(response),
                "court": self.name,
                "updated_at": None if self.update else now,
                "scraped_at": now,
            }
        )
        # Hash the assembled record so later runs can detect when a
        # case has changed and needs to be refreshed.
        case_info["hash"] = dict_hash(case_info)

        self._success(response)

        return case_info

    def get_case_info(self, response):
        case_number = response.xpath(
            "//span[@id='MainContent_lblCaseNumber']/text()"
        ).get()
        calendar = response.xpath("//span[@id='MainContent_lblCalendar']/text()").get()
        filing_date = response.xpath(
            "//span[@id='MainContent_lblDateFiled']/text()"
        ).get()
        division = response.xpath(".//span[@id='MainContent_lblDivision']/text()").get()
        case_type = response.xpath("//span[@id='MainContent_lblCaseType']/text()").get()

        plaintiffs = response.xpath(
            "//td/span[@id='MainContent_lblPlaintiffs']/text()"
        ).getall()

        defendants = response.xpath(
            "//td/span[@id='MainContent_lblDefendants']/text()"
        ).getall()

        attorneys = response.xpath(
            "//td/span[@id='MainContent_lblAttorney']/text()"
        ).getall()

        ad_damnum = response.xpath("//span[@id='MainContent_lblAdDamnum']/text()").get()

        return {
            "case_number": case_number.strip(),
            "calendar": calendar.strip(),
            "filing_date": filing_date.strip(),
            "division": division.strip(),
            "case_type": case_type.strip(),
            "ad_damnum": ad_damnum.strip(),
            "plaintiffs": [plaintiff.strip() for plaintiff in plaintiffs],
            "defendants": [defendant.strip() for defendant in defendants],
            "attorneys": [attorney.strip() for attorney in attorneys],
        }

    def get_activities(self, response):
        case_activities = []

        case_activity_tables = response.xpath(
            ".//td[contains(text(), 'Activity Date')]/ancestor::table"
        )

        for activity_table in case_activity_tables:
            activity = {}
            cells = activity_table.xpath("./tbody/tr/td")

            # Each activity table lays out label/value pairs in
            # alternating cells, e.g. "Activity Date:" followed by the date.
            for i in range(0, len(cells), 2):
                key = cells[i].xpath("./text()").get().strip(": \n")
                value = cells[i + 1].xpath("./text()").get()
                if value is None:
                    value = ""
                activity[key] = value.strip()

            case_activities.append(
                {
                    "description": activity["Event Desc"],
                    "date": activity["Activity Date"],
                    "comments": activity["Comments"],
                }
            )

        # Return activities in reverse of their on-page order.
        return case_activities[::-1]

    def handle_error(self, failure):
        if failure.check(HttpError):
            response = failure.value.response
            if response.status == 404:
                self._missing_case(response)
            elif response.status == 500:
                self._failing_responses(response)
        else:
            self.logger.error(repr(failure))

    def _missing_case(self, response):
        missing_case_number = response.meta["case_number"]
        # Only track gaps at or above the highest case number scraped so
        # far; earlier gaps were already resolved by a later success.
        if self.last_successful_case_number is None:
            self.misses.add(missing_case_number)
        elif missing_case_number > self.last_successful_case_number:
            self.misses.add(missing_case_number)

        if self.misses:
            self.logger.info(f'misses: {", ".join(sorted(self.misses))}')

        if len(self.misses) > 50:
            # A long run of gaps suggests we have passed the newest
            # filed case, so there is nothing left to scrape.
            raise CloseSpider("run of missing case number")

    def _failing_responses(self, response):
        failing_case_number = response.meta["case_number"]
        self.failures.add(failing_case_number)

        self.logger.info(f'failures: {", ".join(sorted(self.failures))}')

        if len(self.failures) > 20:
            # Persistent server errors: stop rather than keep hammering
            # the site.
            raise CloseSpider("run of failures")

    def _success(self, response):
        successful_case_number = response.meta["case_number"]

        # Track the highest case number that has returned a page.
        if self.last_successful_case_number is None:
            self.last_successful_case_number = successful_case_number
        elif self.last_successful_case_number < successful_case_number:
            self.last_successful_case_number = successful_case_number

        if successful_case_number == self.last_successful_case_number:
            # Any misses below the new high-water mark were just gaps in
            # the numbering, not the end of the sequence.
            self.misses = {
                case_number
                for case_number in self.misses
                if case_number > successful_case_number
            }

        if hasattr(response, "raw_api_response"):
            # A successful response that came through the scraping API
            # means recent failures were transient; reset the count.
            self.failures = set()
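
The dict_hash imported from scripts.hash is part of this commit's case hashing, but its file is not among those shown above. A minimal sketch of a stable dict hash, assuming the real implementation may differ in algorithm or in which fields it includes:

import hashlib
import json


def dict_hash(dictionary):
    """Hypothetical sketch: stable digest of a JSON-serializable dict."""
    # Sorting keys canonicalizes the serialization, so identical case
    # data always yields the same digest across runs.
    encoded = json.dumps(dictionary, sort_keys=True).encode()
    return hashlib.sha256(encoded).hexdigest()

Concrete spiders (the civil and chancery scrapers) subclass CourtSpiderBase and supply the four abstract pieces: name, url, start_requests, and get_case_numbers.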