diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..9bb8d7b --- /dev/null +++ b/.flake8 @@ -0,0 +1,7 @@ +[flake8] +exclude = + venv, +# So flake8 plays nicely with black +# https://black.readthedocs.io/en/stable/guides/using_black_with_other_tools.html +max-line-length = 88 +extend-ignore = E203 diff --git a/.gitignore b/.gitignore index 97aff23..25c0790 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,11 @@ # Data files scrape/*.json +*.csv +*.jl +*.db + +# Perl +*.pl # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..92b5a53 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,23 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: check-yaml + - id: end-of-file-fixer + - id: trailing-whitespace + + - repo: https://github.com/psf/black + rev: 23.10.1 + hooks: + - id: black + + - repo: https://github.com/pycqa/flake8 + rev: "7ef0350" + hooks: + - id: flake8 + args: [--config=.flake8] + + - repo: https://github.com/sqlfluff/sqlfluff + rev: 2.3.5 + hooks: + - id: sqlfluff-lint diff --git a/.sqlfluff b/.sqlfluff new file mode 100644 index 0000000..e6c3cdf --- /dev/null +++ b/.sqlfluff @@ -0,0 +1,68 @@ +[sqlfluff] + +# Supported dialects https://docs.sqlfluff.com/en/stable/dialects.html +# Or run 'sqlfluff dialects' +dialect = sqlite + +# One of [raw|jinja|python|placeholder] +templater = jinja + +# Comma separated list of rules to exclude, or None +# See https://docs.sqlfluff.com/en/stable/configuration.html#enabling-and-disabling-rules +# AM04 (ambiguous.column_count) and ST06 (structure.column_order) are +# two of the more controversial rules included to illustrate usage. +exclude_rules = ambiguous.column_count, structure.column_order + +# The standard max_line_length is 80 in line with the convention of +# other tools and several style guides. Many projects however prefer +# something a little longer. +# Set to zero or negative to disable checks. +max_line_length = 80 + +# CPU processes to use while linting. +# The default is "single threaded" to allow easy debugging, but this +# is often undesirable at scale. +# If positive, just implies number of processes. +# If negative or zero, implies number_of_cpus - specified_number. +# e.g. -1 means use all processors but one. 0 means all cpus. +processes = -1 + +# If using the dbt templater, we recommend setting the project dir. +[sqlfluff:templater:dbt] +project_dir = ./ + +[sqlfluff:indentation] +# While implicit indents are not enabled by default. Many of the +# SQLFluff maintainers do use them in their projects. +allow_implicit_indents = True + +# The default configuration for aliasing rules is "consistent" +# which will auto-detect the setting from the rest of the file. This +# is less desirable in a new project and you may find this (slightly +# more strict) setting more useful. +[sqlfluff:rules:aliasing.table] +aliasing = explicit +[sqlfluff:rules:aliasing.column] +aliasing = explicit +[sqlfluff:rules:aliasing.length] +min_alias_length = 3 + +# The default configuration for capitalisation rules is "consistent" +# which will auto-detect the setting from the rest of the file. This +# is less desirable in a new project and you may find this (slightly +# more strict) setting more useful. +# Typically we find users rely on syntax highlighting rather than +# capitalisation to distinguish between keywords and identifiers. 
+# Clearly, if your organisation has already settled on uppercase +# formatting for any of these syntax elements then set them to "upper". +# See https://stackoverflow.com/questions/608196/why-should-i-capitalize-my-sql-keywords-is-there-a-good-reason +[sqlfluff:rules:capitalisation.keywords] +capitalisation_policy = upper +[sqlfluff:rules:capitalisation.identifiers] +capitalisation_policy = lower +[sqlfluff:rules:capitalisation.functions] +extended_capitalisation_policy = lower +[sqlfluff:rules:capitalisation.literals] +capitalisation_policy = lower +[sqlfluff:rules:capitalisation.types] +extended_capitalisation_policy = lower diff --git a/Makefile b/Makefile index 683ecbf..aadaeca 100644 --- a/Makefile +++ b/Makefile @@ -1,17 +1,21 @@ .PHONY: all all: upload -civil.zip : civil.db - - rm -rf civil_csv - mkdir civil_csv - echo "select * from court_case" | sqlite3 -csv -header civil.db > civil_csv/court_case.csv - echo "select * from plaintiff" | sqlite3 -csv -header civil.db > civil_csv/plaintiff.csv - echo "select * from defendant" | sqlite3 -csv -header civil.db > civil_csv/defendant.csv - echo "select * from attorney" | sqlite3 -csv -header civil.db > civil_csv/attorney.csv - echo "select * from event" | sqlite3 -csv -header civil.db > civil_csv/event.csv - zip -r $@ civil_csv - -civil.db : attorney.csv defendant.csv plaintiff.csv court_case.csv event.csv +.PHONY: clean +clean: + rm *.jl *.json *.db + +cases.zip : cases.db + - rm -rf cases_csv + mkdir cases_csv + echo "select * from court_case" | sqlite3 -csv -header cases.db > cases_csv/court_case.csv + echo "select * from plaintiff" | sqlite3 -csv -header cases.db > cases_csv/plaintiff.csv + echo "select * from defendant" | sqlite3 -csv -header cases.db > cases_csv/defendant.csv + echo "select * from attorney" | sqlite3 -csv -header cases.db > cases_csv/attorney.csv + echo "select * from event" | sqlite3 -csv -header cases.db > cases_csv/event.csv + zip -r $@ cases_csv + +cases.db : attorney.csv defendant.csv plaintiff.csv court_case.csv event.csv csvs-to-sqlite $^ $@ cat scripts/foreign_key.sql | sqlite3 $@ sqlite-utils add-column $@ court_case subdivision text @@ -56,6 +60,26 @@ civil.db : attorney.csv defendant.csv plaintiff.csv court_case.csv event.csv sqlite-utils convert $@ court_case filing_date 'r.parsedate(value)' sqlite-utils convert $@ event date 'r.parsedate(value)' +.PHONY : update_civil_db +update_civil_db : rescraped_civil_cases.csv + tail -n +2 $< | sqlite3 cases.db -init scripts/update.sql -bail + +.PHONY : update_chancery_db +update_chancery_db : rescraped_chancery_cases.csv + tail -n +2 $< | sqlite3 cases.db -init scripts/update.sql -bail + +rescraped_civil_cases.csv : to_rescrape.civil.csv + scrapy crawl civil -a update=True -a case_numbers_file=$< -O $@ + +rescraped_chancery_cases.csv : to_rescrape.chancery.csv + scrapy crawl chancery -a update=True -a case_numbers_file=$< -O $@ + +to_rescrape.civil.csv : cases.db + sqlite3 cases.db < scripts/to_scrape.civil.sql > $@ + +to_rescrape.chancery.csv : cases.db + sqlite3 cases.db < scripts/to_scrape.chancery.sql > $@ + %.csv: court_case_raw.%.csv cat $< | \ sed '1s/court_case_raw\._key/case_number/g' | \ @@ -64,15 +88,15 @@ civil.db : attorney.csv defendant.csv plaintiff.csv court_case.csv event.csv court_case.csv : court_case_raw.csv cat $< | sed -r '1s/[a-z0-9_]+\.//g' > $@ -court_case_raw.attorney.csv court_case_raw.defendant.csv court_case_raw.plaintiff.csv court_case_raw.csv court_case_raw.event.csv : civil.json - json-to-multicsv.pl --file $< \ 
+court_case_raw.attorney.csv court_case_raw.defendant.csv court_case_raw.plaintiff.csv court_case_raw.csv court_case_raw.event.csv : cases.json + perl json-to-multicsv.pl --file $< \ --path /:table:court_case_raw \ --path /*/events/:table:event \ --path /*/plaintiffs/:table:plaintiff \ --path /*/defendants/:table:defendant \ --path /*/attorneys/:table:attorney -civil.json : 2022_civil.jl 2023_civil.jl +cases.json : 2022_civil.jl 2023_civil.jl 2022_chancery.jl 2023_chancery.jl cat $^ | sort | python scripts/remove_dupe_cases.py | jq --slurp '.' > $@ %_civil.jl : %_civil-2.jl %_civil-3.jl %_civil-4.jl %_civil-5.jl \ @@ -80,6 +104,11 @@ civil.json : 2022_civil.jl 2023_civil.jl %_civil-13.jl %_civil-14.jl %_civil-15.jl %_civil-17.jl cat $^ > $@ +2022_chancery.jl : + scrapy crawl chancery -a year=2022 -O $@ + +2023_chancery.jl : + scrapy crawl chancery -a year=2023 -O $@ 2022_civil-%.jl : scrapy crawl civil -a division=$* -a year=2022 -O $@ @@ -89,4 +118,4 @@ civil.json : 2022_civil.jl 2023_civil.jl .PHONY : upload upload : 2022_civil.json - python scripts/upload_scrapes.py + python scripts/upload_scrapes.py diff --git a/courtscraper/spiders/base.py b/courtscraper/spiders/base.py new file mode 100644 index 0000000..5a54b62 --- /dev/null +++ b/courtscraper/spiders/base.py @@ -0,0 +1,184 @@ +from abc import ABC, abstractmethod +from datetime import datetime, timezone + +from scrapy import Spider +from scrapy.exceptions import CloseSpider +from scrapy.spidermiddlewares.httperror import HttpError + +from scripts.hash import dict_hash + + +class UnsuccessfulAutomation(Exception): + ... + + +class CourtSpiderBase(ABC, Spider): + def __init__(self, division="2", year=2022, case_numbers_file=None, **kwargs): + self.year = year + self.misses = set() + self.failures = set() + self.last_successful_case_number = None + self.update = bool(case_numbers_file) + + if case_numbers_file: + self.case_numbers = self.case_numbers_from_file(case_numbers_file) + else: + self.case_numbers = self.get_case_numbers(self.year) + + super().__init__(**kwargs) + + @property + @abstractmethod + def name(self): + pass + + @property + @abstractmethod + def url(self): + pass + + @abstractmethod + def start_requests(self): + pass + + @abstractmethod + def get_case_numbers(self): + pass + + def case_numbers_from_file(self, filename): + with open(filename) as f: + for case_number in f: + yield case_number + + def parse(self, response): + now = datetime.now(tz=timezone.utc).isoformat() + + case_info = self.get_case_info(response) + case_info.update( + { + "events": self.get_activities(response), + "court": self.name, + "updated_at": None if self.update else now, + "scraped_at": now, + } + ) + case_info["hash"] = dict_hash(case_info) + + self._success(response) + + return case_info + + def get_case_info(self, response): + case_number = response.xpath( + "//span[@id='MainContent_lblCaseNumber']/text()" + ).get() + calendar = response.xpath("//span[@id='MainContent_lblCalendar']/text()").get() + filing_date = response.xpath( + "//span[@id='MainContent_lblDateFiled']/text()" + ).get() + division = response.xpath(".//span[@id='MainContent_lblDivision']/text()").get() + case_type = response.xpath("//span[@id='MainContent_lblCaseType']/text()").get() + + plaintiffs = response.xpath( + "//td/span[@id='MainContent_lblPlaintiffs']/text()" + ).getall() + + defendants = response.xpath( + "//td/span[@id='MainContent_lblDefendants']/text()" + ).getall() + + attorneys = response.xpath( + "//td/span[@id='MainContent_lblAttorney']/text()" + 
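# A quick, standalone illustration of the fingerprint that parse() attaches to each
# case via scripts/hash.py: json.dumps(..., sort_keys=True) makes the MD5 independent
# of key order, so only a change in field values produces a new hash. The sample
# values below are invented for illustration.
import hashlib
import json


def dict_hash(dictionary):  # same logic as scripts/hash.py
    encoded = json.dumps(dictionary, sort_keys=True).encode()
    return hashlib.md5(encoded).hexdigest()


a = {"case_number": "2022CH00001", "calendar": "07"}
b = {"calendar": "07", "case_number": "2022CH00001"}
assert dict_hash(a) == dict_hash(b)                        # key order is irrelevant
assert dict_hash(a) != dict_hash({**a, "calendar": "08"})  # value changes are not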
).getall() + + ad_damnum = response.xpath("//span[@id='MainContent_lblAdDamnum']/text()").get() + + return { + "case_number": case_number.strip(), + "calendar": calendar.strip(), + "filing_date": filing_date.strip(), + "division": division.strip(), + "case_type": case_type.strip(), + "ad_damnum": ad_damnum.strip(), + "plaintiffs": [plaintiff.strip() for plaintiff in plaintiffs], + "defendants": [defendant.strip() for defendant in defendants], + "attorneys": [attorney.strip() for attorney in attorneys], + } + + def get_activities(self, response): + case_activities = [] + + case_activity_tables = response.xpath( + ".//td[contains(text(), 'Activity Date')]/ancestor::table" + ) + + for activity_table in case_activity_tables: + activity = {} + cells = activity_table.xpath("./tbody/tr/td") + + for i in range(0, len(cells), 2): + key = cells[i].xpath("./text()").get().strip(": \n") + value = cells[i + 1].xpath("./text()").get() + if value is None: + value = "" + activity[key] = value.strip() + + case_activities.append( + { + "description": activity["Event Desc"], + "date": activity["Activity Date"], + "comments": activity["Comments"], + } + ) + + return case_activities[::-1] + + def handle_error(self, failure): + if failure.check(HttpError): + response = failure.value.response + if response.status == 404: + self._missing_case(response) + elif response.status == 500: + self._failing_responses(response) + else: + self.logger.error(repr(failure)) + + def _missing_case(self, response): + missing_case_number = response.meta["case_number"] + if self.last_successful_case_number is None: + self.misses.add(missing_case_number) + elif missing_case_number > self.last_successful_case_number: + self.misses.add(missing_case_number) + + if self.misses: + self.logger.info(f'misses: {", ".join(sorted(self.misses))}') + + if len(self.misses) > 50: + raise CloseSpider("run of missing case number") + + def _failing_responses(self, response): + failing_case_number = response.meta["case_number"] + self.failures.add(failing_case_number) + + self.logger.info(f'failures: {", ".join(sorted(self.failures))}') + + if len(self.failures) > 20: + raise CloseSpider("run of failures") + + def _success(self, response): + successful_case_number = response.meta["case_number"] + + if self.last_successful_case_number is None: + self.last_successful_case_number = successful_case_number + elif self.last_successful_case_number < successful_case_number: + self.last_successful_case_number = successful_case_number + + if successful_case_number == self.last_successful_case_number: + self.misses = { + case_number + for case_number in self.misses + if case_number > successful_case_number + } + + if hasattr(response, "raw_api_response"): + self.failures = set() diff --git a/courtscraper/spiders/chancery.py b/courtscraper/spiders/chancery.py new file mode 100644 index 0000000..4019984 --- /dev/null +++ b/courtscraper/spiders/chancery.py @@ -0,0 +1,66 @@ +from scrapy import Request + +from .base import CourtSpiderBase + + +class ChancerySpider(CourtSpiderBase): + name = "chancery" + url = "https://casesearch.cookcountyclerkofcourt.org/CivilCaseSearchAPI.aspx" + + def __init__(self, year=2022, **kwargs): + self.case_type = CASE_FORMAT + super().__init__(**kwargs) + + def get_case_numbers(self, year): + base_case_num = "{year}CH{serial_format}".format(year=year, **self.case_type) + + for serial in range(self.case_type["start"], self.case_type["end"] + 1): + case_number = base_case_num % serial + yield case_number + + def start_requests(self): + 
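# A minimal sketch (not part of the spider) of the gap tracking implemented by
# _missing_case() and _success() above: 404s past the last successful case number
# accumulate in `misses`, a new high-water-mark success prunes everything at or
# below it, and more than 50 outstanding misses closes the spider. Case numbers
# here are invented.
misses = set()
last_success = None


def record_miss(case_number):
    if last_success is None or case_number > last_success:
        misses.add(case_number)
    if len(misses) > 50:
        raise RuntimeError("run of missing case number")  # CloseSpider in the spider


def record_success(case_number):
    global last_success, misses
    if last_success is None or case_number > last_success:
        last_success = case_number
    if case_number == last_success:  # only a new high-water mark clears old gaps
        misses = {m for m in misses if m > case_number}


record_miss("2022CH00010")
record_miss("2022CH00012")
record_success("2022CH00011")
print(sorted(misses))  # ['2022CH00012']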
for case_number in self.case_numbers: + yield Request( + ChancerySpider.url, + meta={ + "zyte_api_automap": { + "httpResponseHeaders": True, + "browserHtml": True, + "actions": [ + { + "action": "waitForSelector", + "selector": { + "type": "css", + "value": "#MainContent_btnSearch", + }, + "timeout": 5, + "onError": "return", + }, + { + "action": "evaluate", + "source": f"""$('#MainContent_ddlDatabase').val('3'); + $('#MainContent_txtCaseNumber').val('{case_number}'); + $('#MainContent_btnSearch').click();""", + }, + { + "action": "waitForSelector", + "selector": { + "type": "css", + "value": "#MainContent_lblDetailHeader", + }, + "timeout": 5, + "onError": "return", + }, + ], + }, + "case_number": case_number, + }, + errback=self.handle_error, + ) + + +CASE_FORMAT = { + "start": 0, + "end": 999999, + "serial_format": "%05d", +} diff --git a/courtscraper/spiders/civil.py b/courtscraper/spiders/civil.py index 8610139..fc4e1df 100644 --- a/courtscraper/spiders/civil.py +++ b/courtscraper/spiders/civil.py @@ -1,39 +1,20 @@ -from scrapy import Request, Spider -from scrapy.exceptions import CloseSpider +from scrapy import Request +from .base import CourtSpiderBase -from scrapy.spidermiddlewares.httperror import HttpError - -class UnsuccessfulAutomation(Exception): - ... - - -class CivilSpider(Spider): +class CivilSpider(CourtSpiderBase): name = "civil" + url = "https://casesearch.cookcountyclerkofcourt.org/CivilCaseSearchAPI.aspx" def __init__(self, division="2", year=2022, **kwargs): self.case_type = DIVISIONS[division] - self.year = year - self.misses = set() - self.failures = set() - self.last_successful_case_number = None super().__init__(**kwargs) - def case_numbers(self, year): - - base_case_num = "{year}{district}{type}{serial_format}".format( - year=year, **self.case_type - ) - - for serial in range(self.case_type["start"], self.case_type["end"] + 1): - case_number = base_case_num % serial - yield case_number - def start_requests(self): - for case_number in self.case_numbers(self.year): + for case_number in self.case_numbers: yield Request( - "https://casesearch.cookcountyclerkofcourt.org/CivilCaseSearchAPI.aspx", + CivilSpider.url, meta={ "zyte_api_automap": { "httpResponseHeaders": True, @@ -70,129 +51,14 @@ def start_requests(self): errback=self.handle_error, ) - def parse(self, response): - case_info = self.get_case_info(response) - case_info["events"] = self.get_activities(response) - - self._success(response) - - return case_info - - def get_case_info(self, response): - case_number = response.xpath( - "//span[@id='MainContent_lblCaseNumber']/text()" - ).get() - calendar = response.xpath("//span[@id='MainContent_lblCalendar']/text()").get() - filing_date = response.xpath( - "//span[@id='MainContent_lblDateFiled']/text()" - ).get() - division = response.xpath(".//span[@id='MainContent_lblDivision']/text()").get() - case_type = response.xpath("//span[@id='MainContent_lblCaseType']/text()").get() - - plaintiffs = response.xpath( - "//td/span[@id='MainContent_lblPlaintiffs']/text()" - ).getall() - - defendants = response.xpath( - "//td/span[@id='MainContent_lblDefendants']/text()" - ).getall() - - attorneys = response.xpath( - "//td/span[@id='MainContent_lblAttorney']/text()" - ).getall() - - ad_damnum = response.xpath("//span[@id='MainContent_lblAdDamnum']/text()").get() - - return { - "case_number": case_number.strip(), - "calendar": calendar.strip(), - "filing_date": filing_date.strip(), - "division": division.strip(), - "case_type": case_type.strip(), - "ad_damnum": 
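# Worked example of ChancerySpider.get_case_numbers(): str.format() first bakes the
# year and the CH prefix into a printf-style template, then the %-operator fills in
# the zero-padded serial. Values are taken from CASE_FORMAT above; extra keys such
# as "start" and "end" are simply ignored by format().
case_type = {"start": 0, "end": 999999, "serial_format": "%05d"}
base_case_num = "{year}CH{serial_format}".format(year=2022, **case_type)
print(base_case_num)           # 2022CH%05d
print(base_case_num % 1)       # 2022CH00001
print(base_case_num % 654321)  # 2022CH654321 -- %05d is only a minimum width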
ad_damnum.strip(), - "plaintiffs": [plaintiff.strip() for plaintiff in plaintiffs], - "defendants": [defendant.strip() for defendant in defendants], - "attorneys": [attorney.strip() for attorney in attorneys], - } - - def get_activities(self, response): - case_activities = [] - - case_activity_tables = response.xpath( - ".//td[contains(text(), 'Activity Date')]/ancestor::table" + def get_case_numbers(self, year): + base_case_num = "{year}{district}{type}{serial_format}".format( + year=year, **self.case_type ) - for activity_table in case_activity_tables: - activity = {} - cells = activity_table.xpath("./tbody/tr/td") - - for i in range(0, len(cells), 2): - key = cells[i].xpath("./text()").get().strip(": \n") - value = cells[i + 1].xpath("./text()").get() - if value is None: - value = "" - activity[key] = value.strip() - - case_activities.append( - { - "description": activity["Event Desc"], - "date": activity["Activity Date"], - "comments": activity["Comments"], - } - ) - - return case_activities[::-1] - - def handle_error(self, failure): - if failure.check(HttpError): - response = failure.value.response - if response.status == 404: - self._missing_case(response) - elif response.status == 500: - self._failing_responses(response) - else: - self.logger.error(repr(failure)) - - def _missing_case(self, response): - missing_case_number = response.meta["case_number"] - if self.last_successful_case_number is None: - self.misses.add(missing_case_number) - elif missing_case_number > self.last_successful_case_number: - self.misses.add(missing_case_number) - - if self.misses: - self.logger.info(f'misses: {", ".join(sorted(self.misses))}') - - if len(self.misses) > 50: - breakpoint() - raise CloseSpider("run of missing case number") - - def _failing_responses(self, response): - failing_case_number = response.meta["case_number"] - self.failures.add(failing_case_number) - - self.logger.info(f'failures: {", ".join(sorted(self.failures))}') - - if len(self.failures) > 20: - raise CloseSpider("run of failures") - - def _success(self, response): - successful_case_number = response.meta["case_number"] - - if self.last_successful_case_number is None: - self.last_successful_case_number = successful_case_number - elif self.last_successful_case_number < successful_case_number: - self.last_successful_case_number = successful_case_number - - if successful_case_number == self.last_successful_case_number: - self.misses = { - case_number - for case_number in self.misses - if case_number > successful_case_number - } - - if hasattr(response, "raw_api_response"): - self.failures = set() + for serial in range(self.case_type["start"], self.case_type["end"] + 1): + case_number = base_case_num % serial + yield case_number DIVISIONS = { diff --git a/scripts/foreign_key.sql b/scripts/foreign_key.sql index caf9e76..488aa51 100644 --- a/scripts/foreign_key.sql +++ b/scripts/foreign_key.sql @@ -1,7 +1,18 @@ -update plaintiff set case_number = court_case.case_number from court_case where plaintiff.case_number = court_case._key; +UPDATE plaintiff SET case_number = court_case.case_number -update attorney set case_number = court_case.case_number from court_case where attorney.case_number = court_case._key; -update defendant set case_number = court_case.case_number from court_case where defendant.case_number = court_case._key; -update event set case_number = court_case.case_number from court_case where event.case_number = court_case._key; +FROM court_case +WHERE plaintiff.case_number = court_case._key; + +UPDATE attorney SET 
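# Toy demonstration of what scripts/foreign_key.sql is doing: after csvs-to-sqlite
# loads the CSVs, each child row's case_number column still holds the parent row's
# _key, and UPDATE ... FROM swaps in the real case number. Table contents are
# invented, and UPDATE ... FROM needs SQLite 3.33 or newer.
import sqlite3

con = sqlite3.connect(":memory:")
con.executescript(
    """
    CREATE TABLE court_case (_key INTEGER, case_number TEXT);
    CREATE TABLE plaintiff (case_number TEXT, plaintiff TEXT);
    INSERT INTO court_case VALUES (1, '2022CH00001');
    INSERT INTO plaintiff VALUES ('1', 'ACME LLC');
    """
)
con.execute(
    """
    UPDATE plaintiff SET case_number = court_case.case_number
    FROM court_case WHERE plaintiff.case_number = court_case._key
    """
)
print(con.execute("SELECT * FROM plaintiff").fetchall())  # [('2022CH00001', 'ACME LLC')]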
case_number = court_case.case_number +FROM court_case +WHERE attorney.case_number = court_case._key; + +UPDATE defendant SET case_number = court_case.case_number +FROM court_case +WHERE defendant.case_number = court_case._key; + +UPDATE event SET case_number = court_case.case_number +FROM court_case +WHERE event.case_number = court_case._key; diff --git a/scripts/hash.py b/scripts/hash.py new file mode 100644 index 0000000..038dcb1 --- /dev/null +++ b/scripts/hash.py @@ -0,0 +1,12 @@ +# From https://www.doc.ic.ac.uk/~nuric/coding/how-to-hash-a-dictionary-in-python.html + +import hashlib +import json + + +def dict_hash(dictionary): + """MD5 hash of a dictionary.""" + dhash = hashlib.md5() + encoded = json.dumps(dictionary, sort_keys=True).encode() + dhash.update(encoded) + return dhash.hexdigest() diff --git a/scripts/remove_dupe_cases.py b/scripts/remove_dupe_cases.py index 341a242..a2cfeaf 100644 --- a/scripts/remove_dupe_cases.py +++ b/scripts/remove_dupe_cases.py @@ -4,8 +4,7 @@ previous_case_number = None for line in sys.stdin: data = json.loads(line) - case_number = data['case_number'] + case_number = data["case_number"] if case_number != previous_case_number: print(line) previous_case_number = case_number - diff --git a/scripts/to_scrape.chancery.sql b/scripts/to_scrape.chancery.sql new file mode 100644 index 0000000..dbb3834 --- /dev/null +++ b/scripts/to_scrape.chancery.sql @@ -0,0 +1,23 @@ +-- Generates a priority queue of cases to re-scrape +-- Inspired by Cho and Molina, Estimating Frequency of Change +-- https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=60c8e42055dfb80072a547c73fbc18dfbacc20aa +WITH +overall_rate AS ( + SELECT + sum( + 1 / (julianday(last_checked_at) - julianday(updated_at)) + ) / count(*) FILTER ( + WHERE julianday(last_checked_at) > julianday(updated_at) + ) AS rate, + 3 AS prior_weight + FROM court_case +) + +SELECT case_number +FROM court_case +INNER JOIN overall_rate ON 1 = 1 +WHERE court = "chancery" +ORDER BY + ((prior_weight + 1) / (prior_weight / rate + julianday(last_checked_at) - julianday(updated_at))) + * (julianday('now') - julianday(last_checked_at)) DESC +LIMIT 3000; diff --git a/scripts/to_scrape.civil.sql b/scripts/to_scrape.civil.sql new file mode 100644 index 0000000..83ef04d --- /dev/null +++ b/scripts/to_scrape.civil.sql @@ -0,0 +1,23 @@ +-- Generates a priority queue of cases to re-scrape +-- Inspired by Cho and Molina, Estimating Frequency of Change +-- https://citeseerx.ist.psu.edu/document?repid=rep1&type=pdf&doi=60c8e42055dfb80072a547c73fbc18dfbacc20aa +WITH +overall_rate AS ( + SELECT + sum( + 1 / (julianday(last_checked_at) - julianday(updated_at)) + ) / count(*) FILTER ( + WHERE julianday(last_checked_at) > julianday(updated_at) + ) AS rate, + 3 AS prior_weight + FROM court_case +) + +SELECT case_number +FROM court_case +INNER JOIN overall_rate ON 1 = 1 +WHERE court = "civil" +ORDER BY + ((prior_weight + 1) / (prior_weight / rate + julianday(last_checked_at) - julianday(updated_at))) + * (julianday('now') - julianday(last_checked_at)) DESC +LIMIT 3000; diff --git a/scripts/update.sql b/scripts/update.sql new file mode 100644 index 0000000..c5f9728 --- /dev/null +++ b/scripts/update.sql @@ -0,0 +1,39 @@ +-- Given a CSV of re-scraped cases, update the matching cases in the database +CREATE TEMPORARY TABLE raw_case ( + ad_damnum text, + calendar text, + case_number text, + case_type text, + court text, + division text, + filing_date text, + hash text, + scraped_at text, + updated_at text +); + +.mode csv -- noqa +.import 
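# A rough Python rendering of the priority score used in scripts/to_scrape.*.sql,
# following the Cho and Molina change-frequency idea cited in the SQL: each case's
# observed change rate is smoothed toward the collection-wide rate with a prior
# weight of 3, then multiplied by how many days the case has gone unchecked.
# Field names mirror the query; the sample dates and overall_rate are invented.
from datetime import date

PRIOR_WEIGHT = 3


def priority(case, overall_rate, today):
    observed_days = (case["last_checked_at"] - case["updated_at"]).days
    smoothed_rate = (PRIOR_WEIGHT + 1) / (PRIOR_WEIGHT / overall_rate + observed_days)
    staleness_days = (today - case["last_checked_at"]).days
    return smoothed_rate * staleness_days


case = {
    "updated_at": date(2023, 1, 1),       # last time a scrape saw this case change
    "last_checked_at": date(2023, 3, 1),  # last time this case was re-scraped
}
print(priority(case, overall_rate=0.05, today=date(2023, 6, 1)))  # ~3.09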
/dev/stdin raw_case -- noqa + +-- Update cases that have changed (i.e. their hashes are different) +UPDATE court_case +SET + calendar = raw_case.calendar, + filing_date = raw_case.filing_date, + division = raw_case.division, + case_type = raw_case.case_type, + ad_damnum = raw_case.ad_damnum, + court = raw_case.court, + hash = raw_case.hash, + scraped_at = raw_case.scraped_at, + updated_at = raw_case.scraped_at +FROM raw_case +WHERE + court_case.case_number = raw_case.case_number AND court_case.hash != raw_case.hash; + +-- For cases that haven't changed, just update their scraped_at field +UPDATE court_case +SET scraped_at = raw_case.scraped_at +FROM raw_case +WHERE + court_case.case_number = raw_case.case_number AND court_case.hash = raw_case.hash;
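# A hedged sqlite3/Python equivalent of the merge above, for running the re-scrape
# update without the sqlite3 CLI and its .import. The function name and default CSV
# path are assumptions; the column list mirrors the temporary raw_case table, and
# the CSV header is read directly instead of being stripped with tail -n +2 as in
# the Makefile.
import csv
import sqlite3

COLUMNS = [
    "ad_damnum", "calendar", "case_number", "case_type", "court",
    "division", "filing_date", "hash", "scraped_at", "updated_at",
]

CHANGED = """
    UPDATE court_case SET
        calendar = :calendar, filing_date = :filing_date, division = :division,
        case_type = :case_type, ad_damnum = :ad_damnum, court = :court,
        hash = :hash, scraped_at = :scraped_at, updated_at = :scraped_at
    WHERE case_number = :case_number AND hash != :hash
"""

UNCHANGED = """
    UPDATE court_case SET scraped_at = :scraped_at
    WHERE case_number = :case_number AND hash = :hash
"""


def apply_rescrape(db_path="cases.db", csv_path="rescraped_civil_cases.csv"):
    con = sqlite3.connect(db_path)
    with open(csv_path, newline="") as f:
        rows = [{key: row[key] for key in COLUMNS} for row in csv.DictReader(f)]
    with con:  # commit both passes together
        con.executemany(CHANGED, rows)
        con.executemany(UNCHANGED, rows)
    con.close()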