Merge pull request #15 from City-Bureau/fix-spiders
🕷️ Fix: 58 Minn City spiders
SimmonsRitchie authored Feb 26, 2024
2 parents 7fa1645 + d3b333e commit 4636b96
Showing 186 changed files with 1,369 additions and 13,982 deletions.
2 changes: 1 addition & 1 deletion .deploy.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-pipenv run scrapy list | xargs -I {} pipenv run scrapy crawl {} -s LOG_ENABLED=False &
+pipenv run scrapy list | xargs -I {} pipenv run scrapy crawl {} -s LOG_ENABLED=True &
 
 # Output to the screen every 9 minutes to prevent a travis timeout
 # https://stackoverflow.com/a/40800348
10 changes: 5 additions & 5 deletions .github/workflows/archive.yml
@@ -19,10 +19,10 @@ jobs:
     steps:
       - uses: actions/checkout@v2
 
-      - name: Set up Python 3.8
+      - name: Set up Python 3.9
         uses: actions/setup-python@v1
         with:
-          python-version: 3.8
+          python-version: 3.9
 
       - name: Install Pipenv
         uses: dschep/install-pipenv-action@v1
@@ -31,15 +31,15 @@ jobs:
         uses: actions/cache@v1
         with:
           path: .venv
-          key: pip-3.8-${{ hashFiles('**/Pipfile.lock') }}
+          key: pip-3.9-${{ hashFiles('**/Pipfile.lock') }}
           restore-keys: |
-            pip-3.8-
+            pip-3.9-
             pip-
       - name: Install dependencies
         run: pipenv sync
         env:
-          PIPENV_DEFAULT_PYTHON_VERSION: 3.8
+          PIPENV_DEFAULT_PYTHON_VERSION: 3.9
 
       - name: Run scrapers
         run: |
6 changes: 5 additions & 1 deletion .github/workflows/ci.yml
@@ -15,7 +15,7 @@ jobs:
     strategy:
       max-parallel: 4
       matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: [3.9]
 
     steps:
       - uses: actions/checkout@v1
@@ -42,6 +42,10 @@ jobs:
         env:
           PIPENV_DEFAULT_PYTHON_VERSION: ${{ matrix.python-version }}
 
+      - name: Set up playwright
+        run: |
+          pipenv run playwright install firefox
+
       - name: Check imports with isort
         run: |
           pipenv run isort . --check-only
10 changes: 7 additions & 3 deletions .github/workflows/cron.yml
@@ -25,18 +25,22 @@ jobs:
     steps:
       - uses: actions/checkout@v1
 
-      - name: Set up Python 3.8
+      - name: Set up Python 3.9
         uses: actions/setup-python@v1
         with:
-          python-version: 3.8
+          python-version: 3.9
 
       - name: Install Pipenv
         uses: dschep/install-pipenv-action@v1
 
      - name: Install dependencies
         run: pipenv sync
         env:
-          PIPENV_DEFAULT_PYTHON_VERSION: 3.8
+          PIPENV_DEFAULT_PYTHON_VERSION: 3.9
 
+      - name: Set up playwright
+        run: |
+          pipenv run playwright install firefox
+
       - name: Run scrapers
         run: |
13 changes: 7 additions & 6 deletions Pipfile
@@ -4,19 +4,20 @@ verify_ssl = true
 name = "pypi"
 
 [packages]
-jsonschema="3.0.2"
 python-dateutil = "*"
 pytz = "*"
 requests = "*"
 scrapy = "*"
-scrapy-sentry = "*"
+scrapy-sentry-errors = "1.0.0"
 scrapy-wayback-middleware = "*"
-city-scrapers-core = {extras = ["azure"],version = "*"}
 pdfminer-six = "*"
+city-scrapers-core = {ref = "main", git = "https://github.com/City-Bureau/city-scrapers-core.git", extras = ["azure"]}
+scrapy-playwright = "*"
 
 [dev-packages]
 freezegun = "*"
 pytest = "*"
 "flake8" = "*"
 isort = "*"
-black = "==19.10b0"
+black = "*"
 
 [requires]
 python_version = "3.9"
1,385 changes: 783 additions & 602 deletions Pipfile.lock

Large diffs are not rendered by default.

163 changes: 163 additions & 0 deletions city_scrapers/mixins/minn_city.py
@@ -0,0 +1,163 @@
import json
from datetime import datetime

import scrapy
from city_scrapers_core.constants import BOARD, CITY_COUNCIL, COMMITTEE, NOT_CLASSIFIED
from city_scrapers_core.items import Meeting
from city_scrapers_core.spiders import CityScrapersSpider


class MinnCityMixinMeta(type):
    """
    Metaclass that enforces the implementation of required static
    variables in child classes that inherit from MinnCityMixin.
    """

    def __init__(cls, name, bases, dct):
        required_static_vars = ["agency", "name", "committee_id", "meeting_type"]
        missing_vars = [var for var in required_static_vars if var not in dct]

        if missing_vars:
            missing_vars_str = ", ".join(missing_vars)
            raise NotImplementedError(
                f"{name} must define the following static variable(s): {missing_vars_str}."  # noqa
            )

        super().__init__(name, bases, dct)


class MinnCityMixin(CityScrapersSpider, metaclass=MinnCityMixinMeta):
    timezone = "America/North_Dakota/Beulah"
    from_date = datetime.today()
    base_url = "https://lims.minneapolismn.gov/Calendar/GetCalenderList"
    to_date = ""
    links = [
        {
            "title": "Meeting materials (council)",
            "href": "https://lims.minneapolismn.gov/Boards/Meetings/Council",
        },
        {
            "title": "Meeting materials (independent bodies)",
            "href": "https://lims.minneapolismn.gov/IndependentBodies/Meetings",
        },
        {
            "title": "Meeting materials (boards)",
            "href": "https://lims.minneapolismn.gov/Boards/Meetings",
        },
    ]
    name = None
    agency = None
    committee_id = None
    meeting_type = None

    def start_requests(self):
        """
        Create a GET request to the city's Calendar endpoint with
        the appropriate query parameters. We use a headless browser
        (scrapy-playwright) to handle our request because the city uses
        Cloudflare to detect and block requests from obvious bots.
        """
        full_url = f"{self.base_url}?fromDate={self.from_date}&toDate={self.to_date}&meetingType={self.meeting_type}&committeeId={self.committee_id}&pageCount=1000&offsetStart=0&abbreviation=&keywords=&sortOrder=1"  # noqa
        yield scrapy.Request(
            url=full_url,
            meta={"playwright": True},
            callback=self.parse,
        )

    def parse(self, response):
        """
        Extract JSON from the HTML response and parse it into a list of Meeting items.
        """
        json_data = response.css("pre::text").get()
        data = json.loads(json_data)
        for item in data:
            meeting = Meeting(
                title=str(item["CommitteeName"]),
                description=str(item["Description"]),
                classification=self._parse_classification(item),
                start=self._parse_start(item),
                end=None,
                all_day=False,
                time_notes="",
                location=self._parse_location(item),
                links=self._parse_links(item),
                source=self._parse_source(item),
            )
            if item["Cancelled"]:
                meeting["status"] = self._get_status(
                    meeting, text="Meeting is cancelled"
                )
            else:
                meeting["status"] = self._get_status(meeting)
            meeting["id"] = self._get_id(meeting)
            yield meeting

    def _parse_title(self, item):
        """Parse or generate meeting title."""
        return ""

    def _parse_description(self, item):
        """Parse or generate meeting description."""
        return ""

    def _parse_classification(self, item):
        """Parse or generate classification from title."""
        if not item["CommitteeName"]:
            return NOT_CLASSIFIED
        committee_name = item["CommitteeName"].lower()
        if "board" in committee_name:
            return BOARD
        elif "committee" in committee_name:
            return COMMITTEE
        elif "council" in committee_name:
            return CITY_COUNCIL
        else:
            return NOT_CLASSIFIED

    def _parse_start(self, item):
        """Parse start datetime as a naive datetime object."""
        return datetime.strptime(item["MeetingTime"], "%Y-%m-%dT%H:%M:%S")

    def _parse_end(self, item):
        """Parse end datetime as a naive datetime object. Added by pipeline if None."""
        return None

    def _parse_time_notes(self, item):
        """Parse any additional notes on the timing of the meeting."""
        return ""

    def _parse_all_day(self, item):
        """Parse or generate all-day status. Defaults to False."""
        return False

    def _parse_location(self, item):
        """Parse or generate location."""
        if (
            item["Location"] == "Online Meeting"
            or item["Address"] == "Online Meeting"
        ):
            address = "Remote"
        else:
            address = item["Address"]

        return {"address": address, "name": item["Location"]}

    def _parse_source(self, item):
        return "https://lims.minneapolismn.gov/Boards/Meetings/" + item["Abbreviation"]

    def _parse_links(self, item):
        """Parse or generate links."""
        new_links = self.links.copy()  # Copy the default links
        if "CommitteeReportDocument" in item and item["CommitteeReportDocument"]:
            new_links.append(
                {
                    "title": "Report Document",
                    "href": "https://lims.minneapolismn.gov/Download/CommitteeReport/"
                    + str(item["CommitteeReportDocumentId"])
                    + "/"
                    + str(item["CommitteeReportDocument"]).replace(" ", "-"),
                }
            )
        return new_links
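
For orientation, here is a minimal sketch of a concrete spider built on this mixin. The class name, spider name, agency, committee id, and meeting type are hypothetical placeholders for illustration, not values taken from this commit:

from city_scrapers.mixins.minn_city import MinnCityMixin


class MinnExampleCommitteeSpider(MinnCityMixin):
    # MinnCityMixinMeta requires every subclass to declare all four of
    # these attributes; omitting any of them raises NotImplementedError
    # at class-definition time.
    name = "minn_example_committee"  # hypothetical spider name
    agency = "Example Committee, Minneapolis"  # hypothetical agency
    committee_id = "150"  # hypothetical LIMS committee id
    meeting_type = "committee"  # hypothetical meeting-type value

Everything else (request construction, JSON parsing, classification, links) is inherited from the mixin.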
8 changes: 8 additions & 0 deletions city_scrapers/settings/base.py
@@ -53,3 +53,11 @@
 }
 
 CLOSESPIDER_ERRORCOUNT = 5
+
+# Playwright settings
+TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
+DOWNLOAD_HANDLERS = {
+    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
+    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
+}
+PLAYWRIGHT_BROWSER_TYPE = "firefox"
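
These settings register scrapy-playwright's download handlers and the asyncio reactor globally, but the headless browser is only used for requests that opt in. A minimal sketch of the opt-in, mirroring what the mixin above does (the spider name and URL are hypothetical):

import scrapy


class ExampleSpider(scrapy.Spider):
    name = "example"  # hypothetical spider

    def start_requests(self):
        # Only requests carrying the "playwright" meta flag are rendered
        # by headless Firefox; all other requests fall back to Scrapy's
        # default downloader.
        yield scrapy.Request(
            "https://example.com", meta={"playwright": True}
        )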
4 changes: 2 additions & 2 deletions city_scrapers/settings/prod.py
@@ -15,7 +15,7 @@
 
 EXTENSIONS = {
     "city_scrapers_core.extensions.AzureBlobStatusExtension": 100,
-    "scrapy_sentry.extensions.Errors": 10,
+    "scrapy_sentry_errors.extensions.Errors": 10,
     "scrapy.extensions.closespider.CloseSpider": None,
 }
 
@@ -50,4 +50,4 @@
 # "gs://{bucket}/%(year)s/%(month)s/%(day)s/%(hour_min)s/%(name)s.json"
 # ).format(bucket=GCS_BUCKET)
 
-FEED_PREFIX = "%Y/%m/%d"
\ No newline at end of file
+FEED_PREFIX = "%Y/%m/%d"