Merge pull request #15 from City-Bureau/fix-spiders
🕷️ Fix: 58 Minn City spiders
SimmonsRitchie authored Feb 26, 2024
2 parents 7fa1645 + d3b333e commit 4636b96
Showing 186 changed files with 1,369 additions and 13,982 deletions.
2 changes: 1 addition & 1 deletion .deploy.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-pipenv run scrapy list | xargs -I {} pipenv run scrapy crawl {} -s LOG_ENABLED=False &
+pipenv run scrapy list | xargs -I {} pipenv run scrapy crawl {} -s LOG_ENABLED=True &
 
 # Output to the screen every 9 minutes to prevent a travis timeout
 # https://stackoverflow.com/a/40800348
10 changes: 5 additions & 5 deletions .github/workflows/archive.yml
@@ -19,10 +19,10 @@ jobs:
     steps:
       - uses: actions/checkout@v2
 
-      - name: Set up Python 3.8
+      - name: Set up Python 3.9
         uses: actions/setup-python@v1
         with:
-          python-version: 3.8
+          python-version: 3.9
 
       - name: Install Pipenv
         uses: dschep/install-pipenv-action@v1
@@ -31,15 +31,15 @@ jobs:
         uses: actions/cache@v1
         with:
           path: .venv
-          key: pip-3.8-${{ hashFiles('**/Pipfile.lock') }}
+          key: pip-3.9-${{ hashFiles('**/Pipfile.lock') }}
           restore-keys: |
-            pip-3.8-
+            pip-3.9-
             pip-
       - name: Install dependencies
         run: pipenv sync
         env:
-          PIPENV_DEFAULT_PYTHON_VERSION: 3.8
+          PIPENV_DEFAULT_PYTHON_VERSION: 3.9
 
       - name: Run scrapers
         run: |
6 changes: 5 additions & 1 deletion .github/workflows/ci.yml
@@ -15,7 +15,7 @@ jobs:
     strategy:
       max-parallel: 4
       matrix:
-        python-version: [3.6, 3.7, 3.8, 3.9]
+        python-version: [3.9]
 
     steps:
       - uses: actions/checkout@v1
@@ -42,6 +42,10 @@ jobs:
         env:
           PIPENV_DEFAULT_PYTHON_VERSION: ${{ matrix.python-version }}
 
+      - name: Set up playwright
+        run: |
+          pipenv run playwright install firefox
+
       - name: Check imports with isort
         run: |
           pipenv run isort . --check-only
10 changes: 7 additions & 3 deletions .github/workflows/cron.yml
@@ -25,18 +25,22 @@ jobs:
     steps:
       - uses: actions/checkout@v1
 
-      - name: Set up Python 3.8
+      - name: Set up Python 3.9
         uses: actions/setup-python@v1
         with:
-          python-version: 3.8
+          python-version: 3.9
 
       - name: Install Pipenv
         uses: dschep/install-pipenv-action@v1
 
      - name: Install dependencies
         run: pipenv sync
         env:
-          PIPENV_DEFAULT_PYTHON_VERSION: 3.8
+          PIPENV_DEFAULT_PYTHON_VERSION: 3.9
 
+      - name: Set up playwright
+        run: |
+          pipenv run playwright install firefox
+
       - name: Run scrapers
         run: |
13 changes: 7 additions & 6 deletions Pipfile
@@ -4,19 +4,20 @@ verify_ssl = true
 name = "pypi"
 
 [packages]
-jsonschema="3.0.2"
 python-dateutil = "*"
 pytz = "*"
 requests = "*"
 scrapy = "*"
-scrapy-sentry = "*"
+scrapy-sentry-errors = "1.0.0"
 scrapy-wayback-middleware = "*"
-city-scrapers-core = {extras = ["azure"],version = "*"}
 pdfminer-six = "*"
+city-scrapers-core = {ref = "main", git = "https://github.com/City-Bureau/city-scrapers-core.git", extras = ["azure"]}
+scrapy-playwright = "*"
 
 [dev-packages]
 freezegun = "*"
 pytest = "*"
 "flake8" = "*"
 isort = "*"
-black = "==19.10b0"
+black = "*"
 
 [requires]
 python_version = "3.9"
1,385 changes: 783 additions & 602 deletions Pipfile.lock

Large diffs are not rendered by default.

163 changes: 163 additions & 0 deletions city_scrapers/mixins/minn_city.py
@@ -0,0 +1,163 @@
import json
from datetime import datetime

import scrapy
from city_scrapers_core.constants import BOARD, CITY_COUNCIL, COMMITTEE, NOT_CLASSIFIED
from city_scrapers_core.items import Meeting
from city_scrapers_core.spiders import CityScrapersSpider


class MinnCityMixinMeta(type):
    """
    Metaclass that enforces the implementation of required static
    variables in child classes that inherit from MinnCityMixin.
    """

    def __init__(cls, name, bases, dct):
        required_static_vars = ["agency", "name", "committee_id", "meeting_type"]
        missing_vars = [var for var in required_static_vars if var not in dct]

        if missing_vars:
            missing_vars_str = ", ".join(missing_vars)
            raise NotImplementedError(
                f"{name} must define the following static variable(s): {missing_vars_str}."  # noqa
            )

        super().__init__(name, bases, dct)


class MinnCityMixin(CityScrapersSpider, metaclass=MinnCityMixinMeta):
    timezone = "America/North_Dakota/Beulah"
    from_date = datetime.today()
    base_url = "https://lims.minneapolismn.gov/Calendar/GetCalenderList"
    to_date = ""
    links = [
        {
            "title": "Meeting materials (council)",
            "href": "https://lims.minneapolismn.gov/Boards/Meetings/Council",
        },
        {
            "title": "Meeting materials (independent bodies)",
            "href": "https://lims.minneapolismn.gov/IndependentBodies/Meetings",
        },
        {
            "title": "Meeting materials (boards)",
            "href": "https://lims.minneapolismn.gov/Boards/Meetings",
        },
    ]
    name = None
    agency = None
    committee_id = None
    meeting_type = None

    def start_requests(self):
        """
        Create a GET request to the city's Calendar endpoint with
        the appropriate query parameters. We use a headless browser
        (scrapy-playwright) to handle our request because the city uses
        Cloudflare to detect and block requests from obvious bots.
        """
        full_url = f"{self.base_url}?fromDate={self.from_date}&toDate={self.to_date}&meetingType={self.meeting_type}&committeeId={self.committee_id}&pageCount=1000&offsetStart=0&abbreviation=&keywords=&sortOrder=1"  # noqa
        yield scrapy.Request(
            url=full_url,
            meta={"playwright": True},
            callback=self.parse,
        )

    def parse(self, response):
        """
        Extract JSON from the HTML response and parse it into a list of Meeting items.
        """
        json_data = response.css("pre::text").get()
        data = json.loads(json_data)
        for item in data:
            meeting = Meeting(
                title=str(item["CommitteeName"]),
                description=str(item["Description"]),
                classification=self._parse_classification(item),
                start=self._parse_start(item),
                end=None,
                all_day=False,
                time_notes="",
                location=self._parse_location(item),
                links=self._parse_links(item),
                source=self._parse_source(item),
            )
            if item["Cancelled"]:
                meeting["status"] = self._get_status(
                    meeting, text="Meeting is cancelled"
                )
            else:
                meeting["status"] = self._get_status(meeting)
            meeting["id"] = self._get_id(meeting)
            yield meeting

    def _parse_title(self, item):
        """Parse or generate meeting title."""
        return ""

    def _parse_description(self, item):
        """Parse or generate meeting description."""
        return ""

    def _parse_classification(self, item):
        """Parse or generate classification from title."""
        if not item["CommitteeName"]:
            return NOT_CLASSIFIED
        committee_name = item["CommitteeName"].lower()
        if "board" in committee_name:
            return BOARD
        elif "committee" in committee_name:
            return COMMITTEE
        elif "council" in committee_name:
            return CITY_COUNCIL
        else:
            return NOT_CLASSIFIED

    def _parse_start(self, item):
        """Parse start datetime as a naive datetime object."""
        return datetime.strptime(item["MeetingTime"], "%Y-%m-%dT%H:%M:%S")

    def _parse_end(self, item):
        """Parse end datetime as a naive datetime object. Added by pipeline if None."""
        return None

    def _parse_time_notes(self, item):
        """Parse any additional notes on the timing of the meeting."""
        return ""

    def _parse_all_day(self, item):
        """Parse or generate all-day status. Defaults to False."""
        return False

    def _parse_location(self, item):
        """Parse or generate location."""
        if (
            item["Location"] == "Online Meeting"
            or item["Address"] == "Online Meeting"
        ):
            address = "Remote"
        else:
            address = item["Address"]

        return {"address": address, "name": item["Location"]}

    def _parse_source(self, item):
        return "https://lims.minneapolismn.gov/Boards/Meetings/" + item["Abbreviation"]

    def _parse_links(self, item):
        """Parse or generate links."""
        new_links = self.links.copy()  # Copy the default links
        if "CommitteeReportDocument" in item and item["CommitteeReportDocument"]:
            new_links.append(
                {
                    "title": "Report Document",
                    "href": "https://lims.minneapolismn.gov/Download/CommitteeReport/"
                    + str(item["CommitteeReportDocumentId"])
                    + "/"
                    + str(item["CommitteeReportDocument"]).replace(" ", "-"),
                }
            )
        return new_links
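
For orientation, here is a minimal sketch of a concrete spider built on this mixin. The class name, spider name, agency, committee id, and meeting type are hypothetical placeholders for illustration, not values taken from this commit:

from city_scrapers.mixins.minn_city import MinnCityMixin


class MinnExampleCommitteeSpider(MinnCityMixin):
    # MinnCityMixinMeta requires every subclass to declare all four of
    # these attributes; omitting any of them raises NotImplementedError
    # at class-definition time.
    name = "minn_example_committee"  # hypothetical spider name
    agency = "Example Committee, Minneapolis"  # hypothetical agency
    committee_id = "150"  # hypothetical LIMS committee id
    meeting_type = "committee"  # hypothetical meeting-type value

Everything else (request construction, JSON parsing, classification, links) is inherited from the mixin.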
8 changes: 8 additions & 0 deletions city_scrapers/settings/base.py
@@ -53,3 +53,11 @@
 }
 
 CLOSESPIDER_ERRORCOUNT = 5
+
+# Playwright settings
+TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
+DOWNLOAD_HANDLERS = {
+    "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
+    "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
+}
+PLAYWRIGHT_BROWSER_TYPE = "firefox"
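
These settings register scrapy-playwright's download handlers and the asyncio reactor globally, but the headless browser is only used for requests that opt in. A minimal sketch of the opt-in, mirroring what the mixin above does (the spider name and URL are hypothetical):

import scrapy


class ExampleSpider(scrapy.Spider):
    name = "example"  # hypothetical spider

    def start_requests(self):
        # Only requests carrying the "playwright" meta flag are rendered
        # by headless Firefox; all other requests fall back to Scrapy's
        # default downloader.
        yield scrapy.Request(
            "https://example.com", meta={"playwright": True}
        )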
4 changes: 2 additions & 2 deletions city_scrapers/settings/prod.py
@@ -15,7 +15,7 @@
 
 EXTENSIONS = {
     "city_scrapers_core.extensions.AzureBlobStatusExtension": 100,
-    "scrapy_sentry.extensions.Errors": 10,
+    "scrapy_sentry_errors.extensions.Errors": 10,
     "scrapy.extensions.closespider.CloseSpider": None,
 }
 
@@ -50,4 +50,4 @@
 # "gs://{bucket}/%(year)s/%(month)s/%(day)s/%(hour_min)s/%(name)s.json"
 # ).format(bucket=GCS_BUCKET)
 
-FEED_PREFIX = "%Y/%m/%d"
\ No newline at end of file
+FEED_PREFIX = "%Y/%m/%d"