Cincinnati Board of Education scraper second build

City-Bureau · Dec 30, 2024 · dfaf432 · dfaf432
1 parent 975f98e
commit dfaf432
Show file tree

Hide file tree

Showing 3 changed files with 160 additions and 0 deletions.
diff --git a/city_scrapers/spiders/cinoh_Board_of_Ed.py b/city_scrapers/spiders/cinoh_Board_of_Ed.py
@@ -0,0 +1,84 @@
+from datetime import datetime
+import scrapy
+from city_scrapers_core.constants import COMMITTEE, NOT_CLASSIFIED, BOARD
+from city_scrapers_core.items import Meeting
+from city_scrapers_core.spiders import CityScrapersSpider
+from dateutil.parser import parse
+from dateutil.relativedelta import relativedelta
+
+
+class CinohBoardOfEdSpider(CityScrapersSpider):
+    """Spider for Cincinnati Board of Education meetings listed on BoardDocs."""
+
+    name = "cinoh_Board_of_Ed"
+    agency = "Cincinnati Board of Education"
+    timezone = "America/New_York"
+    # Committee id sent with the list request and embedded in agenda links.
+    committee_id = "A9HCZC3376F4"
+    custom_settings = {
+        "ROBOTSTXT_OBEY": False,
+    }
+
+    # The public page is
+    # https://go.boarddocs.com/oh/cps/Board.nsf/Public#tab-welcome
+    # Its HTML is not scrapable, but clicking on the meetings brings up two
+    # API endpoints containing the data for the meeting list. Only one of
+    # them could be scraped, which is the one requested below. The approach
+    # is based on an earlier scraper for the BoardDocs website:
+    # https://github.com/City-Bureau/city-scrapers-cinoh/pull/10
+    def start_requests(self):
+        """Request the meeting list, submitting ``committee_id`` as form data."""
+        url = "https://go.boarddocs.com/oh/cps/Board.nsf/BD-GetMeetingsList"
+        form_data = {"current_committee_id": self.committee_id}
+        yield scrapy.FormRequest(url, formdata=form_data, callback=self.parse)
+
+    def parse(self, response):
+        """Yield a ``Meeting`` for each listed entry from the last 12 months.
+
+        ``response`` is expected to carry a JSON list of dicts. Entries are
+        skipped when they have no ``numberdate``, when that value cannot be
+        parsed as a date, or when it is more than 12 months before now.
+        """
+        lower_limit = datetime.now() - relativedelta(months=12)
+        data = response.json()
+        # Every meeting gets the same hardcoded location.
+        location = {
+            "name": "Cincinnati Public Schools",
+            "address": (
+                "2651 Burnet Avenue, Mary A. Ronan Education Center Room 111, "
+                "Cincinnati, OH 45219"
+            ),
+        }
+
+        for item in data:
+            date = item.get("numberdate")
+            if date is None:
+                continue
+            try:
+                # Parsed once and reused for both the cutoff and the start.
+                start = parse(date)
+            except (ValueError, OverflowError):
+                # One malformed date should not abort the whole scrape.
+                continue
+            if start < lower_limit:
+                continue
+
+            meeting = Meeting(
+                title=item["name"],
+                description="",
+                classification=self._parse_classification(item),
+                start=start,
+                end=None,
+                all_day=False,
+                time_notes="",
+                location=location,
+                links=self._parse_links(item),
+                source=self._parse_source(response),
+            )
+
+            meeting["status"] = self._get_status(meeting)
+            meeting["id"] = self._get_id(meeting)
+
+            yield meeting
+
+    def _parse_classification(self, item):
+        """Classify the meeting from keywords in its ``name``.
+
+        "Committee" is checked before "Board", so a name containing both is
+        classified as a committee meeting.
+        """
+        name = item["name"]
+        if "Committee" in name:
+            return COMMITTEE
+        if "Board" in name:
+            return BOARD
+        return NOT_CLASSIFIED
+
+    def _parse_links(self, item):
+        """Return a single link to the meeting's detailed agenda.
+
+        The agenda is the full meeting agenda and also contains the meeting's
+        Zoom link. The URL is built from the item's ``unique`` id and the
+        spider's ``committee_id``.
+        """
+        href = (
+            "https://go.boarddocs.com/oh/cps/Board.nsf/Download-AgendaDetailed?"
+            f"open&id={item['unique']}&current_committee_id={self.committee_id}"
+        )
+        return [{"title": "Agenda and Zoom Meeting Link", "href": href}]
+
+    def _parse_source(self, response):
+        """Return the public BoardDocs page; ``response`` is not consulted."""
+        return "https://go.boarddocs.com/oh/cps/Board.nsf/Public#"
diff --git a/tests/files/cinoh_Board_of_Ed.json b/tests/files/cinoh_Board_of_Ed.json
diff --git a/tests/test_cinoh_Board_of_Ed.py b/tests/test_cinoh_Board_of_Ed.py
@@ -0,0 +1,75 @@
+from datetime import datetime
+from os.path import dirname, join
+
+import pytest
+from city_scrapers_core.constants import COMMITTEE, NOT_CLASSIFIED, BOARD
+from city_scrapers_core.utils import file_response
+from freezegun import freeze_time
+
+from city_scrapers.spiders.cinoh_Board_of_Ed import CinohBoardOfEdSpider
+
+# Parse the saved API response once at import time; every test below reads
+# from parsed_items.
+test_response = file_response(
+    join(dirname(__file__), "files", "cinoh_Board_of_Ed.json"),
+    url="https://go.boarddocs.com/oh/cps/Board.nsf/BD-GetMeetingsList",
+)
+spider = CinohBoardOfEdSpider()
+
+# Freeze the clock so date-dependent results are reproducible. Using the
+# freezer as a context manager unfreezes the clock even if parsing raises.
+freezer = freeze_time("2024-12-30")
+with freezer:
+    parsed_items = list(spider.parse(test_response))
+
+
+def test_title():
+    """The first parsed meeting carries the expected title."""
+    first = parsed_items[0]
+    assert first["title"] == "Budget, Finance and Growth Committee Meeting"
+
+
+def test_description():
+    """The first parsed meeting has an empty description."""
+    first = parsed_items[0]
+    assert first["description"] == ""
+
+
+def test_start():
+    """The first parsed meeting starts at midnight on 2024-12-20."""
+    expected_start = datetime(2024, 12, 20, 0, 0)
+    first = parsed_items[0]
+    assert first["start"] == expected_start
+
+
+def test_end():
+    """The first parsed meeting has no end time."""
+    first = parsed_items[0]
+    assert first["end"] is None
+
+
+def test_time_notes():
+    """The first parsed meeting has empty time notes."""
+    first = parsed_items[0]
+    assert first["time_notes"] == ""
+
+
+def test_id():
+    """The first parsed meeting has the expected generated id."""
+    expected_id = "cinoh_Board_of_Ed/202412200000/x/budget_finance_and_growth_committee_meeting"
+    first = parsed_items[0]
+    assert first["id"] == expected_id
+
+
+def test_status():
+    """The first parsed meeting is reported with status "passed"."""
+    first = parsed_items[0]
+    assert first["status"] == "passed"
+
+
+def test_location():
+    """The first parsed meeting uses the fixed name and address."""
+    expected_location = {
+        "name": "Cincinnati Public Schools",
+        "address": "2651 Burnet Avenue, Mary A. Ronan Education Center Room 111, Cincinnati, OH 45219",
+    }
+    first = parsed_items[0]
+    assert first["location"] == expected_location
+
+def test_source():
+    """The first parsed meeting points at the public BoardDocs page."""
+    expected_source = "https://go.boarddocs.com/oh/cps/Board.nsf/Public#"
+    first = parsed_items[0]
+    assert first["source"] == expected_source
+
+
+def test_links():
+    """The first parsed meeting has exactly one link, to its agenda."""
+    agenda_link = {
+        "href": "https://go.boarddocs.com/oh/cps/Board.nsf/Download-AgendaDetailed?open&id=DC2QWY6B5DDA&current_committee_id=A9HCZC3376F4",
+        "title": "Agenda and Zoom Meeting Link",
+    }
+    first = parsed_items[0]
+    assert first["links"] == [agenda_link]
+
+
+def test_classification():
+    """The first parsed meeting is classified as a committee meeting."""
+    first = parsed_items[0]
+    assert first["classification"] == COMMITTEE
+
+
+@pytest.mark.parametrize("item", parsed_items)
+def test_all_day(item):
+    """No parsed meeting is flagged as an all-day event."""
+    all_day = item["all_day"]
+    assert all_day is False