Cincinnati Board of Education scraper second build
vsspnkkvr committed Dec 30, 2024
1 parent 975f98e commit dfaf432
Showing 3 changed files with 160 additions and 0 deletions.
84 changes: 84 additions & 0 deletions city_scrapers/spiders/cinoh_Board_of_Ed.py
@@ -0,0 +1,84 @@
from datetime import datetime

import scrapy
from city_scrapers_core.constants import BOARD, COMMITTEE, NOT_CLASSIFIED
from city_scrapers_core.items import Meeting
from city_scrapers_core.spiders import CityScrapersSpider
from dateutil.parser import parse
from dateutil.relativedelta import relativedelta


class CinohBoardOfEdSpider(CityScrapersSpider):
    name = "cinoh_Board_of_Ed"
    agency = "Cincinnati Board of Education"
    timezone = "America/New_York"
    committee_id = "A9HCZC3376F4"
    custom_settings = {
        "ROBOTSTXT_OBEY": False,
    }

    # The original URL is https://go.boarddocs.com/oh/cps/Board.nsf/Public#tab-welcome.
    # The HTML there is not scrapable directly, but clicking a meeting reveals two API
    # endpoints that return the meeting list data; only the endpoint below could be
    # scraped. The approach follows a previous scraper for the BoardDocs site:
    # https://github.com/City-Bureau/city-scrapers-cinoh/pull/10
    def start_requests(self):
        url = "https://go.boarddocs.com/oh/cps/Board.nsf/BD-GetMeetingsList"
        form_data = {"current_committee_id": self.committee_id}
        yield scrapy.FormRequest(url, formdata=form_data, callback=self.parse)

    def parse(self, response):
        # Skip meetings more than 12 months in the past
        lower_limit = datetime.now() - relativedelta(months=12)
        data = response.json()
        # The API response does not include a location, so it is hardcoded
        location = {
            "name": "Cincinnati Public Schools",
            "address": "2651 Burnet Avenue, Mary A. Ronan Education Center Room 111, Cincinnati, OH 45219",
        }

        for item in data:
            date = item.get("numberdate")
            if date is None:
                continue
            meeting_date = parse(date)
            if meeting_date < lower_limit:
                continue

            meeting = Meeting(
                title=item["name"],
                description="",
                classification=self._parse_classification(item),
                start=meeting_date,
                end=None,
                all_day=False,
                time_notes="",
                location=location,
                links=self._parse_links(item),
                source=self._parse_source(response),
            )

            meeting["status"] = self._get_status(meeting)
            meeting["id"] = self._get_id(meeting)

            yield meeting

    def _parse_classification(self, item):
        if "Committee" in item["name"]:
            return COMMITTEE
        elif "Board" in item["name"]:
            return BOARD
        else:
            return NOT_CLASSIFIED

    def _parse_links(self, item):
        """Generate links.

        Each link points to the detailed meeting agenda, which also contains
        the meeting's Zoom link.
        """
        href = (
            f"https://go.boarddocs.com/oh/cps/Board.nsf/Download-AgendaDetailed?"
            f"open&id={item['unique']}&current_committee_id={self.committee_id}"
        )
        return [{"title": "Agenda and Zoom Meeting Link", "href": href}]

    def _parse_source(self, response):
        """Generate source."""
        return "https://go.boarddocs.com/oh/cps/Board.nsf/Public#"
1 change: 1 addition & 0 deletions tests/files/cinoh_Board_of_Ed.json

Large diffs are not rendered by default.

75 changes: 75 additions & 0 deletions tests/test_cinoh_Board_of_Ed.py
@@ -0,0 +1,75 @@
from datetime import datetime
from os.path import dirname, join

import pytest
from city_scrapers_core.constants import COMMITTEE
from city_scrapers_core.utils import file_response
from freezegun import freeze_time

from city_scrapers.spiders.cinoh_Board_of_Ed import CinohBoardOfEdSpider

test_response = file_response(
    join(dirname(__file__), "files", "cinoh_Board_of_Ed.json"),
    url="https://go.boarddocs.com/oh/cps/Board.nsf/BD-GetMeetingsList",
)
spider = CinohBoardOfEdSpider()

freezer = freeze_time("2024-12-30")
freezer.start()

parsed_items = [item for item in spider.parse(test_response)]

freezer.stop()


def test_title():
    assert parsed_items[0]["title"] == "Budget, Finance and Growth Committee Meeting"


def test_description():
    assert parsed_items[0]["description"] == ""


def test_start():
    assert parsed_items[0]["start"] == datetime(2024, 12, 20, 0, 0)


def test_end():
    assert parsed_items[0]["end"] is None


def test_time_notes():
    assert parsed_items[0]["time_notes"] == ""


def test_id():
    assert (
        parsed_items[0]["id"]
        == "cinoh_Board_of_Ed/202412200000/x/budget_finance_and_growth_committee_meeting"
    )


def test_status():
    assert parsed_items[0]["status"] == "passed"


def test_location():
    assert parsed_items[0]["location"] == {
        "name": "Cincinnati Public Schools",
        "address": "2651 Burnet Avenue, Mary A. Ronan Education Center Room 111, Cincinnati, OH 45219",
    }


def test_source():
    assert parsed_items[0]["source"] == "https://go.boarddocs.com/oh/cps/Board.nsf/Public#"


def test_links():
    assert parsed_items[0]["links"] == [
        {
            "href": (
                "https://go.boarddocs.com/oh/cps/Board.nsf/Download-AgendaDetailed"
                "?open&id=DC2QWY6B5DDA&current_committee_id=A9HCZC3376F4"
            ),
            "title": "Agenda and Zoom Meeting Link",
        }
    ]


def test_classification():
    assert parsed_items[0]["classification"] == COMMITTEE


@pytest.mark.parametrize("item", parsed_items)
def test_all_day(item):
    assert item["all_day"] is False

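Assuming the standard city_scrapers development setup, the new tests run against the committed fixture with pytest, for example: pytest tests/test_cinoh_Board_of_Ed.py. The 2024-12-30 freeze date matches the commit date, so the status assertion stays stable relative to the December 20, 2024 meeting in the fixture.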