generated from City-Bureau/city-scrapers-template
-
-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Cincinnati Board of Education scraper second build
- Loading branch information
Showing
3 changed files
with
160 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
from datetime import datetime | ||
import scrapy | ||
from city_scrapers_core.constants import COMMITTEE, NOT_CLASSIFIED, BOARD | ||
from city_scrapers_core.items import Meeting | ||
from city_scrapers_core.spiders import CityScrapersSpider | ||
from dateutil.parser import parse | ||
from dateutil.relativedelta import relativedelta | ||
|
||
|
||
class CinohBoardOfEdSpider(CityScrapersSpider):
    """Scrape Cincinnati Board of Education meetings from BoardDocs.

    The public page is
    https://go.boarddocs.com/oh/cps/Board.nsf/Public#tab-welcome
    Its HTML is not scrapable, but opening the meetings tab calls two API
    endpoints that carry the meeting list. Only one of them
    (``BD-GetMeetingsList``) could be scraped, and that is the one used here.
    The approach follows an earlier BoardDocs scraper:
    https://github.com/City-Bureau/city-scrapers-cinoh/pull/10
    """

    name = "cinoh_Board_of_Ed"
    agency = "Cincinnati Board of Education"
    timezone = "America/New_York"
    committee_id = "A9HCZC3376F4"
    custom_settings = {
        "ROBOTSTXT_OBEY": False,
    }

    # Root shared by the API endpoint, the agenda links and the source URL.
    base_url = "https://go.boarddocs.com/oh/cps/Board.nsf"

    # Hardcoded location: the meeting list entries used here carry no venue.
    location = {
        "name": "Cincinnati Public Schools",
        "address": (
            "2651 Burnet Avenue, Mary A. Ronan Education Center Room 111, "
            "Cincinnati, OH 45219"
        ),
    }

    def start_requests(self):
        """Request the meeting list, sending the committee id as form data."""
        url = f"{self.base_url}/BD-GetMeetingsList"
        form_data = {"current_committee_id": self.committee_id}
        yield scrapy.FormRequest(url, formdata=form_data, callback=self.parse)

    def parse(self, response):
        """Yield a ``Meeting`` for every listed entry from the last 12 months.

        The response body is decoded as JSON and iterated entry by entry.
        Entries with a missing or empty ``numberdate`` are skipped, as are
        entries dated more than 12 months before the current time.
        """
        lower_limit = datetime.now() - relativedelta(months=12)
        source = self._parse_source(response)

        for item in response.json():
            raw_date = item.get("numberdate")
            # Skip empty strings as well as None: the date parser rejects both.
            if not raw_date:
                continue

            # NOTE: the bare name ``parse`` is dateutil's parser imported at
            # module level, not this method.
            start = parse(raw_date)
            if start < lower_limit:
                continue

            meeting = Meeting(
                title=item["name"],
                description="",
                classification=self._parse_classification(item),
                start=start,
                end=None,
                all_day=False,
                time_notes="",
                location=self.location,
                links=self._parse_links(item),
                source=source,
            )

            meeting["status"] = self._get_status(meeting)
            meeting["id"] = self._get_id(meeting)

            yield meeting

    def _parse_classification(self, item):
        """Classify by title keyword; "Committee" wins over "Board"."""
        title = item["name"]
        if "Committee" in title:
            return COMMITTEE
        if "Board" in title:
            return BOARD
        return NOT_CLASSIFIED

    def _parse_links(self, item):
        """Generate links.

        Each link is to the full meeting agenda, which also contains the
        meeting's Zoom link.
        """
        href = (
            f"{self.base_url}/Download-AgendaDetailed?"
            f"open&id={item['unique']}&current_committee_id={self.committee_id}"
        )
        return [{"title": "Agenda and Zoom Meeting Link", "href": href}]

    def _parse_source(self, response):
        """Generate source: the public BoardDocs page, not the API URL."""
        return f"{self.base_url}/Public#"
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
from datetime import datetime | ||
from os.path import dirname, join | ||
|
||
import pytest | ||
from city_scrapers_core.constants import COMMITTEE, NOT_CLASSIFIED, BOARD | ||
from city_scrapers_core.utils import file_response | ||
from freezegun import freeze_time | ||
|
||
from city_scrapers.spiders.cinoh_Board_of_Ed import CinohBoardOfEdSpider | ||
|
||
test_response = file_response(
    join(dirname(__file__), "files", "cinoh_Board_of_Ed.json"),
    url="https://go.boarddocs.com/oh/cps/Board.nsf/BD-GetMeetingsList",
)
spider = CinohBoardOfEdSpider()

# Freeze "now" while parsing so the results do not depend on the day the tests
# run. The context manager guarantees the clock is unfrozen again even if
# parsing raises, which a bare start()/stop() pair does not.
freezer = freeze_time("2024-12-30")
with freezer:
    parsed_items = list(spider.parse(test_response))
|
||
def test_title():
    """The first parsed meeting carries the expected title."""
    expected = "Budget, Finance and Growth Committee Meeting"
    first = parsed_items[0]
    assert first["title"] == expected
|
||
|
||
def test_description():
    """The first parsed meeting has an empty description."""
    first = parsed_items[0]
    assert first["description"] == ""
|
||
|
||
def test_start():
    """The first parsed meeting starts at midnight on 2024-12-20."""
    expected = datetime(2024, 12, 20, 0, 0)
    first = parsed_items[0]
    assert first["start"] == expected
|
||
|
||
def test_end():
    """The first parsed meeting has no end time."""
    end = parsed_items[0]["end"]
    assert end is None
|
||
|
||
def test_time_notes():
    """The first parsed meeting has empty time notes."""
    first = parsed_items[0]
    assert first["time_notes"] == ""
|
||
|
||
def test_id():
    """The first parsed meeting has the expected generated id."""
    expected = "cinoh_Board_of_Ed/202412200000/x/budget_finance_and_growth_committee_meeting"  # noqa: E501
    first = parsed_items[0]
    assert first["id"] == expected
|
||
|
||
def test_status():
    """The first parsed meeting has status "passed"."""
    first = parsed_items[0]
    assert first["status"] == "passed"
|
||
|
||
def test_location():
    """The first parsed meeting uses the hardcoded district location."""
    expected = {
        "name": "Cincinnati Public Schools",
        "address": "2651 Burnet Avenue, Mary A. Ronan Education Center Room 111, Cincinnati, OH 45219",  # noqa: E501
    }
    first = parsed_items[0]
    assert first["location"] == expected
|
||
def test_source():
    """The first parsed meeting's source is the public BoardDocs page."""
    expected = "https://go.boarddocs.com/oh/cps/Board.nsf/Public#"
    first = parsed_items[0]
    assert first["source"] == expected
|
||
|
||
def test_links():
    """The first parsed meeting has a single agenda link.

    The query string must contain a literal ``&current_committee_id=``
    parameter; an HTML-entity-decoded ``&curren`` (the currency sign) in its
    place yields a broken URL.
    """
    expected_href = (
        "https://go.boarddocs.com/oh/cps/Board.nsf/Download-AgendaDetailed"
        "?open&id=DC2QWY6B5DDA&current_committee_id=A9HCZC3376F4"
    )
    assert parsed_items[0]["links"] == [
        {"href": expected_href, "title": "Agenda and Zoom Meeting Link"}
    ]
|
||
|
||
def test_classification():
    """The first parsed meeting is classified as a committee meeting."""
    first = parsed_items[0]
    assert first["classification"] == COMMITTEE
|
||
|
||
@pytest.mark.parametrize("item", parsed_items)
def test_all_day(item):
    """No parsed meeting is flagged as an all-day event."""
    all_day = item["all_day"]
    assert all_day is False