Fix spider: cuya_health
Removes the PDF parsing and spider-idle logic and refactors the spider to parse meeting links directly from the page.
SimmonsRitchie committed Feb 9, 2024
1 parent 704bd9e commit 50ddb05
Showing 4 changed files with 194 additions and 1,683 deletions.
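
In outline, the refactor replaces the download-each-PDF-and-extract pipeline with a single pass over the links on the meeting materials page, pulling each meeting's date out of the link text. Below is a minimal standalone sketch of that date-parsing step, using the same month-name regex that appears in the new parse_meeting_date helper in the diff; the sample link text is invented for illustration:

import re
from datetime import datetime

# Month-name pattern mirroring the new parse_meeting_date helper
PATTERN = (
    r"(January|February|March|April|May|June|July|August|September|"
    r"October|November|December)\s(\d{1,2}),\s(\d{4})"
)


def date_from_link_text(text):
    """Parse a datetime from link text like 'Board Agenda December 20, 2023'."""
    match = re.search(PATTERN, text)
    if not match:
        return None
    return datetime.strptime(
        f"{match.group(1)} {match.group(2)} {match.group(3)}", "%B %d %Y"
    )


print(date_from_link_text("Board Agenda December 20, 2023"))  # 2023-12-20 00:00:00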
147 changes: 91 additions & 56 deletions city_scrapers/spiders/cuya_health.py
@@ -1,14 +1,9 @@
 import re
-from collections import defaultdict
-from datetime import datetime, time
-from io import BytesIO, StringIO
+from datetime import datetime, timedelta

-import scrapy
 from city_scrapers_core.constants import BOARD
 from city_scrapers_core.items import Meeting
 from city_scrapers_core.spiders import CityScrapersSpider
-from pdfminer.high_level import extract_text_to_fp
-from pdfminer.layout import LAParams


 class CuyaHealthSpider(CityScrapersSpider):
@@ -21,64 +16,104 @@ class CuyaHealthSpider(CityScrapersSpider):
         "address": "5550 Venture Dr, Parma, OH 44130",
     }

-    @classmethod
-    def from_crawler(cls, crawler, *args, **kwargs):
-        """Connect to spider_idle signal and setup link_date_map for organizing links"""
-        spider = super().from_crawler(crawler, *args, **kwargs)
-        spider.link_date_map = defaultdict(list)
-        crawler.signals.connect(spider.spider_idle, signal=scrapy.signals.spider_idle)
-        return spider
-
-    def spider_idle(self):
-        """When the spider_idle signal is triggered, yield all scraped items"""
-        self.crawler.signals.disconnect(
-            self.spider_idle, signal=scrapy.signals.spider_idle
-        )
-        self.crawler.engine.crawl(
-            scrapy.Request(self.start_urls[0], callback=self._yield_meetings), self
-        )
-        raise scrapy.exceptions.DontCloseSpider
-
     def parse(self, response):
-        # Iterate through the first two year columns of meetings
-        for link in response.css(".articleContent > div > div")[:2].css("a"):
-            if ".pdf" in link.attrib["href"]:
-                yield response.follow(link.attrib["href"], callback=self._parse_pdf)
-
-    def _parse_pdf(self, response):
-        lp = LAParams(line_margin=5.0)
-        out_str = StringIO()
-        extract_text_to_fp(BytesIO(response.body), out_str, laparams=lp)
-        pdf_text = re.sub(r"\s+", " ", out_str.getvalue()).strip()
-        date_match = re.search(r"[A-Z][a-z]{2,8} \d{1,2},? \d{4}", pdf_text)
-        if not date_match:
-            return
-        date_obj = datetime.strptime(
-            date_match.group().replace(",", ""), "%B %d %Y"
-        ).date()
-        self.link_date_map[date_obj].append(
-            {
-                "title": "Agenda" if "agenda" in response.url.lower() else "Minutes",
-                "href": response.url,
-            }
-        )
-
-    def _yield_meetings(self, response):
-        for start_date, links in self.link_date_map.items():
"""
Collects a list of meetings from the meetings materials page using
each agenda link to create a meeting item. Attempts to gorup agenda
and minute links together based on their link titles.
"""
+        meetings_filtered = self._filter_meetings(response)
+        for item in meetings_filtered.values():
             meeting = Meeting(
-                title="Board of Health",
+                title=item["title"],
                 description="",
                 classification=BOARD,
-                start=datetime.combine(start_date, time(9)),
+                start=item["start"],
                 end=None,
                 all_day=False,
-                time_notes="Confirm details with agency",
+                time_notes="",
                 location=self.location,
-                links=links,
-                source=self.start_urls[0],
+                links=item["links"],
+                source=response.url,
             )

             meeting["status"] = self._get_status(meeting)
             meeting["id"] = self._get_id(meeting)

             yield meeting

+    def _filter_meetings(self, response):
+        """
+        Filters meetings from the response based on their date and title.
+        Skips meetings without a valid href or date and those older than
+        6 months. Groups 'agenda' and 'minutes' links based on the link
+        title and returns a dictionary of meetings with their respective links.
+        """
+        meetings = {}
+        minutes = {}
+        for link in response.css(".ea-card .ea-body p a"):
+            href = link.attrib["href"]
+            link_title = link.css("::text").extract_first()
+            start = self.parse_meeting_date(link_title)
+            # If the href or date cannot be parsed, skip the link
+            if not href or not start:
+                continue
+            # If the meeting is older than 6 months, skip it
+            if self._is_old_meeting(start):
+                continue
+            meeting_title = self._parse_title(link_title)
+            meeting_id_partial = re.sub(r"\s+", "", meeting_title.lower())
+            meeting_id = f"{start.date()}_{meeting_id_partial}"
+            if "agenda" in link_title.lower():
+                date_pretty = start.strftime("%b %d, %Y")
+                title = f"{meeting_title} meeting ({date_pretty})"
+                meetings[meeting_id] = {
+                    "title": title,
+                    "href": href,
+                    "start": start,
+                    "links": [{"href": href, "title": "Agenda"}],
+                }
+            elif "minutes" in link_title.lower():
+                minutes[meeting_id] = {"href": href, "title": "Minutes"}
+
+        # Merge the minutes into the meetings dictionary
+        for meeting_id, data in minutes.items():
+            if meeting_id in meetings:
+                meetings[meeting_id]["links"].append(data)
+        return meetings
+
+    def parse_meeting_date(self, meeting_string):
+        """
+        Parses a meeting date from various string formats and returns a
+        datetime object, or None if no date is found.
+        """
+        pattern = r"(January|February|March|April|May|June|July|August|September|October|November|December)\s(\d{1,2}),\s(\d{4})"  # noqa
+        match = re.search(pattern, meeting_string)
+        if match:
+            date_str = f"{match.group(1)} {match.group(2)} {match.group(3)}"
+            meeting_date = datetime.strptime(date_str, "%B %d %Y")
+            return meeting_date
+        return None
+
+    def _parse_title(self, meeting_string):
+        """
+        Parses a meeting title from a string, removing the
+        date and the words "minutes" or "agenda".
+        """
+        cleaned_title = re.sub(
+            r"(January|February|March|April|May|June|July|August|September|October|November|December)\s\d{1,2},\s\d{4}",  # noqa
+            "",
+            meeting_string,
+            flags=re.IGNORECASE,
+        )
+        cleaned_title = re.sub(
+            r"minutes|agenda", "", cleaned_title, flags=re.IGNORECASE
+        )
+        cleaned_title = cleaned_title.strip()
+        return cleaned_title
+
+    def _is_old_meeting(self, provided_date):
+        """
+        Checks if a provided datetime is older than 6 months from the
+        current date. Returns True if so, False otherwise.
+        """
+        six_months_ago = datetime.now() - timedelta(days=30 * 6)
+        return provided_date < six_months_ago
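
For clarity, here is a standalone sketch of the pairing logic in _filter_meetings above: agenda and minutes links are keyed by the meeting date plus the whitespace-stripped, lowercased title, so a minutes link that shares a key with an agenda entry gets appended to that entry's links list. The hrefs are placeholders:

import re
from datetime import datetime


def meeting_key(title, start):
    # Same key shape as _filter_meetings: "<ISO date>_<normalized title>"
    slug = re.sub(r"\s+", "", title.lower())
    return f"{start.date()}_{slug}"


key = meeting_key("Board", datetime(2023, 12, 20))  # "2023-12-20_board"

# Placeholder hrefs, for illustration only
meetings = {key: {"links": [{"href": "agenda.pdf", "title": "Agenda"}]}}
minutes = {key: {"href": "minutes.pdf", "title": "Minutes"}}

for k, data in minutes.items():
    if k in meetings:
        meetings[k]["links"].append(data)

print(meetings[key]["links"])
# [{'href': 'agenda.pdf', 'title': 'Agenda'}, {'href': 'minutes.pdf', 'title': 'Minutes'}]

Note that the 6-month cutoff in _is_old_meeting approximates six months as 30 * 6 = 180 days, which drifts slightly from calendar months.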
1,683 changes: 76 additions & 1,607 deletions tests/files/cuya_health.html

Large diffs are not rendered by default.

Binary file removed tests/files/cuya_health.pdf
Binary file not shown.
47 changes: 27 additions & 20 deletions tests/test_cuya_health.py
@@ -1,4 +1,3 @@
-from collections import defaultdict
 from datetime import datetime
 from os.path import dirname, join

@@ -11,68 +10,76 @@

 test_response = file_response(
     join(dirname(__file__), "files", "cuya_health.html"),
-    url="https://www.ccbh.net/board-minutes-agenda/",
-)
-test_pdf_response = file_response(
-    join(dirname(__file__), "files", "cuya_health.pdf"),
-    url="https://www.ccbh.net/wp-content/uploads/2019/04/REVISED-April-2019-Board-Agenda.pdf",  # noqa
-    mode="rb",
+    url="https://ccbh.net/board-minutes-agenda/",
 )

 spider = CuyaHealthSpider()

-freezer = freeze_time("2019-10-21")
+freezer = freeze_time("2024-02-07")
 freezer.start()

 parsed_items = [item for item in spider.parse(test_response)]
-spider.link_date_map = defaultdict(list)
-spider._parse_pdf(test_pdf_response)
-parsed_item = [item for item in spider._yield_meetings(test_response)][0]
+
+# Assuming this is the correct item you're testing; adjust as necessary
+parsed_item = parsed_items[0]

 freezer.stop()


 def test_count():
-    assert len(parsed_items) == 41
+    assert len(parsed_items) == 5


 def test_title():
-    assert parsed_item["title"] == "Board of Health"
+    assert parsed_item["title"] == "Board meeting (Dec 20, 2023)"


 def test_description():
     assert parsed_item["description"] == ""


 def test_start():
-    assert parsed_item["start"] == datetime(2019, 4, 24, 9, 0)
+    assert parsed_item["start"] == datetime(2023, 12, 20, 0, 0)


 def test_end():
     assert parsed_item["end"] is None


 def test_time_notes():
-    assert parsed_item["time_notes"] == "Confirm details with agency"
+    assert parsed_item["time_notes"] == ""  # Adjust if there's a specific note


 def test_id():
-    assert parsed_item["id"] == "cuya_health/201904240900/x/board_of_health"
+    assert parsed_item["id"] == "cuya_health/202312200000/x/board_meeting_dec_20_2023_"


 def test_status():
     assert parsed_item["status"] == PASSED


 def test_location():
-    assert parsed_item["location"] == spider.location
+    expected_location = {
+        "name": "Cuyahoga County Board of Health",
+        "address": "5550 Venture Dr, Parma, OH 44130",
+    }
+    assert parsed_item["location"] == expected_location


 def test_source():
-    assert parsed_item["source"] == test_response.url
+    assert parsed_item["source"] == "https://ccbh.net/board-minutes-agenda/"


 def test_links():
-    assert parsed_item["links"] == [{"href": test_pdf_response.url, "title": "Agenda"}]
+    expected_links = [
+        {
+            "href": "https://ccbh.net/wp-content/uploads/2024/01/01.00-December_2023_Board_Agenda-Revised_12-19-2023.pdf",  # noqa
+            "title": "Agenda",
+        },
+        {
+            "href": "https://ccbh.net/wp-content/uploads/2024/01/December_20_2023-Minutes-1.pdf",  # noqa
+            "title": "Minutes",
+        },
+    ]
+    assert parsed_item["links"] == expected_links


 def test_classification():
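
To run just this spider's updated tests, something like the following should work from the repository root, assuming pytest and freezegun are installed (the test module freezes time at 2024-02-07, so the fixture's meetings fall inside the 6-month window):

# Equivalent to `pytest tests/test_cuya_health.py -v` on the command line
import sys

import pytest

sys.exit(pytest.main(["tests/test_cuya_health.py", "-v"]))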
