Skip to content

Commit

Permalink
Build spider: sandie_city_council
Browse files Browse the repository at this point in the history
Includes installation of pytz for timezone handling in URL generation
  • Loading branch information
SimmonsRitchie committed Feb 6, 2024
1 parent 79b46d7 commit b1ae1db
Show file tree
Hide file tree
Showing 3 changed files with 136 additions and 8 deletions.
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ scrapy = "*"
scrapy-sentry-errors = "*"
city-scrapers-core = {ref = "main", git = "https://github.com/City-Bureau/city-scrapers-core.git", extras = ["azure"]}
scrapy-wayback-middleware = "*"
pytz = "*"

[dev-packages]
freezegun = "*"
Expand Down
17 changes: 9 additions & 8 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

126 changes: 126 additions & 0 deletions city_scrapers/spiders/sandie_city_council.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
from datetime import datetime, timedelta

import pytz
from city_scrapers_core.constants import CITY_COUNCIL, COMMITTEE, NOT_CLASSIFIED
from city_scrapers_core.items import Meeting
from city_scrapers_core.spiders import CityScrapersSpider
from scrapy import Request


class SandieCityCouncilSpider(CityScrapersSpider):
    """Spider for the San Diego City Clerk meetings calendar.

    Requests the calendar page for the current and the next month and yields
    one ``Meeting`` per calendar entry. Entries whose title contains
    "legislative recess" are ignored, as are entries whose title or start time
    cannot be read from the page (a warning is logged for the latter).
    """

    name = "sandie_city_council"
    agency = "San Diego City Council"
    timezone = "America/Los_Angeles"
    # Every meeting is reported with this fixed location; the calendar markup
    # is not inspected for a per-meeting location.
    location = {
        "address": "1600 Pacific Hwy., San Diego, California",
        "name": "San Diego County Administration Center",
    }
    base_url = "https://www.sandiego.gov/city-clerk/officialdocs/meetings-calendar"

    def start_requests(self):
        """
        Generate URLs for the current and next month pages
        and yield scrapy Requests for them, with timestamps
        adjusted to 1am on the first of the month in Los Angeles time.
        """
        tz = pytz.timezone(self.timezone)
        now = datetime.now(tz)

        # Explicit rollover arithmetic: December (12) wraps to January of the
        # following year, every other month simply increments.
        next_year = now.year + now.month // 12
        next_month = now.month % 12 + 1

        month_starts = (
            tz.localize(datetime(now.year, now.month, 1, 1, 0)),
            tz.localize(datetime(next_year, next_month, 1, 1, 0)),
        )
        for month_start in month_starts:
            timestamp = int(month_start.timestamp())
            yield Request(
                url=f"{self.base_url}?calendar_timestamp={timestamp}",
                callback=self.parse,
            )

    def parse(self, response):
        """
        Parse the calendar page and extract the events if there are any for a
        given day.

        Yields a ``Meeting`` for every usable event. Malformed entries are
        skipped individually so one bad cell cannot abort the whole page.
        """
        calendar = response.css(".calendar-view-table.calendar-view-month")
        for day in calendar.css("tbody tr td.current-month"):
            # Skip days without events before touching the date, so an empty
            # cell with unexpected markup is never a problem.
            events = day.css("ul li")
            if not events:
                continue

            date_obj = self._parse_date(day)
            if date_obj is None:
                self.logger.warning(
                    "Skipping calendar day with events but no readable date: %s",
                    response.url,
                )
                continue

            for event in events:
                # `.get()` returns None when the <li> has no anchor text.
                title = (event.css("a::text").get() or "").strip()
                if not title or "legislative recess" in title.lower():
                    continue

                start = self._parse_start(date_obj, event)
                if start is None:
                    self.logger.warning(
                        "Skipping %r on %s: no readable start time",
                        title,
                        date_obj,
                    )
                    continue

                meeting = Meeting(
                    title=title,
                    description="",
                    classification=self._parse_classification(title),
                    start=start,
                    end=None,
                    all_day=False,
                    time_notes="",
                    location=self.location,
                    links=self._parse_links(event, response),
                    source=self._parse_source(response),
                )
                meeting["status"] = self._get_status(meeting)
                meeting["id"] = self._get_id(meeting)
                yield meeting

    def _parse_date(self, day):
        """Parse the calendar date from the day element.

        Reads the ``datetime`` attribute of the day-number element and
        interprets its first ten characters as ``YYYY-MM-DD``.

        Returns a ``date``, or ``None`` when the attribute is missing or does
        not start with a valid date.
        """
        date_str = day.css(".calendar-view-day__number::attr('datetime')").get()
        if not date_str:
            return None
        try:
            return datetime.strptime(date_str.strip()[:10], "%Y-%m-%d").date()
        except ValueError:
            return None

    def _parse_classification(self, title):
        """Parse classification from title.

        "council" takes precedence over "committee"/"authority"; anything else
        is ``NOT_CLASSIFIED``. Matching is case-insensitive.
        """
        clean_title = title.lower()
        if "council" in clean_title:
            return CITY_COUNCIL
        if "committee" in clean_title or "authority" in clean_title:
            return COMMITTEE
        return NOT_CLASSIFIED

    def _parse_start(self, date_obj, event):
        """Parse start datetime as a naive datetime object.

        The time is read from the event's ``span.fine-print`` text and must
        look like ``10:00 AM`` (surrounding whitespace is tolerated).

        Returns ``None`` when the time is missing or not in that format.
        """
        time_str = event.css("span.fine-print::text").get()
        if not time_str:
            return None
        try:
            time_obj = datetime.strptime(time_str.strip(), "%I:%M %p").time()
        except ValueError:
            return None
        return datetime.combine(date_obj, time_obj)

    def _parse_location(self, item):
        """Parse or generate location.

        Unused placeholder that returns ``None``; ``parse`` uses the class
        level ``location`` instead.
        """
        return

    def _parse_links(self, event, response=None):
        """Parse links.

        Always includes the generic webcast page, followed by the event's own
        link when it has one. When ``response`` is given, the event link is
        resolved against the page URL so a relative ``href`` becomes absolute;
        without it the ``href`` is passed through unchanged.
        """
        links = [
            {
                "href": "https://sandiego.granicus.com/ViewPublisher.php?view_id=31",
                "title": "Webcasts",
            }
        ]
        agenda_link = event.css("a::attr('href')").get()
        if agenda_link:
            if response is not None:
                agenda_link = response.urljoin(agenda_link)
            links.append({"href": agenda_link, "title": "Meeting materials"})
        return links

    def _parse_source(self, response):
        """Parse or generate source: the URL of the calendar page scraped."""
        return response.url

0 comments on commit b1ae1db

Please sign in to comment.