diff --git a/Pipfile b/Pipfile
index f05b880..f70e4ad 100644
--- a/Pipfile
+++ b/Pipfile
@@ -8,6 +8,7 @@ scrapy = "*"
 scrapy-sentry-errors = "*"
 city-scrapers-core = {ref = "main", git = "https://github.com/City-Bureau/city-scrapers-core.git", extras = ["azure"]}
 scrapy-wayback-middleware = "*"
+pytz = "*"
 
 [dev-packages]
 freezegun = "*"
diff --git a/Pipfile.lock b/Pipfile.lock
index 1a83def..b5c5526 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "946eb25eeda4cede0791ce08578a367d3c45f0602070d59469c6ca108918edcf"
+            "sha256": "41a29ca3648304f55a5f2dc5ab095a8ba7f7664ab88d3fc84a705483b7429094"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -499,6 +499,7 @@
                 "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812",
                 "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"
             ],
+            "index": "pypi",
             "version": "==2024.1"
         },
         "queuelib": {
@@ -661,15 +662,15 @@
                 "sha256:fd2b8500d64b909289e3541c201b4d672c0e7b458fc20e77bb37f0d71d93a75a"
             ],
             "index": "pypi",
-            "markers": "python_full_version >= '3.6.1' and python_version < '4.0'",
+            "markers": "python_version < '4.0' and python_full_version >= '3.6.1'",
             "version": "==0.3.3"
         },
         "sentry-sdk": {
             "hashes": [
-                "sha256:34ad8cfc9b877aaa2a8eb86bfe5296a467fffe0619b931a05b181c45f6da59bf",
-                "sha256:78575620331186d32f34b7ece6edea97ce751f58df822547d3ab85517881a27a"
+                "sha256:1bb9cf4ac317906d20787693b5e7f3e42160a90e8bbf1fc544f91c52fa76b68f",
+                "sha256:69fc5e7512371547207821d801485f45e3c62db629f02f56f58431a10864ac34"
             ],
-            "version": "==1.40.0"
+            "version": "==1.40.1"
         },
         "service-identity": {
             "hashes": [
@@ -692,7 +693,7 @@
                 "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
                 "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
             ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
             "version": "==1.16.0"
         },
         "tldextract": {
@@ -937,7 +938,7 @@
                 "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86",
                 "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"
             ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
             "version": "==2.8.2"
         },
         "six": {
@@ -945,7 +946,7 @@
                 "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926",
                 "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"
             ],
-            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2'",
+            "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
             "version": "==1.16.0"
         },
         "tomli": {
diff --git a/city_scrapers/spiders/sandie_city_council.py b/city_scrapers/spiders/sandie_city_council.py
new file mode 100644
index 0000000..ea2a4f4
--- /dev/null
+++ b/city_scrapers/spiders/sandie_city_council.py
@@ -0,0 +1,126 @@
+from datetime import datetime, timedelta
+
+import pytz
+from city_scrapers_core.constants import CITY_COUNCIL, COMMITTEE, NOT_CLASSIFIED
+from city_scrapers_core.items import Meeting
+from city_scrapers_core.spiders import CityScrapersSpider
+from scrapy import Request
+
+
+class SandieCityCouncilSpider(CityScrapersSpider):
+    name = "sandie_city_council"
+    agency = "San Diego City Council"
+    timezone = "America/Los_Angeles"
+    location = {
+        "address": "1600 Pacific Hwy., San Diego, California",
+        "name": "San Diego County Administration Center",
+    }
+    base_url = "https://www.sandiego.gov/city-clerk/officialdocs/meetings-calendar"
+
+    def start_requests(self):
+        """
+        Generate URLs for the current and next month pages
+        and yield scrapy Requests for them, with timestamps
+        adjusted to 1am on the first of the month in Los Angeles time.
+        """
+        # Local timezone
+        tz = pytz.timezone(self.timezone)
+
+        # Get start of current month at 1 AM
+        now = datetime.now(tz)
+        start_of_current_month = datetime(now.year, now.month, 1, 1, 0)
+        start_of_current_month = tz.localize(start_of_current_month)
+
+        # Get start of next month at 1 AM
+        next_month = start_of_current_month + timedelta(days=31)
+        start_of_next_month = datetime(next_month.year, next_month.month, 1, 1, 0)
+        start_of_next_month = tz.localize(start_of_next_month)
+
+        # Generate timestamps for URLs
+        current_month_ts = int(start_of_current_month.timestamp())
+        next_month_ts = int(start_of_next_month.timestamp())
+
+        # Generate URLs
+        current_month_url = f"{self.base_url}?calendar_timestamp={current_month_ts}"
+        next_month_url = f"{self.base_url}?calendar_timestamp={next_month_ts}"
+
+        yield Request(url=current_month_url, callback=self.parse)
+        yield Request(url=next_month_url, callback=self.parse)
+
+    def parse(self, response):
+        """
+        Parse the calendar page and extract the events if there are any for a given day.
+        """
+        calendar = response.css(".calendar-view-table.calendar-view-month")
+        for day in calendar.css("tbody tr td.current-month"):
+            # get date
+            date_obj = self._parse_date(day)
+
+            # skip if no events
+            events = day.css("ul li")
+            if len(events) == 0:
+                continue
+
+            # loop through events
+            for event in events:
+                title = event.css("a::text").get()
+                if "legislative recess" in title.lower():
+                    continue
+                meeting = Meeting(
+                    title=title,
+                    description="",
+                    classification=self._parse_classification(title),
+                    start=self._parse_start(date_obj, event),
+                    end=None,
+                    all_day=False,
+                    time_notes="",
+                    location=self.location,
+                    links=self._parse_links(event),
+                    source=self._parse_source(response),
+                )
+                meeting["status"] = self._get_status(meeting)
+                meeting["id"] = self._get_id(meeting)
+                yield meeting
+
+    def _parse_date(self, day):
+        """Parse the calendar date from the day element."""
+        date_str = day.css(".calendar-view-day__number::attr('datetime')").get()
+        date_part = date_str[:10]
+        date_obj = datetime.strptime(date_part, "%Y-%m-%d").date()
+        return date_obj
+
+    def _parse_classification(self, title):
+        """Parse classification from title."""
+        clean_title = title.lower()
+        if "council" in clean_title:
+            return CITY_COUNCIL
+        if "committee" in clean_title or "authority" in clean_title:
+            return COMMITTEE
+        return NOT_CLASSIFIED
+
+    def _parse_start(self, date_obj, event):
+        """Parse start datetime as a naive datetime object."""
+        time_str = event.css("span.fine-print::text").get()
+        time_obj = datetime.strptime(time_str, "%I:%M %p").time()
+        return datetime.combine(date_obj, time_obj)
+
+    def _parse_location(self, item):
+        """Parse or generate location."""
+        return
+
+    def _parse_links(self, event):
+        """Parse links."""
+        links = [
+            {
+                "href": "https://sandiego.granicus.com/ViewPublisher.php?view_id=31",
+                "title": "Webcasts",
+            }
+        ]
+        agenda_link = event.css("a::attr('href')").get()
+        if agenda_link:
+            links.append({"href": agenda_link, "title": "Meeting materials"})
+        return links
+
+    def _parse_source(self, response):
+        """Parse or generate source.""" 
+        return response.url