Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🏗️ Build spider: Atlantic City #9

Merged
merged 5 commits into from
Dec 23, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 127 additions & 0 deletions city_scrapers/spiders/atconj_Atlantic_City.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
import json
msrezaie marked this conversation as resolved.
Show resolved Hide resolved
from datetime import datetime
from urllib.parse import urljoin

import scrapy
from city_scrapers_core.constants import (
CANCELLED,
CITY_COUNCIL,
CLASSIFICATIONS,
NOT_CLASSIFIED,
PASSED,
TENTATIVE,
)
from city_scrapers_core.items import Meeting
from city_scrapers_core.spiders import CityScrapersSpider
from dateutil.parser import parse


class AlanticCitySpider(CityScrapersSpider):
    """Spider for Atlantic City, NJ public meetings.

    NOTE(review): class name is missing the first "t" of "Atlantic";
    kept as-is so any external references keep working.

    The agency's website loads meetings with JavaScript one month at a
    time, which makes the HTML hard to scrape directly. Instead this
    spider uses the site's JSON API endpoints:

    - ``meetings_url``: lists all meetings for a given date range.
    - ``meeting_detail_url``: returns the full record for a meeting id.

    A third URL, ``calender_source``, is used as each meeting's
    ``source`` field since the calendar page is more user friendly to
    navigate than the API endpoints.
    """

    name = "atconj_Atlantic_City"
    agency = "Atlantic City"
    timezone = "America/New_York"

    custom_settings = {
        "ROBOTSTXT_OBEY": False,
    }

    meeting_detail_url = "https://www.acnj.gov/api/data/GetMeeting?id="
    # "calender" spelling kept for backward compatibility with callers.
    calender_source = "https://www.acnj.gov/calendar"

    @property
    def meetings_url(self):
        """Build the listing URL covering ~6 months back through ~1 year ahead.

        The previous implementation hardcoded 06/01/2024-06/30/2025,
        which silently stops returning future meetings once that window
        passes; computing the window from today's date keeps the spider
        maintenance-free. Dates are embedded already URL-encoded
        (%2F == "/") exactly as the API expects.
        """
        today = datetime.now()
        start = (today - timedelta(days=180)).strftime("%m%%2F%d%%2F%Y")
        end = (today + timedelta(days=365)).strftime("%m%%2F%d%%2F%Y")
        return (
            "https://www.acnj.gov/api/data/GetCalendarMeetings"
            f"?end={end}+12:00+am&meetingTypeID=all&start={start}+12:00+am"
        )

    def start_requests(self):
        """Kick off the crawl with a single request to the listing API."""
        yield scrapy.Request(url=self.meetings_url, method="GET", callback=self.parse)

def parse(self, response):
data = json.loads(response.text)
msrezaie marked this conversation as resolved.
Show resolved Hide resolved
for item in data:
meeting_id = item["id"]
meeting_detail_url = self.meeting_detail_url + meeting_id

yield scrapy.Request(
url=meeting_detail_url,
method="GET",
callback=self.parse_meeting,
cb_kwargs={"item": item},
)

def parse_meeting(self, response, item):
meeting_detail = json.loads(response.text)

meeting = Meeting(
title=item["title"],
description="",
classification=self._parse_classification(meeting_detail),
start=parse(item["start"]),
end=None,
all_day=item["allDay"],
time_notes="",
location=self._parse_location(meeting_detail),
links=self._parse_links(meeting_detail),
source=self.calender_source,
)

meeting["status"] = self._get_status(meeting_detail)
meeting["id"] = int(item["id"])

yield meeting
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Add error handling for API responses

The parsing assumes the API response will always contain the expected fields. Consider adding error handling for missing or malformed data.

     def parse_meeting(self, response, item):
-        meeting_detail = json.loads(response.text)
+        try:
+            meeting_detail = json.loads(response.text)
+            if not isinstance(meeting_detail, dict):
+                raise ValueError("Expected dictionary response")
+        except (json.JSONDecodeError, ValueError) as e:
+            self.logger.error(f"Failed to parse meeting detail: {e}")
+            return

         meeting = Meeting(
-            title=item["title"],
+            title=item.get("title", ""),
             description="",
             classification=self._parse_classification(meeting_detail),
-            start=parse(item["start"]),
+            start=parse(item.get("start")) if item.get("start") else None,
             end=None,
-            all_day=item["allDay"],
+            all_day=item.get("allDay", False),
             time_notes="",
             location=self._parse_location(meeting_detail),
             links=self._parse_links(meeting_detail),
             source=self.calender_source,
         )

Committable suggestion skipped: line range outside the PR's diff.


def _parse_classification(self, item):
for classification in CLASSIFICATIONS:
if classification.lower() in item["Meeting_Type"].lower():
return classification
elif "council" in item["Meeting_Type"].lower():
return CITY_COUNCIL
return NOT_CLASSIFIED

msrezaie marked this conversation as resolved.
Show resolved Hide resolved
def _parse_location(self, item):
meeting_location = (
item["Meeting_Location"]
or "1301 Bacharach Boulevard Atlantic City, NJ, 08401"
)

if "-" in meeting_location:
return {
"address": meeting_location.split("-")[1].strip(),
"name": meeting_location.split("-")[0].strip(),
}
else:
return {
"address": meeting_location,
"name": "City Hall of Atlantic City",
}

def _parse_links(self, item):
base_url = "https://www.acnj.gov/"
keys = ["Meeting_AgendaPDF", "Meeting_MinutesPDF", "Meeting_NoticePDF"]
titles = ["Agenda", "Minutes", "Notice"]

links = [
{"title": title, "href": urljoin(base_url, item.get(key, ""))}
msrezaie marked this conversation as resolved.
Show resolved Hide resolved
for title, key in zip(titles, keys)
if item.get(key)
]
return links

def _get_status(self, item):
if item["Meeting_IsCanceled"]:
return CANCELLED
if parse(item["Meeting_DateTime"]) < datetime.now():
return PASSED
return TENTATIVE
msrezaie marked this conversation as resolved.
Show resolved Hide resolved
Loading
Loading