Skip to content

Commit

Permalink
spider for Atlantic County Commissioners
Browse files Browse the repository at this point in the history
  • Loading branch information
msrezaie committed Dec 13, 2024
1 parent 85c2a84 commit f8d8b6c
Show file tree
Hide file tree
Showing 3 changed files with 3,548 additions and 0 deletions.
100 changes: 100 additions & 0 deletions city_scrapers/spiders/atconj_County_Commission.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import json

import scrapy
from city_scrapers_core.constants import BOARD
from city_scrapers_core.items import Meeting
from city_scrapers_core.spiders import CityScrapersSpider
from dateutil.parser import parse as dateparse
from scrapy import Selector


class AtconjCountyCommissionSpider(CityScrapersSpider):
name = "atconj_County_Commission"
agency = "Atlantic County Board of County Commissioners"
timezone = "America/New_York"

original_url = "https://www.atlanticcountynj.gov"
meetings_url = "https://www.atlanticcountynj.gov/government/county-government/board-of-county-commissioners/meeting-schedule-agendas-and-minutes/-toggle-all" # noqa

custom_settings = {
"ROBOTSTXT_OBEY": False,
}

default_location = {
"name": "Stillwater Building",
"address": "Stillwater Building, 201 S. Shore Road Northfield, New Jersey 08225", # noqa
}

"""
This website would return a 403 error if the request
is made with the default headers. The headers below
are the ones that are needed to make the request.
"""

def start_requests(self):
headers = {
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "same-origin",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Mobile Safari/537.36", # noqa
}

yield scrapy.Request(
url=self.meetings_url, headers=headers, callback=self.parse
)

def parse(self, response):
rows = response.css("table.front_end_widget tbody tr").getall()
scripts = response.css("table.front_end_widget tbody script::text").getall()
time_note = response.css("div#widget_45_2122_758 p::text").get()

for row_item, script_item in zip(rows, scripts):
content = json.loads(script_item)

meeting = Meeting(
title="Atlantic County Board of County Commissioners",
description="",
classification=BOARD,
start=self._parse_start(content),
end=self._parse_end(content),
all_day=False,
time_notes=time_note.strip(),
location=self._parse_location(content),
links=self._parse_links(row_item),
source=response.url,
)

meeting["status"] = self._get_status(meeting)
meeting["id"] = self._get_id(meeting)

yield meeting

def _parse_start(self, item):
"""Parse start datetime as a naive datetime object."""
return dateparse(item["startDate"]).astimezone(tz=None).replace(tzinfo=None)

def _parse_end(self, item):
"""Parse end datetime as a naive datetime object."""
return dateparse(item["endDate"]).astimezone(tz=None).replace(tzinfo=None)

def _parse_location(self, item):
location = item["location"]
if not location["address"] or not location["name"]:
return self.default_location
return {"name": location["name"], "address": location["address"]}

def _parse_links(self, item):
links = []

item = Selector(text=item)
agenda = item.css("td.event_agenda a").get()
minutes = item.css("td.event_minutes a").get()
if agenda:
agenda = agenda.split('href="')[1].split('"')[0]
links.append({"href": self.original_url + agenda, "title": "Agenda"})
if minutes:
minutes = minutes.split('href="')[1].split('"')[0]
links.append({"href": self.original_url + minutes, "title": "Minutes"})
return links
Loading

0 comments on commit f8d8b6c

Please sign in to comment.