Fix CivPlus bugs #176 #189 #190

Merged
merged 8 commits on Sep 6, 2024
4 changes: 3 additions & 1 deletion .github/workflows/continuous-deployment.yml
@@ -117,7 +117,9 @@ jobs:
 
 - id: build
   name: Build release
-  run: make build-release
+  run: |
+    pipenv run pip install setuptools-scm>=8.1.0 --force-reinstall --upgrade
+    make build-release
 
 - id: check
   name: Check release
2 changes: 1 addition & 1 deletion civic_scraper/base/asset.py
@@ -45,7 +45,7 @@ def __init__(
         meeting_id: str = None,
         scraped_by: str = None,
         content_type: str = None,
-        content_length: str = None
+        content_length: str = None,
     ) -> None:
         self.url = url
         self.asset_name = asset_name
6 changes: 3 additions & 3 deletions civic_scraper/platforms/civic_clerk/site.py
@@ -25,9 +25,9 @@ def __init__(self, url, place=None, state_or_province=None, cache=Cache()):
         self.cache = cache
 
         self.session = Session()
-        self.session.headers[
-            "User-Agent"
-        ] = "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"
+        self.session.headers["User-Agent"] = (
+            "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"
+        )
 
         # Raise an error if a request gets a failing status code
         self.session.hooks = {
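The hunk ends just as `self.session.hooks` is assigned, so the hook body itself is cut off by the diff. For context, here is a minimal sketch of the usual `requests` response-hook pattern the comment describes; the exact hook used by the scraper is assumed, not shown in this PR:

```python
from requests import Session

session = Session()
session.headers["User-Agent"] = (
    "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"
)

# Run raise_for_status() on every response so failing status codes
# surface as requests.HTTPError instead of silently returning bad HTML.
session.hooks = {
    "response": lambda response, *args, **kwargs: response.raise_for_status()
}
```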
17 changes: 14 additions & 3 deletions civic_scraper/platforms/civic_plus/parser.py
@@ -38,6 +38,9 @@ def file_links_with_no_title(tag):
         )
 
         metadata = []
+        # Links often appear twice (once under meeting title, once in download menu)
+        # so we track which we've already seen to avoid duplicate entries
+        bookkeeping = set()
         for div in divs:
             cmte_name = self._committee_name(div)
             # Line-item data for each meeting is inside table rows.
@@ -52,6 +55,9 @@
                 # Skip links to page listing previous agenda versions
                 if self._previous_version_link(link):
                     continue
+                # Skip previously harvested links
+                if link["href"] in bookkeeping:
+                    continue
                 metadata.append(
                     {
                         "committee_name": cmte_name,
@@ -63,13 +69,18 @@
                         "asset_type": self._asset_type(link["href"]),
                     }
                 )
+                bookkeeping.add(link["href"])
         return metadata
 
     def _committee_name(self, div):
-        # Remove span that contains
+        # If present, remove span that contains
         # arrow ▼ for toggling meeting list
-        div.h2.span.extract()
-        return div.h2.text.strip()
+        try:
+            div.h2.span.extract()
+        except AttributeError:
+            pass
+        header_node = div.h2 or div.h3
+        return header_node.text.strip()
 
     def _mtg_title(self, row):
         return row.p.text.strip()
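Taken together, the parser changes guard against two CivicPlus quirks: the same document link appearing more than once on a page, and meeting headers that lack the toggle span or use an h3 instead of an h2. A standalone sketch of that pattern with BeautifulSoup, using made-up HTML rather than the scraper's real markup:

```python
from bs4 import BeautifulSoup

html = """
<div><h2><span>▼</span>City Council</h2>
  <a href="/doc1.pdf">Agenda</a><a href="/doc1.pdf">Agenda</a></div>
<div><h3>Planning Commission</h3><a href="/doc2.pdf">Minutes</a></div>
"""
soup = BeautifulSoup(html, "html.parser")

seen = set()  # hrefs already harvested, so duplicate links are dropped
records = []
for div in soup.find_all("div"):
    # Remove the toggle arrow span if present; fall back to <h3> when
    # the section has no <h2> header.
    try:
        div.h2.span.extract()
    except AttributeError:
        pass
    header = div.h2 or div.h3
    name = header.text.strip()
    for link in div.find_all("a"):
        if link["href"] in seen:
            continue
        records.append({"committee_name": name, "url": link["href"]})
        seen.add(link["href"])

print(records)
# [{'committee_name': 'City Council', 'url': '/doc1.pdf'},
#  {'committee_name': 'Planning Commission', 'url': '/doc2.pdf'}]
```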
4 changes: 3 additions & 1 deletion civic_scraper/platforms/civic_plus/site.py
@@ -29,7 +29,9 @@ def __init__(self, base_url, cache=Cache(), parser_kls=Parser, place_name=None):
 
     @property
     def place(self):
-        return self.place_name or self._get_asset_metadata(r"(?<=-)\w+(?=\.)", self.base_url)
+        return self.place_name or self._get_asset_metadata(
+            r"(?<=-)\w+(?=\.)", self.base_url
+        )
 
     def scrape(
         self,
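The wrapped line keeps the same regex as before; it extracts the place slug from the CivicPlus subdomain when no `place_name` is supplied. A quick illustration against a made-up URL (the URL and the direct `re.search` call are assumptions; the scraper routes the pattern through `_get_asset_metadata`):

```python
import re

# Hypothetical CivicPlus AgendaCenter URL; sites typically follow the
# "<state>-<place>.civicplus.com" naming scheme this regex relies on.
base_url = "https://ca-exampletown.civicplus.com/AgendaCenter"

# Lookbehind for "-" and lookahead for "." isolate the place slug.
match = re.search(r"(?<=-)\w+(?=\.)", base_url)
print(match.group(0))  # exampletown
```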
2 changes: 1 addition & 1 deletion civic_scraper/platforms/legistar/site.py
@@ -105,7 +105,7 @@ def _create_asset(self, event, meeting_meta, asset_type):
         name_bits.append(asset_type)
         kwargs = {
             "url": event[asset_type]["url"],
-            "asset_type": asset_type.lower().replace(' ', '_'),
+            "asset_type": asset_type.lower().replace(" ", "_"),
             "asset_name": " - ".join(name_bits),
             "content_type": None,
             "content_length": None,
6 changes: 3 additions & 3 deletions civic_scraper/platforms/primegov/site.py
@@ -27,9 +27,9 @@ def __init__(self, url, place=None, state_or_province=None, cache=Cache()):
         self.cache = cache
 
         self.session = Session()
-        self.session.headers[
-            "User-Agent"
-        ] = "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"
+        self.session.headers["User-Agent"] = (
+            "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"
+        )
 
         # Raise an error if a request gets a failing status code
         self.session.hooks = {
1 change: 1 addition & 0 deletions scripts/generate_civicplus_sites.py
@@ -20,6 +20,7 @@
 "/Users/amydipierro/GitHub/test.csv"
 
 """
+
 import csv
 import re
 
1 change: 1 addition & 0 deletions scripts/run_scraper.py
@@ -23,6 +23,7 @@
 path/to/target.csv \
 --scraper_args '{"start_date": "2015-09-09", "end_date": "2015-10-14"}'
 """
+
 from civic_scraper.scrapers import SUPPORTED_SITES
 
 