Commit b4d351f
Updates scrapers for Pau dos Ferros-RN and Paulínia-SP (#1159)
trevineju authored Jun 12, 2024
2 parents 28b4b78 + 565ea52 commit b4d351f
Showing 3 changed files with 20 additions and 55 deletions.
3 changes: 2 additions & 1 deletion data_collection/gazette/spiders/rn/rn_pau_dos_ferros_2017.py
@@ -8,9 +8,10 @@
 
 
 class RnPauDosFerrosSpider(BaseGazetteSpider):
-    name = "rn_pau_dos_ferros"
+    name = "rn_pau_dos_ferros_2017"
     allowed_domains = ["paudosferros.rn.gov.br"]
     start_date = datetime.date(2017, 1, 2)
+    end_date = datetime.date(2022, 9, 28)
     TERRITORY_ID = "2409407"
     start_urls = ["https://paudosferros.rn.gov.br/publicacoes.php?grupo=&cat=11"]
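The rename to rn_pau_dos_ferros_2017 and the new end_date cap this spider at the exact date where the new 2022 spider (added below) takes over. A minimal sanity-check sketch using only the dates from this diff; the constant names are illustrative, not part of the commit:

# Illustrative check that the two Pau dos Ferros spiders meet at the
# handover date, leaving no gap in coverage.
import datetime

END_2017_SPIDER = datetime.date(2022, 9, 28)    # end_date added above
START_2022_SPIDER = datetime.date(2022, 9, 28)  # start_date of the new spider

assert START_2022_SPIDER <= END_2017_SPIDER, "coverage gap between spiders"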
11 changes: 11 additions & 0 deletions data_collection/gazette/spiders/rn/rn_pau_dos_ferros_2022.py
@@ -0,0 +1,11 @@
+from datetime import date
+
+from gazette.spiders.base.adiarios_v1 import BaseAdiariosV1Spider
+
+
+class RnPauDosFerrosSpider(BaseAdiariosV1Spider):
+    TERRITORY_ID = "2409407"
+    name = "rn_pau_dos_ferros_2022"
+    allowed_domains = ["paudosferros.rn.gov.br"]
+    BASE_URL = "https://www.paudosferros.rn.gov.br"
+    start_date = date(2022, 9, 28)
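All crawling and parsing logic lives in BaseAdiariosV1Spider, a shared base class (for what the module name suggests is the ADiarios v1 portal layout); the subclass only declares identifiers, the portal URL, and the start date. A hedged sketch of running the new spider programmatically (the project is normally driven with scrapy crawl rn_pau_dos_ferros_2022; this assumes data_collection/ is on the Python path and uses Scrapy's public CrawlerProcess API):

# Sketch only: run the spider outside the usual `scrapy crawl` flow.
from scrapy.crawler import CrawlerProcess

from gazette.spiders.rn.rn_pau_dos_ferros_2022 import RnPauDosFerrosSpider

process = CrawlerProcess()  # Scrapy defaults; project settings omitted
process.crawl(RnPauDosFerrosSpider)
process.start()  # blocks until the crawl finishes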
61 changes: 7 additions & 54 deletions data_collection/gazette/spiders/sp/sp_paulinia.py
@@ -1,58 +1,11 @@
-import datetime
+from datetime import date
 
-import scrapy
+from gazette.spiders.base.instar import BaseInstarSpider
 
-from gazette.items import Gazette
-from gazette.spiders.base import BaseGazetteSpider
 
-
-class SpPauliniaSpider(BaseGazetteSpider):
-    name = "sp_paulinia"
+class SpPauliniaSpider(BaseInstarSpider):
     TERRITORY_ID = "3536505"
-    start_date = datetime.date(2012, 1, 4)
-    allowed_domains = ["www.paulinia.sp.gov.br"]
-    start_urls = ["http://www.paulinia.sp.gov.br/semanarios"]
-
-    def parse(self, response):
-        years = response.css("div.col-md-1")
-
-        for year in years:
-            year_to_scrape = int(year.xpath("./a/text()").get())
-
-            if not (self.start_date.year <= year_to_scrape <= self.end_date.year):
-                continue
-
-            event_target = year.xpath("./a/@href").re_first(r"(ctl00.*?)',")
-
-            yield scrapy.FormRequest.from_response(
-                response,
-                formdata={"__EVENTTARGET": event_target},
-                callback=self.parse_year,
-            )
-
-        yield from self.parse_year(response)
-
-    def parse_year(self, response):
-        editions = response.css("div.body-content div.row a[href*='AbreSemanario']")
-
-        for edition in editions:
-            title = edition.xpath("./text()")
-            gazette_date = datetime.datetime.strptime(
-                title.re_first(r"\d{2}/\d{2}/\d{4}"),
-                "%d/%m/%Y",
-            ).date()
-
-            if not (self.start_date <= gazette_date <= self.end_date):
-                continue
-
-            document_href = edition.xpath("./@href").get()
-            edition_number = title.re_first(r"- (\d+) -")
-            is_extra_edition = "extra" in title.get().lower()
-
-            yield Gazette(
-                date=gazette_date,
-                edition_number=edition_number,
-                file_urls=[response.urljoin(document_href)],
-                is_extra_edition=is_extra_edition,
-                power="executive",
-            )
+    name = "sp_paulinia"
+    allowed_domains = ["paulinia.sp.gov.br"]
+    base_url = "https://www.paulinia.sp.gov.br/portal/diario-oficial"
+    start_date = date(2012, 1, 4)
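The migration drops the bespoke ASP.NET postback navigation (the __EVENTTARGET form requests) and per-edition link parsing in favor of BaseInstarSpider, the project's shared base for Instar-hosted portals; the subclass now only declares identifiers, the portal URL, and the start date. For reference, the deleted parser pulled the date, edition number, and extra-edition flag out of each link's text. A standalone re-creation of that logic; the sample title string is hypothetical, shaped the way the regexes expect:

# Standalone re-creation of the removed parsing logic. The sample
# title is hypothetical; real link text came from the old portal.
import datetime
import re

title = "Semanário de Paulínia - 1361 - 04/01/2012"  # hypothetical sample

gazette_date = datetime.datetime.strptime(
    re.search(r"\d{2}/\d{2}/\d{4}", title).group(), "%d/%m/%Y"
).date()
edition_number = re.search(r"- (\d+) -", title).group(1)
is_extra_edition = "extra" in title.lower()

print(gazette_date, edition_number, is_extra_edition)  # 2012-01-04 1361 False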
