From b6f355ff9a50561490cd6261803f0f48fa5872cd Mon Sep 17 00:00:00 2001 From: trevineju Date: Mon, 10 Jun 2024 11:53:54 -0300 Subject: [PATCH 1/2] =?UTF-8?q?Atualiza=20raspador=20de=20Paul=C3=ADnia-SP?= =?UTF-8?q?=20para=20site=20novo?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../gazette/spiders/sp/sp_paulinia.py | 61 +++---------------- 1 file changed, 7 insertions(+), 54 deletions(-) diff --git a/data_collection/gazette/spiders/sp/sp_paulinia.py b/data_collection/gazette/spiders/sp/sp_paulinia.py index 1ba6d9c35..ab472ba4c 100644 --- a/data_collection/gazette/spiders/sp/sp_paulinia.py +++ b/data_collection/gazette/spiders/sp/sp_paulinia.py @@ -1,58 +1,11 @@ -import datetime +from datetime import date -import scrapy +from gazette.spiders.base.instar import BaseInstarSpider -from gazette.items import Gazette -from gazette.spiders.base import BaseGazetteSpider - -class SpPauliniaSpider(BaseGazetteSpider): - name = "sp_paulinia" +class SpPauliniaSpider(BaseInstarSpider): TERRITORY_ID = "3536505" - start_date = datetime.date(2012, 1, 4) - allowed_domains = ["www.paulinia.sp.gov.br"] - start_urls = ["http://www.paulinia.sp.gov.br/semanarios"] - - def parse(self, response): - years = response.css("div.col-md-1") - - for year in years: - year_to_scrape = int(year.xpath("./a/text()").get()) - - if not (self.start_date.year <= year_to_scrape <= self.end_date.year): - continue - - event_target = year.xpath("./a/@href").re_first(r"(ctl00.*?)',") - - yield scrapy.FormRequest.from_response( - response, - formdata={"__EVENTTARGET": event_target}, - callback=self.parse_year, - ) - - yield from self.parse_year(response) - - def parse_year(self, response): - editions = response.css("div.body-content div.row a[href*='AbreSemanario']") - - for edition in editions: - title = edition.xpath("./text()") - gazette_date = datetime.datetime.strptime( - title.re_first(r"\d{2}/\d{2}/\d{4}"), - "%d/%m/%Y", - ).date() - - if not (self.start_date <= gazette_date <= self.end_date): - continue - - document_href = edition.xpath("./@href").get() - edition_number = title.re_first(r"- (\d+) -") - is_extra_edition = "extra" in title.get().lower() - - yield Gazette( - date=gazette_date, - edition_number=edition_number, - file_urls=[response.urljoin(document_href)], - is_extra_edition=is_extra_edition, - power="executive", - ) + name = "sp_paulinia" + allowed_domains = ["paulinia.sp.gov.br"] + base_url = "https://www.paulinia.sp.gov.br/portal/diario-oficial" + start_date = date(2012, 1, 4) From 565ea52988ede1571f446a31b165b1e04deafff9 Mon Sep 17 00:00:00 2001 From: trevineju Date: Mon, 10 Jun 2024 10:51:47 -0300 Subject: [PATCH 2/2] =?UTF-8?q?Adiciona=20vers=C3=A3o=20atual=20de=20Pau?= =?UTF-8?q?=20dos=20Ferros-RN?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...rn_pau_dos_ferros.py => rn_pau_dos_ferros_2017.py} | 3 ++- .../gazette/spiders/rn/rn_pau_dos_ferros_2022.py | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) rename data_collection/gazette/spiders/rn/{rn_pau_dos_ferros.py => rn_pau_dos_ferros_2017.py} (95%) create mode 100644 data_collection/gazette/spiders/rn/rn_pau_dos_ferros_2022.py diff --git a/data_collection/gazette/spiders/rn/rn_pau_dos_ferros.py b/data_collection/gazette/spiders/rn/rn_pau_dos_ferros_2017.py similarity index 95% rename from data_collection/gazette/spiders/rn/rn_pau_dos_ferros.py rename to data_collection/gazette/spiders/rn/rn_pau_dos_ferros_2017.py index 5cbfadaca..3bfcdad45 100644 --- a/data_collection/gazette/spiders/rn/rn_pau_dos_ferros.py +++ b/data_collection/gazette/spiders/rn/rn_pau_dos_ferros_2017.py @@ -8,9 +8,10 @@ class RnPauDosFerrosSpider(BaseGazetteSpider): - name = "rn_pau_dos_ferros" + name = "rn_pau_dos_ferros_2017" allowed_domains = ["paudosferros.rn.gov.br"] start_date = datetime.date(2017, 1, 2) + end_date = datetime.date(2022, 9, 28) TERRITORY_ID = "2409407" start_urls = ["https://paudosferros.rn.gov.br/publicacoes.php?grupo=&cat=11"] diff --git a/data_collection/gazette/spiders/rn/rn_pau_dos_ferros_2022.py b/data_collection/gazette/spiders/rn/rn_pau_dos_ferros_2022.py new file mode 100644 index 000000000..eb9eccab9 --- /dev/null +++ b/data_collection/gazette/spiders/rn/rn_pau_dos_ferros_2022.py @@ -0,0 +1,11 @@ +from datetime import date + +from gazette.spiders.base.adiarios_v1 import BaseAdiariosV1Spider + + +class RnPauDosFerrosSpider(BaseAdiariosV1Spider): + TERRITORY_ID = "2409407" + name = "rn_pau_dos_ferros_2022" + allowed_domains = ["paudosferros.rn.gov.br"] + BASE_URL = "https://www.paudosferros.rn.gov.br" + start_date = date(2022, 9, 28)