From 2c02fb0b643b468c6f9e2c6b6839a4a711aede36 Mon Sep 17 00:00:00 2001 From: trevineju Date: Wed, 8 Jan 2025 21:43:17 -0300 Subject: [PATCH] =?UTF-8?q?Atualiza=20base=20moderniza=C3=A7=C3=A3o?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- data_collection/gazette/spiders/base/modernizacao.py | 9 +++++---- data_collection/gazette/spiders/rj/rj_belford_roxo.py | 2 ++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/data_collection/gazette/spiders/base/modernizacao.py b/data_collection/gazette/spiders/base/modernizacao.py index 00dcef415..6b25fd318 100644 --- a/data_collection/gazette/spiders/base/modernizacao.py +++ b/data_collection/gazette/spiders/base/modernizacao.py @@ -11,6 +11,8 @@ class BaseModernizacaoSpider(BaseGazetteSpider): power = "executive_legislative" ver_subpath = "ver20230623" + filter_endpoint = "diario_oficial_get" + edition_endpoint = "WEB-ObterAnexo.rule" custom_settings = { "CONCURRENT_REQUESTS": 4, @@ -19,7 +21,7 @@ class BaseModernizacaoSpider(BaseGazetteSpider): def start_requests(self): domain = self.allowed_domains[0] - base_url = f"https://{domain}/diario_oficial_get.php" + base_url = f"https://{domain}/{self.filter_endpoint}.php" initial_date = date(self.start_date.year, self.start_date.month, 1) for monthly_date in rrule( @@ -29,20 +31,19 @@ def start_requests(self): yield scrapy.FormRequest( method="GET", url=base_url, - formdata={"mesano": month_year}, + formdata={"mes_ano": month_year}, ) def parse(self, response): for gazette_data in response.json(): raw_gazette_date = gazette_data["Data_Formatada"] - raw_gazette_date gazette_date = datetime.strptime(raw_gazette_date, "%d/%m/%Y").date() if not self.start_date <= gazette_date <= self.end_date: continue gazette_code = gazette_data["Codigo_ANEXO"] gazette_url = response.urljoin( - f"{self.ver_subpath}/WEB-ObterAnexo.rule?sys=LAI&codigo={gazette_code}" + f"{self.ver_subpath}/{self.edition_endpoint}?sys=LAI&codigo={gazette_code}" ) raw_edition_number = gazette_data["ANEXO"] diff --git a/data_collection/gazette/spiders/rj/rj_belford_roxo.py b/data_collection/gazette/spiders/rj/rj_belford_roxo.py index acd3c5e2c..c0ee9f505 100644 --- a/data_collection/gazette/spiders/rj/rj_belford_roxo.py +++ b/data_collection/gazette/spiders/rj/rj_belford_roxo.py @@ -9,3 +9,5 @@ class RjBelfordRoxoSpider(BaseModernizacaoSpider): allowed_domains = ["transparencia.prefeituradebelfordroxo.rj.gov.br"] start_date = date(2019, 1, 2) power = "executive" + edition_endpoint = "WEB-ObterAnexomaior.rule" + filter_endpoint = "diario_oficial_getmaior"