Skip to content

Commit

Permalink
Atualiza base modernização para dar manutenção no raspador de Belford Roxo-RJ (#1347)
Browse files Browse the repository at this point in the history

resolve #1346
  • Loading branch information
trevineju authored Jan 9, 2025
2 parents 041dd8a + 2c02fb0 commit a34ba6b
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 4 deletions.
9 changes: 5 additions & 4 deletions data_collection/gazette/spiders/base/modernizacao.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
class BaseModernizacaoSpider(BaseGazetteSpider):
power = "executive_legislative"
ver_subpath = "ver20230623"
filter_endpoint = "diario_oficial_get"
edition_endpoint = "WEB-ObterAnexo.rule"

custom_settings = {
"CONCURRENT_REQUESTS": 4,
Expand All @@ -19,7 +21,7 @@ class BaseModernizacaoSpider(BaseGazetteSpider):

def start_requests(self):
domain = self.allowed_domains[0]
base_url = f"https://{domain}/diario_oficial_get.php"
base_url = f"https://{domain}/{self.filter_endpoint}.php"
initial_date = date(self.start_date.year, self.start_date.month, 1)

for monthly_date in rrule(
Expand All @@ -29,20 +31,19 @@ def start_requests(self):
yield scrapy.FormRequest(
method="GET",
url=base_url,
formdata={"mesano": month_year},
formdata={"mes_ano": month_year},
)

def parse(self, response):
for gazette_data in response.json():
raw_gazette_date = gazette_data["Data_Formatada"]
raw_gazette_date
gazette_date = datetime.strptime(raw_gazette_date, "%d/%m/%Y").date()
if not self.start_date <= gazette_date <= self.end_date:
continue

gazette_code = gazette_data["Codigo_ANEXO"]
gazette_url = response.urljoin(
f"{self.ver_subpath}/WEB-ObterAnexo.rule?sys=LAI&codigo={gazette_code}"
f"{self.ver_subpath}/{self.edition_endpoint}?sys=LAI&codigo={gazette_code}"
)

raw_edition_number = gazette_data["ANEXO"]
Expand Down
2 changes: 2 additions & 0 deletions data_collection/gazette/spiders/rj/rj_belford_roxo.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,5 @@ class RjBelfordRoxoSpider(BaseModernizacaoSpider):
allowed_domains = ["transparencia.prefeituradebelfordroxo.rj.gov.br"]
start_date = date(2019, 1, 2)
power = "executive"
edition_endpoint = "WEB-ObterAnexomaior.rule"
filter_endpoint = "diario_oficial_getmaior"

0 comments on commit a34ba6b

Please sign in to comment.