From d0ad000fd15632483624c8d031853f8838dbe8df Mon Sep 17 00:00:00 2001
From: trevineju
Date: Wed, 17 Apr 2024 17:15:36 -0300
Subject: [PATCH] fix: Update Uberlândia-MG scraper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Gazette links on the city website no longer point directly at the PDF
file: they lead to an intermediary page that redirects through a
JavaScript `location="...";` assignment. Follow that page in a new
`intermediary_page` callback and extract the final file URL with a
regex, passing the already-parsed metadata along via `cb_kwargs`. Also
drop the Zyte Smart Proxy flag, restrict crawling to
uberlandia.mg.gov.br, and pin a browser User-Agent.

---
 .../gazette/spiders/mg/mg_uberlandia.py       | 35 ++++++++++++++-----
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/data_collection/gazette/spiders/mg/mg_uberlandia.py b/data_collection/gazette/spiders/mg/mg_uberlandia.py
index 084898ab5..b100800dc 100644
--- a/data_collection/gazette/spiders/mg/mg_uberlandia.py
+++ b/data_collection/gazette/spiders/mg/mg_uberlandia.py
@@ -1,4 +1,5 @@
 import datetime
+import re
 
 import dateparser
 import scrapy
@@ -10,11 +11,14 @@
 
 
 class MgUberlandiaSpider(BaseGazetteSpider):
-    zyte_smartproxy_enabled = True
-
     TERRITORY_ID = "3170206"
     name = "mg_uberlandia"
     start_date = datetime.date(2005, 1, 3)
+    allowed_domains = ["uberlandia.mg.gov.br"]
+
+    custom_settings = {
+        "USER_AGENT": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
+    }
 
     def start_requests(self):
         first_day_of_start_date_month = datetime.date(
@@ -54,15 +58,28 @@ def parse(self, response):
             edition_number = edition.re_first(r"(\d+)")
             is_extra_edition = bool(edition.re(r"\d+.*?([A-Za-z]+)"))
 
-            gazette_url = gazette.css("a::attr(href)").get()
+            intermediary_page_url = gazette.css("a::attr(href)").get()
 
-            yield Gazette(
-                date=gazette_date,
-                edition_number=edition_number,
-                is_extra_edition=is_extra_edition,
-                file_urls=[gazette_url],
-                power="executive",
+            gazette_item = {
+                "date": gazette_date,
+                "edition_number": edition_number,
+                "is_extra_edition": is_extra_edition,
+            }
+
+            yield scrapy.Request(
+                intermediary_page_url,
+                callback=self.intermediary_page,
+                cb_kwargs={"gazette_item": gazette_item},
             )
 
         for page_url in response.css("nav a.page-numbers::attr(href)").getall():
             yield scrapy.Request(page_url)
+
+    def intermediary_page(self, response, gazette_item):
+        gazette_url = re.search(r'location="(.*)";', response.text).group(1)
+
+        yield Gazette(
+            **gazette_item,
+            file_urls=[gazette_url],
+            power="executive",
+        )
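
Note for reviewers: the snippet below is a minimal standalone sketch of
the new download flow, not part of the patch. It shows how the regex
used in `intermediary_page` pulls the final PDF URL out of the
JavaScript redirect served by the intermediary page; the sample HTML
body and URL are illustrative assumptions, not a real gazette response.

    import re

    # Example intermediary page body; the URL is made up for illustration.
    html = '<script>location="https://uberlandia.mg.gov.br/exemplo/diario.pdf";</script>'

    # Same pattern the spider uses to extract the redirect target.
    match = re.search(r'location="(.*)";', html)
    if match:  # guard added here; the patch assumes the redirect is always present
        gazette_url = match.group(1)
        print(gazette_url)  # https://uberlandia.mg.gov.br/exemplo/diario.pdf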