"""Spider for the official gazette listing of Angra dos Reis (RJ).

This is the module as it stands after both commits of the #1191 patch
series are applied (spider added, then review suggestions applied), with
defensive handling added for incomplete table rows.
"""
from datetime import date, datetime as dt

import scrapy

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class RjAngraDosReisSpider(BaseGazetteSpider):
    """Collect gazettes from the yearly listing pages on angra.rj.gov.br.

    One request is issued per year between ``start_date`` and ``end_date``;
    every table row of each listing page that carries a date and a link is
    turned into a ``Gazette`` item.
    """

    name = "rj_angra_dos_reis"
    TERRITORY_ID = "3300100"
    allowed_domains = ["angra.rj.gov.br"]
    # Earliest date this spider collects.
    start_date = date(2005, 3, 11)

    def start_requests(self):
        """Yield one listing-page request per year in the collection window.

        NOTE(review): ``self.end_date`` is not defined in this module; it is
        presumably provided by ``BaseGazetteSpider`` -- confirm.
        """
        for year in range(self.start_date.year, self.end_date.year + 1):
            yield scrapy.Request(
                f"https://angra.rj.gov.br/boletim-oficial.asp?vAno={year}"
            )

    def parse(self, response):
        """Yield a ``Gazette`` for each usable row of a yearly listing page.

        Rows without a date, with an unparseable date, or without a link are
        skipped instead of aborting the whole page.
        """
        # The first <tr> is skipped, as in the original code (presumably the
        # table header -- confirm against the live page).
        for tr in response.xpath("//article//tr")[1:]:
            raw_gazette_date = tr.xpath("./td/strong/text()").get()
            if not raw_gazette_date:
                # No date cell: strptime(None, ...) would raise TypeError.
                continue

            try:
                gazette_date = dt.strptime(
                    raw_gazette_date.strip(), "%d/%m/%Y"
                ).date()
            except ValueError:
                self.logger.warning(
                    "Skipping row with unparseable date %r at %s",
                    raw_gazette_date,
                    response.url,
                )
                continue

            if gazette_date > self.end_date:
                continue
            if gazette_date < self.start_date:
                # NOTE(review): stopping here assumes rows are listed from
                # newest to oldest -- confirm against the live page.
                return

            raw_gazette_edition = tr.xpath("./td/text()")
            gazette_edition_number = raw_gazette_edition.re_first(r"\d+") or ""

            url_subdir = tr.xpath(".//a/@href").get()
            if not url_subdir:
                # urljoin(None) would return the listing page URL itself,
                # which is not a gazette file.
                self.logger.warning(
                    "Skipping row dated %s without a file link at %s",
                    gazette_date,
                    response.url,
                )
                continue
            gazette_url = response.urljoin(url_subdir)

            # .get() returns None when the row has no bare text node; fall
            # back to "" so the concatenation below cannot raise TypeError.
            edition_text = raw_gazette_edition.get() or ""
            is_extra_edition = "EXTRA" in (edition_text + url_subdir).upper()

            yield Gazette(
                date=gazette_date,
                edition_number=gazette_edition_number,
                is_extra_edition=is_extra_edition,
                file_urls=[gazette_url],
                power="executive_legislative",
            )