From d924ff2e93b058d5f11dd5c742e8a3fff6137862 Mon Sep 17 00:00:00 2001 From: AllexLima10 Date: Thu, 22 Aug 2024 11:03:08 -0300 Subject: [PATCH 1/2] #1199 Cria spider para rj_macae --- .../gazette/spiders/rj/rj_macae.py | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 data_collection/gazette/spiders/rj/rj_macae.py diff --git a/data_collection/gazette/spiders/rj/rj_macae.py b/data_collection/gazette/spiders/rj/rj_macae.py new file mode 100644 index 000000000..429cb6820 --- /dev/null +++ b/data_collection/gazette/spiders/rj/rj_macae.py @@ -0,0 +1,53 @@ +import re +from datetime import date, datetime as dt + +import scrapy + +from gazette.items import Gazette +from gazette.spiders.base import BaseGazetteSpider + + +class RjMacaeSpider(BaseGazetteSpider): + TERRITORY_ID = "3302403" + name = "rj_macae" + allowed_domains = ["macae.rj.gov.br"] + start_date = date(2020, 5, 22) + + def start_requests(self): + yield scrapy.FormRequest( + url="https://sistemas.macae.rj.gov.br:840/diariooficial/index/listarajax", + method="POST", + formdata={ + "periodode": self.start_date.strftime("%d/%m/%Y"), + "periodoate": self.end_date.strftime("%d/%m/%Y"), + }, + ) + + def parse(self, response): + for data in response.json()["data"]: + gazette_code = data["DT_RowId"] + gazette_url = f"https://sistemas.macae.rj.gov.br:840/diariooficial/index/download?id={gazette_code}" + + gazette_edition = data["edicao"] + gazette_edition_number = re.search(r"\d+", gazette_edition).group(0) + + raw_gazette_date = data["publicacao"][0:10] + gazette_date = dt.strptime(raw_gazette_date, "%d/%m/%Y").date() + + gazette_item = { + "date": gazette_date, + "edition_number": gazette_edition_number, + "is_extra_edition": "EXTRA" in gazette_edition.upper(), + } + + yield scrapy.Request( + url=gazette_url, + method="HEAD", + callback=self.parse_pdf_url, + cb_kwargs={"gazette_item": gazette_item}, + ) + + def parse_pdf_url(self, response, gazette_item): + yield Gazette( + **gazette_item, file_urls=[response.url], power="executive_legislative" + ) From f023d01ee11a83455e426c46b21514aa3bcc2418 Mon Sep 17 00:00:00 2001 From: Juliana Trevine <44185775+trevineju@users.noreply.github.com> Date: Wed, 25 Sep 2024 15:50:15 -0300 Subject: [PATCH 2/2] Melhora coleta de metadado usando regex Signed-off-by: Juliana Trevine <44185775+trevineju@users.noreply.github.com> --- data_collection/gazette/spiders/rj/rj_macae.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/data_collection/gazette/spiders/rj/rj_macae.py b/data_collection/gazette/spiders/rj/rj_macae.py index 429cb6820..79a0bee31 100644 --- a/data_collection/gazette/spiders/rj/rj_macae.py +++ b/data_collection/gazette/spiders/rj/rj_macae.py @@ -31,7 +31,9 @@ def parse(self, response): gazette_edition = data["edicao"] gazette_edition_number = re.search(r"\d+", gazette_edition).group(0) - raw_gazette_date = data["publicacao"][0:10] + raw_gazette_date = re.search( + r"\d{2}\/\d{2}\/\d{4}", data["publicacao"] + ).group() gazette_date = dt.strptime(raw_gazette_date, "%d/%m/%Y").date() gazette_item = {