diff --git a/data_collection/gazette/spiders/base/portalgov.py b/data_collection/gazette/spiders/base/portalgov.py
new file mode 100644
index 000000000..5df8b39b8
--- /dev/null
+++ b/data_collection/gazette/spiders/base/portalgov.py
@@ -0,0 +1,71 @@
+import re
+from datetime import datetime as dt
+
+import scrapy
+
+from gazette.items import Gazette
+from gazette.spiders.base import BaseGazetteSpider
+
+# Hoisted so the patterns are built once instead of being looked up per entry.
+EDITION_NUMBER_PATTERN = re.compile(r"\d+")
+EXTRA_EDITION_PATTERN = re.compile(r"extra|supl", re.IGNORECASE)
+
+
+class BasePortalGovSpider(BaseGazetteSpider):
+    """Base spider for sites that list gazettes through ``class_diario.php``.
+
+    Subclasses must set ``domain``, the host used to build both the listing
+    request and every gazette file URL. ``power`` defaults to ``"executive"``
+    and may be overridden per city.
+    """
+
+    power = "executive"
+
+    def start_requests(self):
+        """Ask the controller for the gazette listing with one form request."""
+        yield scrapy.FormRequest(
+            url=f"https://{self.domain}/controllers/diario_oficial/class_diario.php",
+            formdata={
+                "func": "5",
+                "param": "1",
+            },
+        )
+
+    def parse(self, response):
+        """Yield a ``Gazette`` for each listed entry inside the crawl window.
+
+        The response body is expected to be a JSON array of objects with the
+        keys ``data`` (``dd/mm/YYYY``), ``numero``, ``descricao`` and
+        ``arquivo``.
+        """
+        for gazette_data in response.json():
+            gazette_date = dt.strptime(gazette_data["data"], "%d/%m/%Y").date()
+            # Skip instead of returning on the first entry older than
+            # start_date: nothing in the response guarantees that entries are
+            # sorted by date, and stopping early would silently drop gazettes.
+            if not self.start_date <= gazette_date <= self.end_date:
+                continue
+
+            # Tolerate a null or missing text field by treating it as empty.
+            gazette_desc = gazette_data.get("descricao") or ""
+            gazette_edition = gazette_data.get("numero") or ""
+
+            # An edition label without digits must not abort the whole listing.
+            edition_match = EDITION_NUMBER_PATTERN.search(gazette_edition)
+            gazette_edition_number = edition_match.group(0) if edition_match else ""
+
+            # Join with a space so a keyword cannot match across the boundary
+            # between the two fields.
+            is_extra_edition = bool(
+                EXTRA_EDITION_PATTERN.search(f"{gazette_edition} {gazette_desc}")
+            )
+
+            gazette_url = f"https://{self.domain}/arquivos/diario_oficial/{gazette_data['arquivo']}"
+
+            yield Gazette(
+                date=gazette_date,
+                edition_number=gazette_edition_number,
+                file_urls=[gazette_url],
+                is_extra_edition=is_extra_edition,
+                power=self.power,
+            )
diff --git a/data_collection/gazette/spiders/rj/rj_sao_joao_da_barra.py b/data_collection/gazette/spiders/rj/rj_sao_joao_da_barra.py
new file mode 100644
index 000000000..015af5d2e
--- /dev/null
+++ b/data_collection/gazette/spiders/rj/rj_sao_joao_da_barra.py
@@ -0,0 +1,12 @@
+from datetime import date
+
+from gazette.spiders.base.portalgov import BasePortalGovSpider
+
+
+class RjSaoJoaoDaBarraSpider(BasePortalGovSpider):
+    name = "rj_sao_joao_da_barra"
+    TERRITORY_ID = "3305000"
+    allowed_domains = ["sjb.rj.gov.br"]
+    start_date = date(2013, 7, 15)
+    # Host from which BasePortalGovSpider builds the listing and file URLs.
+    domain = "www.sjb.rj.gov.br"
diff --git a/data_collection/gazette/spiders/rj/rj_varre_sai.py b/data_collection/gazette/spiders/rj/rj_varre_sai.py
new file mode 100644
index 000000000..30108c39f
--- /dev/null
+++ b/data_collection/gazette/spiders/rj/rj_varre_sai.py
@@ -0,0 +1,14 @@
+from datetime import date
+
+from gazette.spiders.base.portalgov import BasePortalGovSpider
+
+
+class RjVarreSaiSpider(BasePortalGovSpider):
+    name = "rj_varre_sai"
+    TERRITORY_ID = "3306156"
+    allowed_domains = ["varresai.rj.gov.br"]
+    start_date = date(2019, 9, 21)
+    # Overrides the base spider's default of "executive".
+    power = "executive_legislative"
+    # Host from which BasePortalGovSpider builds the listing and file URLs.
+    domain = "varresai.rj.gov.br"