-
-
Notifications
You must be signed in to change notification settings - Fork 415
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Melhora o desempenho da coleta para Sistema Replicavel DOSP (#888)
- Loading branch information
Showing
3 changed files
with
51 additions
and
59 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,53 +1,42 @@ | ||
import base64 | ||
import datetime | ||
import json | ||
import re | ||
from base64 import b64encode | ||
from datetime import datetime | ||
from json import loads | ||
|
||
import scrapy | ||
from dateutil.rrule import WEEKLY, rrule | ||
|
||
from gazette.items import Gazette | ||
from gazette.spiders.base import BaseGazetteSpider | ||
|
||
|
||
class DospGazetteSpider(BaseGazetteSpider): | ||
allowed_domains = ["dosp.com.br", "imprensaoficialmunicipal.com.br"] | ||
|
||
# Must be defined into child classes | ||
code = None | ||
start_date = None | ||
|
||
def _dosp_request(self, start_date, end_date): | ||
for date in rrule(freq=WEEKLY, dtstart=start_date, until=end_date): | ||
from_date = date.strftime("%Y-%m-%d") | ||
to_date = date + datetime.timedelta(days=6) | ||
to_date = to_date.strftime("%Y-%m-%d") | ||
|
||
yield scrapy.Request( | ||
f"https://dosp.com.br/api/index.php/dioedata.js/{self.code}/{from_date}/{to_date}?callback=dioe" | ||
) | ||
allowed_domains = ["dosp.com.br"] | ||
|
||
def start_requests(self): | ||
yield from self._dosp_request(self.start_date, self.end_date) | ||
yield scrapy.Request(f"https://dosp.com.br/api/index.php/dioe.js/{self.code}") | ||
|
||
def parse(self, response): | ||
# The response are in a javascript format, then needs some clean up | ||
data = json.loads(response.text[6:-2]) | ||
|
||
for item in data["data"]: | ||
code = item["iddo"] | ||
code = str(code).encode("ascii") | ||
pdf_code = base64.b64encode(code).decode("ascii") | ||
file_url = f"https://dosp.com.br/exibe_do.php?i={pdf_code}" | ||
date = datetime.datetime.strptime(item["data"], "%Y-%m-%d").date() | ||
raw_edition_number = re.search(r"(\d+)([a-zA-Z]*)", item["edicao_do"]) | ||
edition_number = raw_edition_number.group(1) | ||
|
||
if self.start_date <= date <= self.end_date: | ||
json_text = ( | ||
response.css("p::text").get().replace("parseResponse(", "") | ||
).replace(");", "") | ||
|
||
json_text = loads(json_text) | ||
|
||
for diarios in json_text["data"]: | ||
data = datetime.strptime(diarios["data"], "%Y-%m-%d").date() | ||
code_link = str(diarios["iddo"]).encode("ascii") | ||
code_link = b64encode(code_link).decode("ascii") | ||
|
||
if self.start_date <= data <= self.end_date: | ||
yield Gazette( | ||
date=date, | ||
file_urls=[file_url], | ||
edition_number=edition_number, | ||
power="executive_legislative", | ||
is_extra_edition=raw_edition_number.group(2) != "", | ||
date=data, | ||
edition_number=diarios["edicao_do"], | ||
file_urls=[ | ||
f"https://dosp.com.br/exibe_do.php?i={code_link}.pdf", | ||
], | ||
is_extra_edition=diarios["flag_extra"] > 0, | ||
power="executive", | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
import datetime as dt | ||
|
||
from gazette.spiders.base.dosp import DospGazetteSpider | ||
|
||
|
||
class MgUberabaSpider(DospGazetteSpider): | ||
TERRITORY_ID = "3170107" | ||
name = "mg_uberaba_2021" | ||
code = 2364 | ||
start_date = dt.date(2021, 9, 1) |