Commit
Improves collection performance for the DOSP Replicable System (#888)
trevineju authored May 22, 2024
2 parents 0edf4a4 + b3d6d25 commit 23f9806
Showing 3 changed files with 51 additions and 59 deletions.
59 changes: 24 additions & 35 deletions data_collection/gazette/spiders/base/dosp.py
@@ -1,53 +1,42 @@
-import base64
-import datetime
-import json
-import re
+from base64 import b64encode
+from datetime import datetime
+from json import loads
 
 import scrapy
-from dateutil.rrule import WEEKLY, rrule
 
 from gazette.items import Gazette
 from gazette.spiders.base import BaseGazetteSpider
 
 
 class DospGazetteSpider(BaseGazetteSpider):
-    allowed_domains = ["dosp.com.br", "imprensaoficialmunicipal.com.br"]
+    allowed_domains = ["dosp.com.br"]
 
     # Must be defined into child classes
     code = None
     start_date = None
 
-    def _dosp_request(self, start_date, end_date):
-        for date in rrule(freq=WEEKLY, dtstart=start_date, until=end_date):
-            from_date = date.strftime("%Y-%m-%d")
-            to_date = date + datetime.timedelta(days=6)
-            to_date = to_date.strftime("%Y-%m-%d")
-
-            yield scrapy.Request(
-                f"https://dosp.com.br/api/index.php/dioedata.js/{self.code}/{from_date}/{to_date}?callback=dioe"
-            )
-
     def start_requests(self):
-        yield from self._dosp_request(self.start_date, self.end_date)
+        yield scrapy.Request(f"https://dosp.com.br/api/index.php/dioe.js/{self.code}")
 
     def parse(self, response):
-        # The response are in a javascript format, then needs some clean up
-        data = json.loads(response.text[6:-2])
-
-        for item in data["data"]:
-            code = item["iddo"]
-            code = str(code).encode("ascii")
-            pdf_code = base64.b64encode(code).decode("ascii")
-            file_url = f"https://dosp.com.br/exibe_do.php?i={pdf_code}"
-            date = datetime.datetime.strptime(item["data"], "%Y-%m-%d").date()
-            raw_edition_number = re.search(r"(\d+)([a-zA-Z]*)", item["edicao_do"])
-            edition_number = raw_edition_number.group(1)
-
-            if self.start_date <= date <= self.end_date:
+        json_text = (
+            response.css("p::text").get().replace("parseResponse(", "")
+        ).replace(");", "")
+
+        json_text = loads(json_text)
+
+        for diarios in json_text["data"]:
+            data = datetime.strptime(diarios["data"], "%Y-%m-%d").date()
+            code_link = str(diarios["iddo"]).encode("ascii")
+            code_link = b64encode(code_link).decode("ascii")
+
+            if self.start_date <= data <= self.end_date:
                 yield Gazette(
-                    date=date,
-                    file_urls=[file_url],
-                    edition_number=edition_number,
-                    power="executive_legislative",
-                    is_extra_edition=raw_edition_number.group(2) != "",
+                    date=data,
+                    edition_number=diarios["edicao_do"],
+                    file_urls=[
+                        f"https://dosp.com.br/exibe_do.php?i={code_link}.pdf",
+                    ],
+                    is_extra_edition=diarios["flag_extra"] > 0,
+                    power="executive",
                 )
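Note on the change above: instead of issuing one dioedata.js request per week of the date range, the spider now fetches the full gazette list with a single dioe.js request and filters by date locally, which is where the performance gain comes from. Below is a minimal standalone sketch (not part of the commit) of the new parsing flow; the JSONP payload is invented sample data in the shape the spider expects:

from base64 import b64encode
from datetime import datetime
from json import loads

# Invented sample response from https://dosp.com.br/api/index.php/dioe.js/<code>
raw = 'parseResponse({"data": [{"iddo": 123456, "data": "2024-05-20", "edicao_do": "3301", "flag_extra": 0}]});'

# Strip the JavaScript callback wrapper to recover plain JSON
payload = loads(raw.replace("parseResponse(", "").replace(");", ""))

for diario in payload["data"]:
    date = datetime.strptime(diario["data"], "%Y-%m-%d").date()
    # The PDF id is the base64-encoded gazette id ("iddo")
    pdf_id = b64encode(str(diario["iddo"]).encode("ascii")).decode("ascii")
    print(date, f"https://dosp.com.br/exibe_do.php?i={pdf_id}.pdf")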
41 changes: 17 additions & 24 deletions data_collection/gazette/spiders/mg/mg_uberaba_2003.py
@@ -3,36 +3,21 @@
 from scrapy import FormRequest
 
 from gazette.items import Gazette
-from gazette.spiders.base.dosp import DospGazetteSpider
+from gazette.spiders.base import BaseGazetteSpider
 
 
-class MgUberabaSpider(DospGazetteSpider):
+class MgUberabaSpider(BaseGazetteSpider):
     TERRITORY_ID = "3170107"
-    name = "mg_uberaba"
+    name = "mg_uberaba_2003"
 
-    code = 2364
     start_date = dt.date(2003, 4, 25)
+    end_date = dt.date(2021, 9, 1)
 
     def start_requests(self):
-        # Gazettes older than this date didn't use DOSP system
-        older_collection_end_date = dt.date(2021, 9, 2)
-
-        if self.end_date >= older_collection_end_date:
-            start_date = max([older_collection_end_date, self.start_date])
-            end_date = self.end_date
-            yield from self._dosp_request(start_date, end_date)
-
-        if self.start_date < older_collection_end_date:
-            start_date = self.start_date
-            end_date = min([older_collection_end_date, self.end_date])
-            yield from self._older_collection_request(start_date, end_date)
-
-    def _older_collection_request(self, start_date, end_date):
-        for year in range(start_date.year, end_date.year + 1):
+        for year in range(self.start_date.year, self.end_date.year + 1):
             yield FormRequest(
                 url="http://www.uberaba.mg.gov.br/portal/listImagesHtml",
                 method="POST",
-                callback=self.parse_older_collection,
                 formdata={
                     "desc": "1",
                     "type": "1",
@@ -42,16 +27,24 @@ def _older_collection_request(self, start_date, end_date):
"types": "gif,jpg,png,bmp,tif,dxf,swf,dcr,mov,qt,ram,rm,avi,mpg,mpeg,asf,flv,pdf,doc,docx,xls,xlsx,zip,rar,txt,cdr,ai,eps,ppt,pptx,pot,psd,wmv",
"listAll": "1",
},
cb_kwargs={"start_date": start_date, "end_date": end_date},
)

def parse_older_collection(self, response, start_date, end_date):
def parse(self, response):
gazettes = response.css(".claGaleriaBoxFileTable")
for gazette in gazettes:
raw_date = gazette.css("::text").re_first(r"(\d{2}-\d{2}-\d{4})")
gazette_date = dt.datetime.strptime(raw_date, "%d-%m-%Y").date()
if gazette_date < start_date or gazette_date > end_date:
try:
gazette_date = dt.datetime.strptime(raw_date, "%d-%m-%Y").date()
except Exception:
self.logger.error(
f"Gazette date can't be parsed from gazette named \"{gazette.css('::text').get()}\""
)
continue

if gazette_date > self.end_date:
continue
elif gazette_date < self.start_date:
return

gazette_url = response.urljoin(
gazette.css("img::attr(onclick)").re_first(r"download\(\'(.*)\'\)")
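The rewritten parse above also changes how the date window is enforced: entries with unparseable dates are logged and skipped, gazettes newer than end_date are skipped, and the loop returns outright at the first gazette older than start_date, which assumes the portal lists files newest first. A small sketch of that window check with invented sample dates:

import datetime as dt

start_date = dt.date(2003, 4, 25)
end_date = dt.date(2021, 9, 1)

# Invented listing, newest first, including one entry without a parseable date
raw_dates = ["02-09-2021", "31-08-2021", "Edicao especial", "24-04-2003"]

for raw_date in raw_dates:
    try:
        gazette_date = dt.datetime.strptime(raw_date, "%d-%m-%Y").date()
    except ValueError:
        continue  # the spider logs the bad name and moves on

    if gazette_date > end_date:
        continue  # newer than this spider's window
    elif gazette_date < start_date:
        break  # listing is date-descending, so nothing older can match
    print(gazette_date)  # prints only 2021-08-31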
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/mg/mg_uberaba_2021.py
@@ -0,0 +1,10 @@
+import datetime as dt
+
+from gazette.spiders.base.dosp import DospGazetteSpider
+
+
+class MgUberabaSpider(DospGazetteSpider):
+    TERRITORY_ID = "3170107"
+    name = "mg_uberaba_2021"
+    code = 2364
+    start_date = dt.date(2021, 9, 1)
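Taken together, the Uberaba collection is now split at the DOSP cutover: mg_uberaba_2003 scrapes the old municipal portal up to 2021-09-01, and mg_uberaba_2021 queries the DOSP API from that date onwards (both spiders declare the boundary day, so it is covered either way). A sketch, using a hypothetical helper spider_for that is not in the codebase, of which spider owns a given gazette date:

import datetime as dt

# end_date of mg_uberaba_2003 and start_date of mg_uberaba_2021
DOSP_CUTOVER = dt.date(2021, 9, 1)

def spider_for(date: dt.date) -> str:
    # Hypothetical helper: route a gazette date to the spider that collects it
    return "mg_uberaba_2003" if date < DOSP_CUTOVER else "mg_uberaba_2021"

print(spider_for(dt.date(2010, 1, 1)))   # mg_uberaba_2003
print(spider_for(dt.date(2023, 6, 15)))  # mg_uberaba_2021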
