Skip to content

Commit

Permalink
Corrige Uberaba (MG) para novo DOSP
Browse files Browse the repository at this point in the history
  • Loading branch information
ogecece authored and trevineju committed May 22, 2024
1 parent 6196bb8 commit b3d6d25
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 25 deletions.
1 change: 0 additions & 1 deletion data_collection/gazette/spiders/base/dosp.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ class DospGazetteSpider(BaseGazetteSpider):
start_date = None

allowed_domains = ["dosp.com.br"]
end_date = datetime.today().date()

def start_requests(self):
yield scrapy.Request(f"https://dosp.com.br/api/index.php/dioe.js/{self.code}")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,36 +3,21 @@
from scrapy import FormRequest

from gazette.items import Gazette
from gazette.spiders.base.dosp import DospGazetteSpider
from gazette.spiders.base import BaseGazetteSpider


class MgUberabaSpider(DospGazetteSpider):
class MgUberabaSpider(BaseGazetteSpider):
TERRITORY_ID = "3170107"
name = "mg_uberaba"
name = "mg_uberaba_2003"

code = 2364
start_date = dt.date(2003, 4, 25)
end_date = dt.date(2021, 9, 1)

def start_requests(self):
# Gazettes older than this date didn't use DOSP system
older_collection_end_date = dt.date(2021, 9, 2)

if self.end_date >= older_collection_end_date:
start_date = max([older_collection_end_date, self.start_date])
end_date = self.end_date
yield from self._dosp_request(start_date, end_date)

if self.start_date < older_collection_end_date:
start_date = self.start_date
end_date = min([older_collection_end_date, self.end_date])
yield from self._older_collection_request(start_date, end_date)

def _older_collection_request(self, start_date, end_date):
for year in range(start_date.year, end_date.year + 1):
for year in range(self.start_date.year, self.end_date.year + 1):
yield FormRequest(
url="http://www.uberaba.mg.gov.br/portal/listImagesHtml",
method="POST",
callback=self.parse_older_collection,
formdata={
"desc": "1",
"type": "1",
Expand All @@ -42,16 +27,24 @@ def _older_collection_request(self, start_date, end_date):
"types": "gif,jpg,png,bmp,tif,dxf,swf,dcr,mov,qt,ram,rm,avi,mpg,mpeg,asf,flv,pdf,doc,docx,xls,xlsx,zip,rar,txt,cdr,ai,eps,ppt,pptx,pot,psd,wmv",
"listAll": "1",
},
cb_kwargs={"start_date": start_date, "end_date": end_date},
)

def parse_older_collection(self, response, start_date, end_date):
def parse(self, response):
gazettes = response.css(".claGaleriaBoxFileTable")
for gazette in gazettes:
raw_date = gazette.css("::text").re_first(r"(\d{2}-\d{2}-\d{4})")
gazette_date = dt.datetime.strptime(raw_date, "%d-%m-%Y").date()
if gazette_date < start_date or gazette_date > end_date:
try:
gazette_date = dt.datetime.strptime(raw_date, "%d-%m-%Y").date()
except Exception:
self.logger.error(
f"Gazette date can't be parsed from gazette named \"{gazette.css('::text').get()}\""
)
continue

if gazette_date > self.end_date:
continue
elif gazette_date < self.start_date:
return

gazette_url = response.urljoin(
gazette.css("img::attr(onclick)").re_first(r"download\(\'(.*)\'\)")
Expand Down
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/mg/mg_uberaba_2021.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import datetime as dt

from gazette.spiders.base.dosp import DospGazetteSpider


class MgUberabaSpider(DospGazetteSpider):
    """Spider for the Uberaba (MG) gazettes published through DOSP.

    This class defines no methods of its own: request generation and
    parsing come from ``DospGazetteSpider``. Only the per-city settings
    are declared here.
    """

    # Spider identifier; the "_2021" suffix matches the year of start_date.
    name = "mg_uberaba_2021"

    # NOTE(review): presumably the IBGE municipality code for Uberaba - confirm.
    TERRITORY_ID = "3170107"

    # Earliest gazette date this spider is configured for.
    start_date = dt.date(2021, 9, 1)

    # DOSP identifier for the city; the base spider interpolates it into
    # the dosp.com.br API URL it requests.
    code = 2364

0 comments on commit b3d6d25

Please sign in to comment.