diff --git a/data_collection/gazette/spiders/base/dosp.py b/data_collection/gazette/spiders/base/dosp.py index 9d2fea476..35de61a48 100644 --- a/data_collection/gazette/spiders/base/dosp.py +++ b/data_collection/gazette/spiders/base/dosp.py @@ -14,7 +14,6 @@ class DospGazetteSpider(BaseGazetteSpider): start_date = None allowed_domains = ["dosp.com.br"] - end_date = datetime.today().date() def start_requests(self): yield scrapy.Request(f"https://dosp.com.br/api/index.php/dioe.js/{self.code}") diff --git a/data_collection/gazette/spiders/mg/mg_uberaba.py b/data_collection/gazette/spiders/mg/mg_uberaba_2003.py similarity index 55% rename from data_collection/gazette/spiders/mg/mg_uberaba.py rename to data_collection/gazette/spiders/mg/mg_uberaba_2003.py index 3639d6bdc..f60633add 100644 --- a/data_collection/gazette/spiders/mg/mg_uberaba.py +++ b/data_collection/gazette/spiders/mg/mg_uberaba_2003.py @@ -3,36 +3,21 @@ from scrapy import FormRequest from gazette.items import Gazette -from gazette.spiders.base.dosp import DospGazetteSpider +from gazette.spiders.base import BaseGazetteSpider -class MgUberabaSpider(DospGazetteSpider): +class MgUberabaSpider(BaseGazetteSpider): TERRITORY_ID = "3170107" - name = "mg_uberaba" + name = "mg_uberaba_2003" - code = 2364 start_date = dt.date(2003, 4, 25) + end_date = dt.date(2021, 9, 1) def start_requests(self): - # Gazettes older than this date didn't use DOSP system - older_collection_end_date = dt.date(2021, 9, 2) - - if self.end_date >= older_collection_end_date: - start_date = max([older_collection_end_date, self.start_date]) - end_date = self.end_date - yield from self._dosp_request(start_date, end_date) - - if self.start_date < older_collection_end_date: - start_date = self.start_date - end_date = min([older_collection_end_date, self.end_date]) - yield from self._older_collection_request(start_date, end_date) - - def _older_collection_request(self, start_date, end_date): - for year in range(start_date.year, end_date.year + 1): + for year in range(self.start_date.year, self.end_date.year + 1): yield FormRequest( url="http://www.uberaba.mg.gov.br/portal/listImagesHtml", method="POST", - callback=self.parse_older_collection, formdata={ "desc": "1", "type": "1", @@ -42,16 +27,24 @@ def _older_collection_request(self, start_date, end_date): "types": "gif,jpg,png,bmp,tif,dxf,swf,dcr,mov,qt,ram,rm,avi,mpg,mpeg,asf,flv,pdf,doc,docx,xls,xlsx,zip,rar,txt,cdr,ai,eps,ppt,pptx,pot,psd,wmv", "listAll": "1", }, - cb_kwargs={"start_date": start_date, "end_date": end_date}, ) - def parse_older_collection(self, response, start_date, end_date): + def parse(self, response): gazettes = response.css(".claGaleriaBoxFileTable") for gazette in gazettes: raw_date = gazette.css("::text").re_first(r"(\d{2}-\d{2}-\d{4})") - gazette_date = dt.datetime.strptime(raw_date, "%d-%m-%Y").date() - if gazette_date < start_date or gazette_date > end_date: + try: + gazette_date = dt.datetime.strptime(raw_date, "%d-%m-%Y").date() + except Exception: + self.logger.error( + f"Gazette date can't be parsed from gazette named \"{gazette.css('::text').get()}\"" + ) + continue + + if gazette_date > self.end_date: continue + elif gazette_date < self.start_date: + return gazette_url = response.urljoin( gazette.css("img::attr(onclick)").re_first(r"download\(\'(.*)\'\)") diff --git a/data_collection/gazette/spiders/mg/mg_uberaba_2021.py b/data_collection/gazette/spiders/mg/mg_uberaba_2021.py new file mode 100644 index 000000000..62737d4ac --- /dev/null +++ b/data_collection/gazette/spiders/mg/mg_uberaba_2021.py @@ -0,0 +1,10 @@ +import datetime as dt + +from gazette.spiders.base.dosp import DospGazetteSpider + + +class MgUberabaSpider(DospGazetteSpider): + TERRITORY_ID = "3170107" + name = "mg_uberaba_2021" + code = 2364 + start_date = dt.date(2021, 9, 1)