From 7de037c8206244ec582dca5fead1805053c653d6 Mon Sep 17 00:00:00 2001 From: Ariel Pontes Date: Tue, 10 Mar 2020 18:03:59 +0200 Subject: [PATCH] Fix UNGM parsing and add logging for better debugging. --- app/management/commands/update_ungm.py | 2 +- app/parsers/ungm.py | 52 ++++++++++++++++++-------- app/server_requests.py | 7 ++-- 3 files changed, 41 insertions(+), 20 deletions(-) diff --git a/app/management/commands/update_ungm.py b/app/management/commands/update_ungm.py index c6eddff..3505565 100644 --- a/app/management/commands/update_ungm.py +++ b/app/management/commands/update_ungm.py @@ -45,7 +45,7 @@ def handle(self, *args, **kwargs): return success_msg except Exception as error: self.stdout.write( - self.style.ERROR('TED tenders update failed: {}'.format(error)) + self.style.ERROR('UNGM tenders update failed: {}'.format(error)) ) send_error_email(str(error)) raise diff --git a/app/parsers/ungm.py b/app/parsers/ungm.py index 9d300d5..dd2450c 100644 --- a/app/parsers/ungm.py +++ b/app/parsers/ungm.py @@ -1,4 +1,5 @@ from datetime import date, datetime, timedelta +import logging import re import requests from tempfile import TemporaryFile @@ -16,12 +17,17 @@ class UNGMWorker: requester = get_request_class(public=True) def parse_tenders(self, tenders): + """ + Args: + tenders: QuerySet + """ codes = UNSPSCCode.objects.all() parsed_tenders = [] for tender in tenders: - text = self.requester.get_request(getattr(tender, 'url')) - parsed_tenders.append( - self.parse_ungm_notice(text, getattr(tender, 'url'), codes)) + html = self.requester.get_request(tender.url) + parsed_tender = self.parse_ungm_notice(html, tender.url, codes) + if parsed_tender: + parsed_tenders.append(parsed_tender) return UNGMWorker.update_ungm_tenders(parsed_tenders) def parse_latest_notices(self, last_date): @@ -39,9 +45,9 @@ def parse_latest_notices(self, last_date): break parsed_tenders = [] for tender in extracted_tenders: - text = self.requester.get_request(tender['url']) + html = 
self.requester.get_request(tender['url']) parsed_tender = self.parse_ungm_notice( - text, tender['url'], codes) + html, tender['url'], codes) if parsed_tender: parsed_tenders.append(parsed_tender) ungm_tenders, added_tenders = UNGMWorker.update_ungm_tenders( @@ -65,14 +71,11 @@ def parse_ungm_notice_list(html): soup = BeautifulSoup(html, 'html.parser') tenders = soup.select('div.tableRow.dataRow') - if not tenders: - raise Exception( - 'UNGM scraping failed. Cannot find tenders in HTML.') - endpoint = settings.UNGM_ENDPOINT_URI - tenders_list = [ - { + tenders_list = [] + for tender in tenders: + tender_dict = { 'published': UNGMWorker.parse_date( tender.contents[7].span.string.strip(), '%d-%b-%Y'), 'reference': tender.contents[13].span.string.strip(), @@ -81,13 +84,27 @@ def parse_ungm_notice_list(html): if tender.contents[3].a['href'].strip() else '' ), } - for tender in tenders - ] + if tender_dict['url'] == 'http://google.com': + # This is a bug that should be investigated + logging.error('Invalid tender.', exc_info=True) + else: + tenders_list.append(tender_dict) return tenders_list @staticmethod def parse_ungm_notice(html, url, codes): + """ + Args: + html: A string containing the HTML returned when accessing `url` + url: A string containing the URL where details about a specific + tender can be found. + codes: A list of UNSPSC codes. In principle it should be + UNSPSCCode.objects.all(). + Returns: + A dictionary representing a tender and its documents, or None if + the tender is invalid.
+ """ soup = BeautifulSoup(html, 'html.parser') documents = UNGMWorker.find_by_class(soup, "lnkShowDocument", "a") description = UNGMWorker.find_by_class( @@ -109,8 +126,6 @@ def parse_ungm_notice(html, url, codes): soup, "highlighted", "span", True) reference = UNGMWorker.find_by_span(soup, 'Reference:') - if not reference: - raise Exception('Parsed UNGM tender has no reference.') published = UNGMWorker.find_by_span(soup, 'Published on:') deadline = UNGMWorker.find_by_span(soup, 'Deadline on:') @@ -142,6 +157,11 @@ def parse_ungm_notice(html, url, codes): 'unspsc_codes': ', '.join(unspsc_codes), } + if not tender['reference']: + logging.error( + 'Parsed UNGM tender has no reference.', exc_info=True) + return None + tender_item = { 'tender': tender, 'documents': [ @@ -199,7 +219,7 @@ def update_ungm_tenders(parsed_tenders): for doc in item['documents']: try: tender_doc = TenderDocument.objects.get( - tender=old_tender, name=doc['name']) + tender=new_tender, name=doc['name']) for k, v in doc.items(): old_value = getattr(tender_doc, k) diff --git a/app/server_requests.py b/app/server_requests.py index 74f6050..86fea6a 100644 --- a/app/server_requests.py +++ b/app/server_requests.py @@ -117,13 +117,14 @@ def get_data(self, url, last_date, index): def request(self, url, last_date, index): # Original for i in range(0, 3): - resp = self.post_request(url, url + '/Search', self.get_data(url, last_date, index)) + resp = self.post_request( + url, url + '/Search', self.get_data(url, last_date, index)) if resp: return resp sleep(randint(10, 15)) - def post_request(self, get_url, post_url, data, headers=HEADERS, - content_type=None): + def post_request( + self, get_url, post_url, data, headers=HEADERS, content_type=None): """ AJAX-like POST request. Does a GET initially to receive cookies that are used to the subsequent POST request.