Skip to content

Commit

Permalink
Merge pull request #251 from eaudeweb/new-ungm
Browse files Browse the repository at this point in the history
Fix UNGM parsing and add logging for better debugging.
  • Loading branch information
dianaboiangiu authored Mar 11, 2020
2 parents 043feab + 7de037c commit 0cb0694
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 20 deletions.
2 changes: 1 addition & 1 deletion app/management/commands/update_ungm.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def handle(self, *args, **kwargs):
return success_msg
except Exception as error:
self.stdout.write(
self.style.ERROR('TED tenders update failed: {}'.format(error))
self.style.ERROR('UNGM tenders update failed: {}'.format(error))
)
send_error_email(str(error))
raise
Expand Down
52 changes: 36 additions & 16 deletions app/parsers/ungm.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from datetime import date, datetime, timedelta
import logging
import re
import requests
from tempfile import TemporaryFile
Expand All @@ -16,12 +17,17 @@ class UNGMWorker:
requester = get_request_class(public=True)

def parse_tenders(self, tenders):
"""
Args:
tenders: QuerySet
"""
codes = UNSPSCCode.objects.all()
parsed_tenders = []
for tender in tenders:
text = self.requester.get_request(getattr(tender, 'url'))
parsed_tenders.append(
self.parse_ungm_notice(text, getattr(tender, 'url'), codes))
html = self.requester.get_request(tender.url)
parsed_tender = self.parse_ungm_notice(html, tender.url, codes)
if parsed_tender:
parsed_tenders.append(parsed_tender)
return UNGMWorker.update_ungm_tenders(parsed_tenders)

def parse_latest_notices(self, last_date):
Expand All @@ -39,9 +45,9 @@ def parse_latest_notices(self, last_date):
break
parsed_tenders = []
for tender in extracted_tenders:
text = self.requester.get_request(tender['url'])
html = self.requester.get_request(tender['url'])
parsed_tender = self.parse_ungm_notice(
text, tender['url'], codes)
html, tender['url'], codes)
if parsed_tender:
parsed_tenders.append(parsed_tender)
ungm_tenders, added_tenders = UNGMWorker.update_ungm_tenders(
Expand All @@ -65,14 +71,11 @@ def parse_ungm_notice_list(html):
soup = BeautifulSoup(html, 'html.parser')
tenders = soup.select('div.tableRow.dataRow')

if not tenders:
raise Exception(
'UNGM scraping failed. Cannot find tenders in HTML.')

endpoint = settings.UNGM_ENDPOINT_URI

tenders_list = [
{
tenders_list = []
for tender in tenders:
tender_dict = {
'published': UNGMWorker.parse_date(
tender.contents[7].span.string.strip(), '%d-%b-%Y'),
'reference': tender.contents[13].span.string.strip(),
Expand All @@ -81,13 +84,27 @@ def parse_ungm_notice_list(html):
if tender.contents[3].a['href'].strip() else ''
),
}
for tender in tenders
]
if tender_dict['url'] == 'http://google.com':
# This is a bug that should be investigated
logging.error('Invalid tender.', exc_info=True)
else:
tenders_list.append(tender_dict)

return tenders_list

@staticmethod
def parse_ungm_notice(html, url, codes):
"""
Args:
html: A string containing the HTML returned when accessing `url`
url: A string containing the URL where details about a specific
tender can be found.
codes: A list of UNSPSC codes. In principle it should be
UNSPSCCode.objects.all().
Returns:
A dictionary representing a tender and its documents, or None if
the tender is invalid.
"""
soup = BeautifulSoup(html, 'html.parser')
documents = UNGMWorker.find_by_class(soup, "lnkShowDocument", "a")
description = UNGMWorker.find_by_class(
Expand All @@ -109,8 +126,6 @@ def parse_ungm_notice(html, url, codes):
soup, "highlighted", "span", True)

reference = UNGMWorker.find_by_span(soup, 'Reference:')
if not reference:
raise Exception('Parsed UNGM tender has no reference.')
published = UNGMWorker.find_by_span(soup, 'Published on:')
deadline = UNGMWorker.find_by_span(soup, 'Deadline on:')

Expand Down Expand Up @@ -142,6 +157,11 @@ def parse_ungm_notice(html, url, codes):
'unspsc_codes': ', '.join(unspsc_codes),
}

if not tender['reference']:
logging.error(
'Parsed UNGM tender has no reference.', exc_info=True)
return None

tender_item = {
'tender': tender,
'documents': [
Expand Down Expand Up @@ -199,7 +219,7 @@ def update_ungm_tenders(parsed_tenders):
for doc in item['documents']:
try:
tender_doc = TenderDocument.objects.get(
tender=old_tender, name=doc['name'])
tender=new_tender, name=doc['name'])

for k, v in doc.items():
old_value = getattr(tender_doc, k)
Expand Down
7 changes: 4 additions & 3 deletions app/server_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,13 +117,14 @@ def get_data(self, url, last_date, index):
def request(self, url, last_date, index):
# Original
for i in range(0, 3):
resp = self.post_request(url, url + '/Search', self.get_data(url, last_date, index))
resp = self.post_request(
url, url + '/Search', self.get_data(url, last_date, index))
if resp:
return resp
sleep(randint(10, 15))

def post_request(self, get_url, post_url, data, headers=HEADERS,
content_type=None):
def post_request(
self, get_url, post_url, data, headers=HEADERS, content_type=None):
"""
AJAX-like POST request. Does a GET initially to receive cookies that
are used for the subsequent POST request.
Expand Down

0 comments on commit 0cb0694

Please sign in to comment.