Updating parser to regex #7

Open · wants to merge 3 commits into master
7 changes: 5 additions & 2 deletions api/__init__.py
@@ -5,6 +5,7 @@
from api.config import app_config
from parser.parser import parse
from api.util import validate_url
import json

document_url = None
callback_url = None
@@ -30,8 +31,10 @@ def run_parse():
response = {
'error': 'None',
'data': data
}
requests.post(callback_url, headers=headers, data=response)
}

res = json.dumps(response)
requests.post(callback_url, headers=headers, data=res)


def create_app(config_name):
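The serialization change above matters because handing a dict to `data=` makes requests form-encode the body instead of sending JSON. A minimal sketch of the difference, using a hypothetical callback URL and sample payload:

import json
import requests

response = {"error": "None", "data": {"nairobi_region": {}}}
headers = {"Content-Type": "application/json"}

# data=response would be form-encoded; serializing first sends a JSON body
requests.post("https://example.com/callback",   # hypothetical URL
              headers=headers,
              data=json.dumps(response))

# requests can also serialize the dict and set the Content-Type header itself:
# requests.post("https://example.com/callback", json=response)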
292 changes: 166 additions & 126 deletions parser/parser.py
@@ -2,136 +2,176 @@
from pdfminer.high_level import extract_text
from shutil import copyfileobj
import tempfile

keywords = ['REGION', 'COUNTY', 'TIME', 'DATE','AREA', ',' ]

class County:
name = None
area = None
time = None
date = None
locations = []

def serialize(self):
if self.name == None:
return

return { 'name': self.name,
'area': self.area,
'time': self.time,
'date': self.date,
'locations': self.locations
}

class Region:
region = None
counties = []

def serialize(self):
ser_counties = []
for sc in self.counties:
ser_counties.append(sc.serialize())

return {'region': self.region,
'counties': ser_counties
}

def download_file(url):
from re import search, sub, IGNORECASE
from .util import rlstrip_dot, composite_function

# {
# "region": {
# "name": "Region name",
# "counties": [
# {
# "name": "County Name",
# "areas":[
# {
# "name": "Area name",
# "details": {
# "date": "Date",
# "time": "Time",
# "locations": ["location"]
# }
# }
# ]
# }
# ]
# }
# }
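One detail the comment above does not show: the returned dictionary is keyed by a snake_cased version of each region name (see `region_key` in `get_regions` below). An illustrative sketch of the top level, with made-up values:

# hypothetical sample output (values are made up)
regions = {
    "nairobi_region": {
        "name": "NAIROBI REGION",
        "counties": [
            # county dicts as described in the comment above
        ],
    },
    "coast_region": {
        "name": "COAST REGION",
        "counties": [],
    },
}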

def get_text(url):
"""
Downloads a PDF, extracts its text, and returns the text with every '\n' replaced by '.' and runs of whitespace collapsed to a single space.
"""
r = requests.get(url, stream=True)
temFile = tempfile.TemporaryFile()
copyfileobj(r.raw, temFile)
return temFile

def get_text(file_):
return extract_text(file_)

def check_for_keyword(lines):
new_lines = []
for line in lines:
for k in keywords:
if k in line:
new_lines.append(line)
break
return new_lines

def take_lines(contents):
lines = []
contents = contents.split('\n')
lappend = lines.append
for line in contents:
if len(line) < 3: continue
lappend(line)
return check_for_keyword(lines[1:])

def parse_(lines):
hit_county, hit_region, i = 0, 0, 0
regions = []
region = Region()
county = County()
rounds = len(lines)
for line in lines:
i += 1
line = line.replace('\n', '').lstrip().rstrip()
if 'REGION' in line:
if hit_region == 0:
region.region = line
hit_region = 1
elif hit_region == 1:
# another region encountered store current
region.counties.append(county)
county = County()
regions.append(region)
region = Region()
region.region = line

elif 'COUNTY' in line:
if hit_county == 0:
county.name = line
hit_county = 1
else:
region.counties.append(county)
county = County()
county.name = line

elif 'DATE' in line and 'TIME' in line:
date_str = ''
for x in line:
if x == ' ':
continue

if x == 'T':
county.date = date_str.replace('\n', '')
date_str = x
continue

date_str += x
county.time = date_str
county.locations = lines[i].replace('\n', '').rstrip().lstrip().split(',')

elif 'DATE' in line:
county.date = line[6:]

elif 'TIME' in line:
county.time = line[6:]
county.locations = lines[i].replace('\n', '').rstrip().lstrip().split(',')

elif 'AREA' in line:
county.area = line[6:]

if i == rounds-1:
region.counties.append(county)
regions.append(region)


text = extract_text(temFile)
text = text.replace("\n", '.')
text = sub(r"[\s]{2,}", ' ', text)
return text
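A small sketch of the cleanup the new `get_text` performs on the extracted text; the input string here is made up:

from re import sub

raw = "NAIROBI REGION\n\nPARTS OF NAIROBI COUNTY\nAREA: Karen,  Hardy\n"
text = raw.replace("\n", ".")          # newlines become sentence-style dots
text = sub(r"[\s]{2,}", " ", text)     # collapse runs of whitespace
print(text)  # "NAIROBI REGION..PARTS OF NAIROBI COUNTY.AREA: Karen, Hardy."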


def get_regions(text):
""" Consumes text
Takes a chunk of text marked with REGION at the beginning and REGION at the end
From the chunk of text, counties, areas and area details are mined
"""
regions = dict()
regex = r"[.]([a-zA-Z\s]+?REGION)(.+?)[.](?:[a-zA-Z\s]+?REGION)"
region_search = search(regex, text, IGNORECASE)
while region_search:
# Get the top region
region = dict()
region["name"] = region_search.group(1).strip()
region_key = '_'.join(region["name"].lower().split(' '))
region["counties"] = get_counties(region_search.group(2), regions, region_key)
regions[region_key] = region
# Remove the region
text = text.replace(region_search.group(1), '')
text = text.replace(region_search.group(2), '')

# Do the region search again
region_search = search(regex, text, IGNORECASE)

last_region_check = search(r"[.]([a-zA-Z\s]+?REGION)(.+?customers)", text, IGNORECASE)
if last_region_check:
# Get the last region
region = dict()
region["name"] = last_region_check.group(1).strip()
region_key = '_'.join(region["name"].lower().split(' '))
region["counties"] = get_counties(last_region_check.group(2), regions, region_key)
regions[region_key] = region
return regions
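To make the chunking behaviour concrete, a standalone sketch of the REGION regex used above, run on a made-up one-line input:

from re import search, IGNORECASE

regex = r"[.]([a-zA-Z\s]+?REGION)(.+?)[.](?:[a-zA-Z\s]+?REGION)"
text = (".NAIROBI REGION.PARTS OF NAIROBI COUNTY.AREA: Karen, Hardy."
        "DATE: Sunday 05.06.2022 TIME: 9.00 A.M. – 5.00 P.M. Karen Plains, Karen Ridge"
        ".COAST REGION.PARTS OF MOMBASA COUNTY interested customers")

m = search(regex, text, IGNORECASE)
print(m.group(1))  # "NAIROBI REGION" -- the current region heading
print(m.group(2))  # the region's body, up to (not including) ".COAST REGION"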

def get_counties(text, regions, region_key):
"""Consumes text, a dictionary of regions and current region key
The text is chunked using county boundaries.
The parsed county and its details will be stored in the
dictionary value of the current region key.
"""
counties = list()
regex = r"[.]([a-zA-Z\s]+?COUNTY)(.+?)[.]([a-zA-Z\s]*?COUNTY)"
county_search = search(regex, text, IGNORECASE)
while county_search:
# Get the top county
county = dict()
county["name"] = county_search.group(1).strip()
county["areas"] = get_areas(county_search.group(2))

# Check if the region already exists
if region_key in regions.keys():
regions[region_key]["counties"].append(county)
else:
counties.append(county)

# Remove the county
text = text.replace(county_search.group(1), '')
text = text.replace(county_search.group(2), '')

# Do the county search again
county_search = search(regex, text, IGNORECASE)

last_county_check = search(r"[.]([a-zA-Z\s]+?COUNTY)(.+?)$", text, IGNORECASE)
if last_county_check:
# Get the last county
county = dict()
county["name"] = last_county_check.group(1).strip()
county["areas"] = get_areas(last_county_check.group(2))

# Check if the region already exists
if region_key in regions.keys():
regions[region_key]["counties"].append(county)
else:
counties.append(county)

return counties

def get_areas(text):
"""consume text
chunks text using AREA boundaries and capturing date
The area details (time and date) are mined from the date capture group
"""
areas = list()
regex = r"(AREA:[a-zA-Z\s,]+[.])(DATE.+?)AREA"
area_search = search(regex, text, IGNORECASE)
while area_search:
# Get the top area
area = dict()
area["name"] = area_search.group(1)
area["details"] = get_details(area_search.group(2))
areas.append(area)

# Remove the area
text = text.replace(area_search.group(1), '')
text = text.replace(area_search.group(2), '')

# Do the area search again
area_search = search(regex, text, IGNORECASE)

last_area_check = search(r"(AREA:[a-zA-Z\s,]+[.])(DATE.+?)$", text, IGNORECASE)
if last_area_check:
# Get the last area
area = dict()
area["name"] = last_area_check.group(1)
area["details"] = get_details(last_area_check.group(2))
areas.append(area)

return areas
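Similarly, a sketch of the AREA regex on a made-up county chunk; note that group(1) keeps the "AREA:" prefix and trailing dot, and group(2) starts at "DATE":

from re import search, IGNORECASE

county_text = ("AREA: Karen, Hardy.DATE: Sunday 05.06.2022 TIME: 9.00 A.M. – 5.00 P.M. "
               "Karen Plains, Karen Ridge.AREA: Gigiri, Muthaiga.DATE: Monday")

m = search(r"(AREA:[a-zA-Z\s,]+[.])(DATE.+?)AREA", county_text, IGNORECASE)
print(m.group(1))  # "AREA: Karen, Hardy."
print(m.group(2))  # the details chunk, up to (not including) the next "AREA"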

def get_details(text):
"""The text consumed should be from a date capture group
The text is searched for time and date
"""
details = dict()
date_search = search(r"(DATE:)(.+?)TIME", text, IGNORECASE)
if date_search:
details["date"] = date_search.group(2).strip()
text = text.replace(date_search.group(1), '')
text = text.replace(date_search.group(2), '')

time_search = search(r"(TIME:)(.+?P[.]M[.])", text, IGNORECASE)
if time_search:
details["time"] = time_search.group(2).strip()
text = text.replace(time_search.group(1), '')
text = text.replace(time_search.group(2), '')

details["locations"] = get_locations(text)

return details
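A quick sketch of the two searches above on a made-up details string; note that the TIME pattern relies on the closing time ending in "P.M.":

from re import search, IGNORECASE

details_text = "DATE: Sunday 05.06.2022 TIME: 9.00 A.M. – 5.00 P.M. Karen Plains, Karen Ridge"

date = search(r"(DATE:)(.+?)TIME", details_text, IGNORECASE).group(2).strip()
time = search(r"(TIME:)(.+?P[.]M[.])", details_text, IGNORECASE).group(2).strip()
print(date)  # "Sunday 05.06.2022"
print(time)  # "9.00 A.M. – 5.00 P.M."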

def get_locations(text):
"""Mines comma separated locations at the end of a area section"""
stripSpaces = lambda location : location.strip()
return list(map(composite_function(stripSpaces, rlstrip_dot), text.split(',')))

def parse(url):
tempFile = download_file(url)
file_data = take_lines(get_text(tempFile))
all_data = parse_(file_data)
serialized_data = []
append = serialized_data.append
for r in all_data:
append(r.serialize())
return serialized_data
""" Do everything"""
return get_regions(get_text(url))
17 changes: 17 additions & 0 deletions parser/util.py
@@ -0,0 +1,17 @@
from functools import reduce
from re import sub

# strip leading dots and trailing dots/whitespace
def rlstrip_dot(string):
return sub(r"^[\.]+|[\.\s]+$", "", string)


# helper: composes any number of functions, e.g. composite_function(f, g)(x) == f(g(x))
def composite_function(*func):

def compose(f, g):
return lambda x : f(g(x))

return reduce(compose, func, lambda x : x)
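
For reference, a small usage sketch of these helpers as they are combined in `get_locations`; the definitions are repeated so the sketch runs standalone:

from functools import reduce
from re import sub

def rlstrip_dot(string):
    return sub(r"^[\.]+|[\.\s]+$", "", string)

def composite_function(*func):
    def compose(f, g):
        return lambda x: f(g(x))
    return reduce(compose, func, lambda x: x)

strip_spaces = lambda location: location.strip()
clean = composite_function(strip_spaces, rlstrip_dot)  # applies rlstrip_dot first, then strip()

print(clean("  Karen Plains. "))   # "Karen Plains"
print(clean(".Karen Ridge"))       # "Karen Ridge"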