From 020137901cc4017e075811f47a07b2665749605d Mon Sep 17 00:00:00 2001
From: "sandip.purplecheetah" <sandip.purplecheetah@outlook.com>
Date: Tue, 16 Jul 2019 16:52:13 +0545
Subject: [PATCH] Exclude leading/trailing whitespace in annotated text by
 adjusting the start and end indices accordingly

---
 train.py | 34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/train.py b/train.py
index 8bdc7e1d..a5860ae6 100644
--- a/train.py
+++ b/train.py
@@ -24,21 +24,29 @@ def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
             data = json.loads(line)
             text = data['content']
             entities = []
-            for annotation in data['annotation']:
-                #only a single point in text annotation.
-                point = annotation['points'][0]
-                labels = annotation['label']
-                # handle both list of labels or a single label.
-                if not isinstance(labels, list):
-                    labels = [labels]
-
-                for label in labels:
-                    #dataturks indices are both inclusive [start, end] but spacy is not [start, end)
-                    entities.append((point['start'], point['end'] + 1 ,label))
-
+            data_annotations = data['annotation']
+            if data_annotations is not None:
+                for annotation in data_annotations:
+                    #only a single point in text annotation.
+                    point = annotation['points'][0]
+                    labels = annotation['label']
+                    # handle both list of labels or a single label.
+                    if not isinstance(labels, list):
+                        labels = [labels]
 
+                    for label in labels:
+                        point_start = point['start']
+                        point_end = point['end']
+                        point_text = point['text']
+                        
+                        lstrip_diff = len(point_text) - len(point_text.lstrip())
+                        rstrip_diff = len(point_text) - len(point_text.rstrip())
+                        if lstrip_diff != 0:
+                            point_start = point_start + lstrip_diff
+                        if rstrip_diff != 0:
+                            point_end = point_end - rstrip_diff
+                        entities.append((point_start, point_end + 1 , label))
             training_data.append((text, {"entities" : entities}))
-
         return training_data
     except Exception as e:
         logging.exception("Unable to process " + dataturks_JSON_FilePath + "\n" + "error = " + str(e))