From 020137901cc4017e075811f47a07b2665749605d Mon Sep 17 00:00:00 2001 From: "sandip.purplecheetah" Date: Tue, 16 Jul 2019 16:52:13 +0545 Subject: [PATCH] Added check for spaces in annotated text, and accordingly updated start and end index --- train.py | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/train.py b/train.py index 8bdc7e1d..a5860ae6 100644 --- a/train.py +++ b/train.py @@ -24,21 +24,29 @@ def convert_dataturks_to_spacy(dataturks_JSON_FilePath): data = json.loads(line) text = data['content'] entities = [] - for annotation in data['annotation']: - #only a single point in text annotation. - point = annotation['points'][0] - labels = annotation['label'] - # handle both list of labels or a single label. - if not isinstance(labels, list): - labels = [labels] - - for label in labels: - #dataturks indices are both inclusive [start, end] but spacy is not [start, end) - entities.append((point['start'], point['end'] + 1 ,label)) - + data_annotations = data['annotation'] + if data_annotations is not None: + for annotation in data_annotations: + #only a single point in text annotation. + point = annotation['points'][0] + labels = annotation['label'] + # handle both list of labels or a single label. + if not isinstance(labels, list): + labels = [labels] + for label in labels: + point_start = point['start'] + point_end = point['end'] + point_text = point['text'] + + lstrip_diff = len(point_text) - len(point_text.lstrip()) + rstrip_diff = len(point_text) - len(point_text.rstrip()) + if lstrip_diff != 0: + point_start = point_start + lstrip_diff + if rstrip_diff != 0: + point_end = point_end - rstrip_diff + entities.append((point_start, point_end + 1 , label)) training_data.append((text, {"entities" : entities})) - return training_data except Exception as e: logging.exception("Unable to process " + dataturks_JSON_FilePath + "\n" + "error = " + str(e))