Retrain with parquet files #87

Closed
src/entity_extraction/training/hf_token_classification/huggingface_preprocess.py
@@ -74,18 +74,22 @@
labelled_chunks = []

for file in os.listdir(data_folder):
# if file doesn't end with txt skip it
if not file.endswith(".txt"):
continue

with open(os.path.join(data_folder, file), "r") as f:
task = json.load(f)

try:
raw_text = task["task"]["data"]["text"]
annotation_result = task["result"]
gdd_id = task["task"]["data"]["gdd_id"]

if file.endswith(".txt"):
with open(os.path.join(data_folder, file), "r") as f:
task = json.load(f)
annotation_result = task["result"]
gdd_id = task["task"]["data"]["gdd_id"]
raw_text = task["task"]["data"]["text"]
elif file.endswith(".json"):
with open(os.path.join(data_folder, file), "r") as f:
task = json.load(f)
annotation_result = task["result"]
gdd_id = task["data"]["gdd_id"]
raw_text = task["data"]["text"]

else:
continue


labelled_entities = [
annotation["value"] for annotation in annotation_result
]
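As a reading aid for the hunk above, here is a minimal sketch of the two task-file layouts the loader now accepts. The key paths are taken from the code; every value, including the `SITE` label and the IDs, is invented for illustration.

```python
# Illustrative only: minimal task structures matching the keys read above.
# All values are made up; real .txt files are Label Studio exports and real
# .json files are produced by the parquet extraction step.

# Label Studio export saved with a .txt extension:
txt_task = {
    "task": {
        "data": {
            "gdd_id": "5c3a1b2e8a90e24a55555555",  # hypothetical xDD article ID
            "text": "Pollen was sampled at Lake Ontario in 1987.",
        }
    },
    "result": [
        {"value": {"text": "Lake Ontario", "start": 22, "end": 34, "labels": ["SITE"]}}
    ],
}

# File written from a reviewed parquet row and saved with a .json extension
# (gdd_id and text sit directly under "data", not under "task"):
json_task = {
    "data": {
        "gdd_id": "5c3a1b2e8a90e24a55555555",
        "text": "Pollen was sampled at Lake Ontario in 1987.",
        # plus doi, timestamp, chunk_hash, article_hash in the generated files
    },
    "result": [
        {"value": {"text": "Lake Ontario", "start": 22, "end": 34, "labels": ["SITE"]}}
    ],
}
```

Either layout yields the same `annotation_result`, `gdd_id`, and `raw_text` triple once loaded.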
3 changes: 1 addition & 2 deletions src/entity_extraction/training/spacy_ner/README.md
@@ -12,11 +12,10 @@ This folder contains the training and evaluation scripts for the SpaCy Transform
## Training Workflow

A bash script is used to initialize a training job. Model training is fully customizable, and users are encouraged to update the parameters in the `run_spacy_training.sh` and `spacy_transformer_train.cfg` files prior to training. The training workflow is as follows:
1. Create a new data directory and dump all the TXT files (contains annotations in the JSONLines format) from Label Studio.
1. Create a new data directory and dump into it all the JSON annotation files exported from Label Studio, along with any reviewed parquet files.
2. Most parameters can be used with their default values; open the `run_spacy_training.sh` bash script and update the following fields with absolute or relative paths from the root of the repository:
- `DATA_PATH`: path to directory with Label Studio labelled data
- `DATA_OUTPUT_PATH`: path to directory to store the split dataset (train/val/test) as well as other data artifacts required for training.
- `MODEL_PATH`: If retraining, specify path to model artifacts. If training a model from scratch, pass empty string `""`
- `MODEL_OUTPUT_PATH`: path to store new model artifacts
- `VERSION`: Version can be updated to keep track of different training runs.
- `--gpu-id`: While executing the `spacy train` command, GPU can be used, if available, by setting this flag to **0**.
46 changes: 14 additions & 32 deletions src/entity_extraction/training/spacy_ner/run_spacy_training.sh
@@ -9,7 +9,6 @@ echo "Current working directory: $(pwd)"

DATA_PATH="/path/to/sample input folder"
DATA_OUTPUT_PATH="/path/to/sample output folder"
MODEL_PATH="/path/to/model artifacts"
MODEL_OUTPUT_PATH="/path/to/new model artifacts"
VERSION="v1"
TRAIN_SPLIT=0.7
@@ -28,34 +27,17 @@ python3 src/preprocessing/labelling_data_split.py \

python3 src/preprocessing/spacy_preprocess.py --data_path $DATA_OUTPUT_PATH

if [ -z "$MODEL_PATH" ]; then
# If the model path is null, then start training from scratch

# Fill configuration with required fields
python -m spacy init fill-config \
src/entity_extraction/training/spacy_ner/spacy_transformer_train.cfg \
src/entity_extraction/training/spacy_ner/spacy_transformer_$VERSION.cfg

# Execute the training job by pointing to the new config file
python -m spacy train \
src/entity_extraction/training/spacy_ner/spacy_transformer_$VERSION.cfg \
--paths.train $DATA_OUTPUT_PATH/train.spacy \
--paths.dev $DATA_OUTPUT_PATH/val.spacy \
--output $MODEL_OUTPUT_PATH \
--gpu-id -1

else
# Else create a new config file to resume training
python src/entity_extraction/training/spacy_ner/create_config.py \
--model_path $MODEL_PATH \
--output_path src/entity_extraction/training/spacy_ner/spacy_transformer_$VERSION.cfg

python -m spacy train \
src/entity_extraction/training/spacy_ner/spacy_transformer_$VERSION.cfg \
--paths.train $DATA_OUTPUT_PATH/train.spacy \
--paths.dev $DATA_OUTPUT_PATH/val.spacy \
--components.ner.source $MODEL_PATH \
--components.transformer.source $MODEL_PATH \
--output $MODEL_OUTPUT_PATH \
--gpu-id -1
fi
# Start training from scratch

# Fill configuration with required fields
python -m spacy init fill-config \
src/entity_extraction/training/spacy_ner/spacy_transformer_train.cfg \
src/entity_extraction/training/spacy_ner/spacy_transformer_$VERSION.cfg

# Execute spacy CLI training
python -m spacy train \
src/entity_extraction/training/spacy_ner/spacy_transformer_$VERSION.cfg \
--paths.train $DATA_OUTPUT_PATH/train.spacy \
--paths.dev $DATA_OUTPUT_PATH/val.spacy \
--output $MODEL_OUTPUT_PATH \
--gpu-id -1
4 changes: 2 additions & 2 deletions src/preprocessing/README.md
@@ -94,7 +94,7 @@ This script takes labelled dataset in JSONLines format as input and splits it in
The resulting train, validation, and test sets can be used for training and evaluating machine learning models.

#### **Options**
- `--raw_label_path=<raw_label_path>`: Specify the path to the directory where the raw label files are located.
- `--raw_label_path=<raw_label_path>`: Specify the path to the directory containing the raw label files exported from Label Studio and the parquet files with the reviewed entities.

- `--output_path=<output_path>`: Specify the path to the directory where the output files will be written.

@@ -126,4 +126,4 @@ This script manages the creation of custom data artifacts required for training
4. Creates the custom data artifacts that can be used for training or fine-tuning spaCy models.

#### **Options**
- `--data_path=<data_path>`: Specify the path to the folder containing files in JSONLines format.
- `--data_path=<data_path>`: Specify the path to the folder containing the annotation files in TXT/JSON format.
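For context on the "custom data artifacts" mentioned above: `run_spacy_training.sh` points `spacy train` at binary `train.spacy`/`val.spacy` files, which are serialized `DocBin` objects. The sketch below shows how character-offset annotations are generally packed into such an artifact; it is a generic example under assumed names (`annotated`, the `SITE`/`AGE` labels, the output path), not this repository's `spacy_preprocess.py` implementation.

```python
# Sketch only: generic conversion of character-offset annotations into a
# spaCy DocBin artifact; names, labels, and paths here are illustrative.
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
db = DocBin()

annotated = [
    # (text, [(start_char, end_char, label), ...]) -- invented example
    ("Pollen was sampled at Lake Ontario in 1987.", [(22, 34, "SITE"), (38, 42, "AGE")]),
]

for text, spans in annotated:
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in spans:
        # contract to token boundaries; drop spans that cannot be aligned
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is not None:
            ents.append(span)
    doc.ents = ents
    db.add(doc)

db.to_disk("train.spacy")  # the artifact later passed to `spacy train --paths.train`
```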
143 changes: 118 additions & 25 deletions src/preprocessing/labelling_data_split.py
@@ -17,14 +17,16 @@
import numpy as np
import shutil
import json

from collections import defaultdict
from datetime import datetime
from docopt import docopt

sys.path.append(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir))

from src.logs import get_logger
logger = get_logger(__name__)
from src.preprocessing.labelling_preprocessing import get_hash

logger = get_logger(__name__)

def separate_labels_to_train_val_test(
labelled_file_path: str,
@@ -74,6 +76,9 @@
os.makedirs(os.path.join(output_path, "val"), exist_ok=True)
os.makedirs(os.path.join(output_path, "test"), exist_ok=True)

# Checks for parquet files and extracts them
extract_parquet_file(labelled_file_path)

gdd_ids = get_article_gdd_ids(labelled_file_path)

logger.info(f"Found {len(gdd_ids)} unique GDD IDs in the labelled data.")
@@ -156,20 +161,24 @@
},
}

# iterate through the files in the folder and convert them to the hf format
for file in os.listdir(labelled_file_path):
# if file doesn't end with txt skip it
if not file.endswith(".txt"):
continue

with open(os.path.join(labelled_file_path, file), "r") as f:
task = json.load(f)

try:
gdd_id = task["task"]["data"]["gdd_id"]
raw_text = task["task"]["data"]["text"]
annotation_result = task["result"]

try:
if file.endswith(".txt"):
with open(os.path.join(labelled_file_path, file), "r") as f:
task = json.load(f)
annotation_result = task["result"]
gdd_id = task["task"]["data"]["gdd_id"]
raw_text = task["task"]["data"]["text"]
elif file.endswith(".json"):
with open(os.path.join(labelled_file_path, file), "r") as f:
task = json.load(f)
annotation_result = task["result"]
gdd_id = task["data"]["gdd_id"]
raw_text = task["data"]["text"]

else:
continue

# get the number of words in the article
num_words = len(raw_text.split())

@@ -229,8 +238,16 @@
json.dump(data_metrics, f, indent=2)

logger.info("Finished separating files into train, val and test sets.")


logger.info(
f"Found {data_metrics['train']['entity_counts']} entities in {data_metrics['train']['article_count']} articles in train set."
)
logger.info(
f"Found {data_metrics['val']['entity_counts']} entities in {data_metrics['val']['article_count']} articles in val set."
)
logger.info(
f"Found {data_metrics['test']['entity_counts']} entities in {data_metrics['test']['article_count']} articles in test set."
)

def get_article_gdd_ids(labelled_file_path: str):
"""
Parameters
@@ -256,24 +273,100 @@

# iterate through the files and get the unique gdd_ids
gdd_ids = []

for file in os.listdir(labelled_file_path):
# if file doesn't end with txt skip it
if not file.endswith(".txt"):
continue

with open(os.path.join(labelled_file_path, file), "r") as f:
task = json.load(f)


try:
gdd_id = task["task"]["data"]["gdd_id"]
if file.endswith(".txt"):
with open(os.path.join(labelled_file_path, file), "r") as f:
task = json.load(f)
gdd_id = task["task"]["data"]["gdd_id"]
elif file.endswith(".json"):
with open(os.path.join(labelled_file_path, file), "r") as f:
task = json.load(f)
gdd_id = task["data"]["gdd_id"]

else:
continue
except Exception as e:
logger.warning(f"Issue with file data: {file}, {e}")

continue


if gdd_id not in gdd_ids:
gdd_ids.append(gdd_id)

return gdd_ids

def extract_parquet_file(labelled_file_path: str):
"""Checks the directory for parquet files and extracts the corrected entities

Parameters
----------
labelled_file_path: str
Directory containing the data files
"""

files = os.listdir(labelled_file_path)

# Iterate through the files and check if they are parquet files
for fin in files:
if fin.endswith(".parquet"):
df = pd.read_parquet(os.path.join(labelled_file_path, fin))


logger.info(f"Read parquet file {fin} with {len(df)} rows.")


for index, row in df.iterrows():


output_files = defaultdict(list)
all_sentences = {}
gdd_id = row["gddid"]
if row["corrected_entities"] != "None":


logger.info(f"Entities found in xDD ID: {gdd_id}")


corrected_entities = json.loads(row["corrected_entities"])


for ent_type in corrected_entities.keys():
for entity in corrected_entities[ent_type].keys():
if corrected_entities[ent_type][entity]['corrected_name']:
entity_text = corrected_entities[ent_type][entity]['corrected_name']

else:
entity_text = entity
for sentence in corrected_entities[ent_type][entity]['sentence']:
if (sentence['char_index']['start'] != -1 and
sentence['char_index']['end'] != -1):
all_sentences[sentence['sentid']] = sentence['text']
output_files[sentence['sentid']].append({
"value": {
"text": entity_text,
"start": sentence['char_index']['start'],
"end": sentence['char_index']['end'],
"labels": [ent_type]
}
})

logger.info(f"Number of sentences extracted for training: {len(output_files)}")


# Iterate through each sentence and create a json file
for sentid in output_files.keys():
text = all_sentences[sentid]
article_data = {
"text": text,
"global_index": sentid,
"local_index": sentid,
"gdd_id": gdd_id,
"doi": row['DOI'],
"timestamp": str(datetime.today()),
"chunk_hash": get_hash(text),
"article_hash": get_hash(text),
}
output_data = {
"data": article_data,
"result": output_files[sentid]
}
file_name = os.path.join(labelled_file_path, f"{gdd_id}_{sentid}.json")

# Save the dictionary as a json file
with open(file_name, "w") as f:
json.dump(output_data, f, indent=2)


def main():
opt = docopt(__doc__)
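To make the new `extract_parquet_file` step easier to follow, here is a hedged sketch of one reviewed parquet row as the function reads it. The key names mirror the code above, while the entity, the `SITE` label, the IDs, and the DOI are invented for illustration.

```python
# Illustrative only: shape of a single reviewed parquet row consumed by
# extract_parquet_file(). Keys come from the code above; values are invented.
import json

example_row = {
    "gddid": "5c3a1b2e8a90e24a55555555",   # hypothetical xDD article ID
    "DOI": "10.1234/example.doi",          # hypothetical DOI
    # stored as a JSON string in the parquet file:
    "corrected_entities": json.dumps({
        "SITE": {                                    # entity type (label)
            "Lake Ontario": {                        # entity as originally extracted
                "corrected_name": "Ontario, Lake",   # reviewer's correction, may be empty
                "sentence": [
                    {
                        "sentid": 12,
                        "text": "Pollen was sampled at Lake Ontario in 1987.",
                        "char_index": {"start": 22, "end": 34},
                    }
                ],
            }
        }
    }),
}

# For each sentence with valid char_index values, the function writes a
# "<gddid>_<sentid>.json" task file shaped roughly like:
# {
#   "data":   {"text": ..., "gdd_id": ..., "doi": ..., "chunk_hash": ..., ...},
#   "result": [{"value": {"text": "Ontario, Lake", "start": 22, "end": 34,
#                         "labels": ["SITE"]}}]
# }
```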
1 change: 0 additions & 1 deletion src/preprocessing/labelling_preprocessing.py
@@ -33,7 +33,6 @@
from src.logs import get_logger
# logger = logging.getLogger(__name__)
logger = get_logger(__name__)
logger.setLevel(logging.INFO)

from src.entity_extraction.baseline_entity_extraction import baseline_extract_all
from src.entity_extraction.spacy_entity_extraction import spacy_extract_all