diff --git a/src/entity_extraction/training/hf_token_classification/huggingface_preprocess.py b/src/entity_extraction/training/hf_token_classification/huggingface_preprocess.py
index e236621..d2f1b74 100644
--- a/src/entity_extraction/training/hf_token_classification/huggingface_preprocess.py
+++ b/src/entity_extraction/training/hf_token_classification/huggingface_preprocess.py
@@ -74,18 +74,22 @@ def convert_labelled_data_to_hf_format(
     labelled_chunks = []
 
     for file in os.listdir(data_folder):
-        # if file doesn't end with txt skip it
-        if not file.endswith(".txt"):
-            continue
-
-        with open(os.path.join(data_folder, file), "r") as f:
-            task = json.load(f)
-
         try:
-            raw_text = task["task"]["data"]["text"]
-            annotation_result = task["result"]
-            gdd_id = task["task"]["data"]["gdd_id"]
-
+            if file.endswith(".txt"):
+                with open(os.path.join(data_folder, file), "r") as f:
+                    task = json.load(f)
+                annotation_result = task["result"]
+                gdd_id = task["task"]["data"]["gdd_id"]
+                raw_text = task["task"]["data"]["text"]
+            elif file.endswith(".json"):
+                with open(os.path.join(data_folder, file), "r") as f:
+                    task = json.load(f)
+                annotation_result = task["result"]
+                gdd_id = task["data"]["gdd_id"]
+                raw_text = task["data"]["text"]
+            else:
+                continue
+
             labelled_entities = [
                 annotation["value"] for annotation in annotation_result
             ]
diff --git a/src/entity_extraction/training/spacy_ner/README.md b/src/entity_extraction/training/spacy_ner/README.md
index ee24381..5690324 100644
--- a/src/entity_extraction/training/spacy_ner/README.md
+++ b/src/entity_extraction/training/spacy_ner/README.md
@@ -12,11 +12,10 @@ This folder contains the training and evaluation scripts for the SpaCy Transform
 ## Training Workflow
 
 A bash script is used to initialize a training job. Model training is fully customizable and users are encouraged to update the parameters in the `run_spacy_training.sh` and `spacy_transfomer_train.cfg` files prior to training. The training workflow is as follows:
-1. Create a new data directory and dump all the TXT files (contains annotations in the JSONLines format) from Label Studio.
+1. Create a new data directory and dump all the JSON files containing annotations from Label Studio and any reviewed parquet files.
 2. Most parameters can be used with the default value, open the `run_spacy_training.sh` bash script and update the following fields with absolute paths or relative paths from the root of the repository:
     - `DATA_PATH`: path to directory with Label Studio labelled data
     - `DATA_OUTPUT_PATH`: path to directory to store the split dataset (train/val/test) as well as other data artifacts required for training.
-    - `MODEL_PATH`: If retraining, specify path to model artifacts. If training a model from scratch, pass empty string `""`
     - `MODEL_OUTPUT_PATH`: path to store new model artifacts
     - `VERSION`: Version can be updated to keep track of different training runs.
     - `--gpu-id`: While executing the `spacy train` command, GPU can be used, if available, by setting this flag to **0**.
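For orientation, the two annotation layouts that the updated loaders above accept look roughly like the sketch below. Only the key paths (`task["task"]["data"]` for the Label Studio `.txt` exports, `task["data"]` for the `.json` files written from reviewed parquet rows) come from the patch; the field values, the "TAXA" label, and the read_record helper are illustrative placeholders.

# Minimal sketch of the two record layouts handled above; values are placeholders.
txt_style_task = {  # Label Studio export saved with a .txt extension
    "task": {"data": {"gdd_id": "sample-gdd-id", "text": "Pinus pollen was found."}},
    "result": [{"value": {"text": "Pinus", "start": 0, "end": 5, "labels": ["TAXA"]}}],
}

json_style_task = {  # .json file produced from a reviewed parquet row
    "data": {"gdd_id": "sample-gdd-id", "text": "Pinus pollen was found."},
    "result": [{"value": {"text": "Pinus", "start": 0, "end": 5, "labels": ["TAXA"]}}],
}


def read_record(task: dict, from_txt_export: bool):
    """Hypothetical helper returning (gdd_id, raw_text, annotation_result) for either layout."""
    data = task["task"]["data"] if from_txt_export else task["data"]
    return data["gdd_id"], data["text"], task["result"]


print(read_record(txt_style_task, from_txt_export=True))
print(read_record(json_style_task, from_txt_export=False))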
diff --git a/src/entity_extraction/training/spacy_ner/run_spacy_training.sh b/src/entity_extraction/training/spacy_ner/run_spacy_training.sh
index 3ff9354..cf46ccd 100755
--- a/src/entity_extraction/training/spacy_ner/run_spacy_training.sh
+++ b/src/entity_extraction/training/spacy_ner/run_spacy_training.sh
@@ -9,7 +9,6 @@ echo "Current working directory: $(pwd)"
 
 DATA_PATH="/path/to/sample input folder"
 DATA_OUTPUT_PATH="/path/to/sample output folder"
-MODEL_PATH="/path/to/model artifacts"
 MODEL_OUTPUT_PATH="/path/to/new model artifacts"
 VERSION="v1"
 TRAIN_SPLIT=0.7
@@ -28,34 +27,17 @@ python3 src/preprocessing/labelling_data_split.py \
 
 python3 src/preprocessing/spacy_preprocess.py --data_path $DATA_OUTPUT_PATH
 
-if [ -z "$MODEL_PATH" ]; then
-    # If the model path is null, then start training from scratch
-
-    # Fill configuration with required fields
-    python -m spacy init fill-config \
-        src/entity_extraction/training/spacy_ner/spacy_transformer_train.cfg \
-        src/entity_extraction/training/spacy_ner/spacy_transformer_$VERSION.cfg
-
-    # Execute the training job by pointing to the new config file
-    python -m spacy train \
-        src/entity_extraction/training/spacy_ner/spacy_transformer_$VERSION.cfg \
-        --paths.train $DATA_OUTPUT_PATH/train.spacy \
-        --paths.dev $DATA_OUTPUT_PATH/val.spacy \
-        --output $MODEL_OUTPUT_PATH \
-        --gpu-id -1
-
-else
-    # Else create a new config file to resume training
-    python src/entity_extraction/training/spacy_ner/create_config.py \
-        --model_path $MODEL_PATH \
-        --output_path src/entity_extraction/training/spacy_ner/spacy_transformer_$VERSION.cfg
-
-    python -m spacy train \
-        src/entity_extraction/training/spacy_ner/spacy_transformer_$VERSION.cfg \
-        --paths.train $DATA_OUTPUT_PATH/train.spacy \
-        --paths.dev $DATA_OUTPUT_PATH/val.spacy \
-        --components.ner.source $MODEL_PATH \
-        --components.transformer.source $MODEL_PATH \
-        --output $MODEL_OUTPUT_PATH \
-        --gpu-id -1
-fi
\ No newline at end of file
+# Start training from scratch
+
+# Fill configuration with required fields
+python -m spacy init fill-config \
+    src/entity_extraction/training/spacy_ner/spacy_transformer_train.cfg \
+    src/entity_extraction/training/spacy_ner/spacy_transformer_$VERSION.cfg
+
+# Execute spacy CLI training
+python -m spacy train \
+    src/entity_extraction/training/spacy_ner/spacy_transformer_$VERSION.cfg \
+    --paths.train $DATA_OUTPUT_PATH/train.spacy \
+    --paths.dev $DATA_OUTPUT_PATH/val.spacy \
+    --output $MODEL_OUTPUT_PATH \
+    --gpu-id -1
diff --git a/src/preprocessing/README.md b/src/preprocessing/README.md
index 3e0f385..6e0036e 100644
--- a/src/preprocessing/README.md
+++ b/src/preprocessing/README.md
@@ -94,7 +94,7 @@ This script takes labelled dataset in JSONLines format as input and splits it in
 
 The resulting train, validation, and test sets can be used for training and evaluating machine learning models.
 
 #### **Options**
-- `--raw_label_path=`: Specify the path to the directory where the raw label files are located.
+- `--raw_label_path=`: Specify the path to the directory where the raw label files exported from LabelStudio and the parquet files containing the reviewed entities are located.
 
 - `--output_path=`: Specify the path to the directory where the output files will be written.
@@ -126,4 +126,4 @@ This script manages the creation of custom data artifacts required for training
 
 4. Creates the custom data artifacts that can be used for training or fine-tuning spaCy models.
 
 #### **Options**
-- `--data_path=`: Specify the path to the folder containing files in JSONLines format.
\ No newline at end of file
+- `--data_path=`: Specify the path to the folder containing the labelled data as JSON files with a `.txt` or `.json` extension.
\ No newline at end of file
diff --git a/src/preprocessing/labelling_data_split.py b/src/preprocessing/labelling_data_split.py
index 057b98c..1ed42d2 100644
--- a/src/preprocessing/labelling_data_split.py
+++ b/src/preprocessing/labelling_data_split.py
@@ -17,14 +17,16 @@
 import numpy as np
 import shutil
 import json
-
+from collections import defaultdict
+from datetime import datetime
 from docopt import docopt
 
 sys.path.append(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir))
 
 from src.logs import get_logger
-logger = get_logger(__name__)
+from src.preprocessing.labelling_preprocessing import get_hash
 
+logger = get_logger(__name__)
 
 def separate_labels_to_train_val_test(
     labelled_file_path: str,
@@ -74,6 +76,9 @@
     os.makedirs(os.path.join(output_path, "val"), exist_ok=True)
     os.makedirs(os.path.join(output_path, "test"), exist_ok=True)
 
+    # Checks for parquet files and extracts them
+    extract_parquet_file(labelled_file_path)
+
     gdd_ids = get_article_gdd_ids(labelled_file_path)
 
     logger.info(f"Found {len(gdd_ids)} unique GDD IDs in the labelled data.")
@@ -156,20 +161,24 @@
         },
     }
 
-    # iterate through the files in the folder and convert them to the hf format
     for file in os.listdir(labelled_file_path):
         # if file doesn't end with txt skip it
-        if not file.endswith(".txt"):
-            continue
-
-        with open(os.path.join(labelled_file_path, file), "r") as f:
-            task = json.load(f)
-
-        try:
-            gdd_id = task["task"]["data"]["gdd_id"]
-            raw_text = task["task"]["data"]["text"]
-            annotation_result = task["result"]
-
+        try:
+            if file.endswith(".txt"):
+                with open(os.path.join(labelled_file_path, file), "r") as f:
+                    task = json.load(f)
+                annotation_result = task["result"]
+                gdd_id = task["task"]["data"]["gdd_id"]
+                raw_text = task["task"]["data"]["text"]
+            elif file.endswith(".json"):
+                with open(os.path.join(labelled_file_path, file), "r") as f:
+                    task = json.load(f)
+                annotation_result = task["result"]
+                gdd_id = task["data"]["gdd_id"]
+                raw_text = task["data"]["text"]
+            else:
+                continue
+
             # get the number of words in the article
             num_words = len(raw_text.split())
@@ -229,8 +238,16 @@
         json.dump(data_metrics, f, indent=2)
 
     logger.info("Finished separating files into train, val and test sets.")
-
-
+    logger.info(
+        f"Found {data_metrics['train']['entity_counts']} entities in {data_metrics['train']['article_count']} articles in train set."
+    )
+    logger.info(
+        f"Found {data_metrics['val']['entity_counts']} entities in {data_metrics['val']['article_count']} articles in val set."
+    )
+    logger.info(
+        f"Found {data_metrics['test']['entity_counts']} entities in {data_metrics['test']['article_count']} articles in test set."
+    )
+
 def get_article_gdd_ids(labelled_file_path: str):
     """
     Parameters
     ----------
@@ -256,24 +273,100 @@
     # iterate through the files and get the unique gdd_ids
     gdd_ids = []
+
     for file in os.listdir(labelled_file_path):
-        # if file doesn't end with txt skip it
-        if not file.endswith(".txt"):
-            continue
-
-        with open(os.path.join(labelled_file_path, file), "r") as f:
-            task = json.load(f)
-
+
         try:
-            gdd_id = task["task"]["data"]["gdd_id"]
+            if file.endswith(".txt"):
+                with open(os.path.join(labelled_file_path, file), "r") as f:
+                    task = json.load(f)
+                gdd_id = task["task"]["data"]["gdd_id"]
+            elif file.endswith(".json"):
+                with open(os.path.join(labelled_file_path, file), "r") as f:
+                    task = json.load(f)
+                gdd_id = task["data"]["gdd_id"]
+            else:
+                continue
         except Exception as e:
             logger.warning(f"Issue with file data: {file}, {e}")
-
+            continue
+
         if gdd_id not in gdd_ids:
             gdd_ids.append(gdd_id)
 
     return gdd_ids
 
+def extract_parquet_file(labelled_file_path: str):
+    """Checks the directory for parquet files and extracts the corrected entities
+
+    Parameter
+    ---------
+    labelled_file_path: str
+        Directory containing the data files
+    """
+
+    files = os.listdir(labelled_file_path)
+
+    # Iterate through the files and check if they are parquet files
+    for fin in files:
+        if fin.endswith(".parquet"):
+            df = pd.read_parquet(os.path.join(labelled_file_path, fin))
+
+            logger.info(f"Read parquet file {fin} with {len(df)} rows.")
+
+            for index, row in df.iterrows():
+
+                output_files = defaultdict(list)
+                all_sentences = {}
+                gdd_id = row["gddid"]
+                if row["corrected_entities"] != "None":
+
+                    logger.info(f"Entities found in xDD ID: {gdd_id}")
+
+                    corrected_entities = json.loads(row["corrected_entities"])
+
+                    for ent_type in corrected_entities.keys():
+                        for entity in corrected_entities[ent_type].keys():
+                            if corrected_entities[ent_type][entity]['corrected_name']:
+                                entity_text = corrected_entities[ent_type][entity]['corrected_name']
+                            else:
+                                entity_text = entity
+                            for sentence in corrected_entities[ent_type][entity]['sentence']:
+                                if (sentence['char_index']['start'] != -1 and
+                                        sentence['char_index']['end'] != -1):
+                                    all_sentences[sentence['sentid']] = sentence['text']
+                                    output_files[sentence['sentid']].append({
+                                        "value": {
+                                            "text": entity_text,
+                                            "start": sentence['char_index']['start'],
+                                            "end": sentence['char_index']['end'],
+                                            "labels": [ent_type]
+                                        }
+                                    })
+
+                    logger.info(f"Number of sentences extracted for training: {len(output_files)}")
+
+                    # Iterate through each sentence and create a json file
+                    for sentid in output_files.keys():
+                        text = all_sentences[sentid]
+                        article_data = {
+                            "text": text,
+                            "global_index": sentid,
+                            "local_index": sentid,
+                            "gdd_id": gdd_id,
+                            "doi": row['DOI'],
+                            "timestamp": str(datetime.today()),
+                            "chunk_hash": get_hash(text),
+                            "article_hash": get_hash(text),
+                        }
+                        output_data = {
+                            "data": article_data,
+                            "result": output_files[sentid]
+                        }
+                        file_name = os.path.join(labelled_file_path, f"{gdd_id}_{sentid}.json")
+                        # Save the dictionary as a json file
+                        with open(file_name, "w") as f:
+                            json.dump(output_data, f, indent=2)
 
 def main():
     opt = docopt(__doc__)
diff --git a/src/preprocessing/labelling_preprocessing.py b/src/preprocessing/labelling_preprocessing.py
index 9791db5..5e69d16 100644
--- a/src/preprocessing/labelling_preprocessing.py
+++ b/src/preprocessing/labelling_preprocessing.py
@@ -33,7 +33,6 @@
 from src.logs import get_logger
 # logger = logging.getLogger(__name__)
 logger = get_logger(__name__)
-logger.setLevel(logging.INFO)
 
 from src.entity_extraction.baseline_entity_extraction import baseline_extract_all
 from src.entity_extraction.spacy_entity_extraction import spacy_extract_all
diff --git a/src/preprocessing/spacy_preprocess.py b/src/preprocessing/spacy_preprocess.py
index 758b321..764be9b 100644
--- a/src/preprocessing/spacy_preprocess.py
+++ b/src/preprocessing/spacy_preprocess.py
@@ -20,9 +20,6 @@
 # ensure that the parent directory is on the path for relative imports
 sys.path.append(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir))
 
-from src.logs import get_logger
-logger = get_logger(__name__)
-
 
 def preprocess_data(data_path: str):
     """Creates data artifacts used by the Spacy model for training
@@ -36,56 +33,56 @@
     nlp = spacy.blank("en")
     train_files = glob.glob(os.path.join(data_path, "train", "*.txt"))
     val_files = glob.glob(os.path.join(data_path, "val", "*.txt"))
+    train_files.extend(glob.glob(os.path.join(data_path, "train", "*.json")))
+    val_files.extend(glob.glob(os.path.join(data_path, "val", "*.json")))
+
+    train_doc_bin = get_doc(nlp, train_files)
+    train_doc_bin.to_disk(os.path.join(data_path, "train.spacy"))
-    logger.info(
-        f"Number of files found under the train dir: {len(train_files)}")
-    logger.info(
-        f"Number of files found under the val dir: {len(val_files)}")
+    val_doc_bin = get_doc(nlp, val_files)
+    val_doc_bin.to_disk(os.path.join(data_path, "val.spacy"))
+
+def get_doc(nlp, files):
+    """Creates and saves a doc bin object for training
-    def get_doc(files):
-        """Creates and saves a doc bin object for training
+    Parameters
+    ----------
+    nlp: spacy.lang
+        A blank nlp object for english language
+    files: list
+        List of files that contain labelled entities
-        Parameters
-        ----------
-        files: list
-            List of files that contain labelled entities
-
-        Returns
-        ----------
-        doc_bin: DocBin
-            DocBin object that can be used for training the spacy model
-        """
-        doc_bin = DocBin()
-        for labelled_file in files:
-            entities = []
-            with open(labelled_file, 'r') as fin:
-                article = fin.readlines()
-                article_data = json.loads(article[0])
-                text = article_data['task']['data']["text"]
+
+    Returns
+    ----------
+    doc_bin: DocBin
+        DocBin object that can be used for training the spacy model
+    """
+    doc_bin = DocBin()
+    for labelled_file in files:
+
+        with open(labelled_file, 'r') as fin:
+            task = json.load(fin)
-            doc = nlp.make_doc(text)
+        if labelled_file.endswith(".txt"):
+            text = task['task']['data']["text"]
+        else:
+            text = task['data']["text"]
+
+        entities = []
+        doc = nlp.make_doc(text)
-            for label in article_data['result']:
-                start = label['value']['start']
-                end = label['value']['end']
-                ent = label['value']['labels'][0]
-                span = doc.char_span(start, end, label=ent)
-                if span is not None:
-                    entities.append(span)
-
-            doc.ents = entities
-            doc_bin.add(doc)
-
-        return doc_bin
-
-    train_doc_bin = get_doc(train_files)
-    train_doc_bin.to_disk(os.path.join(data_path, "train.spacy"))
-
-    val_doc_bin = get_doc(val_files)
-    val_doc_bin.to_disk(os.path.join(data_path, "val.spacy"))
-
-    # TODO: Else If the data_path consists of parquet files, load JSON files from all parquet files in the directory
+        for label in task['result']:
+            start = label['value']['start']
+            end = label['value']['end']
+            ent = label['value']['labels'][0]
+            span = doc.char_span(start, end, label=ent)
+            if span is not None:
+                entities.append(span)
+
+        doc.ents = entities
+        doc_bin.add(doc)
+    return doc_bin
 
 if __name__ == "__main__":
     opt = docopt(__doc__)
     assert os.path.exists(opt['--data_path']), \
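As an end-to-end illustration of the spaCy preprocessing path above, the sketch below builds a DocBin from one labelled record the way the refactored get_doc does. It is a minimal example that assumes spaCy is installed; the record contents and the "TAXA" label are placeholders, and only the key names and spaCy calls mirror the patched code.

# Minimal sketch; record values and the "TAXA" label are illustrative placeholders.
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")

# One record in the .json layout written for reviewed parquet rows.
record = {
    "data": {"text": "Pinus pollen was found."},
    "result": [{"value": {"start": 0, "end": 5, "labels": ["TAXA"]}}],
}

doc = nlp.make_doc(record["data"]["text"])
entities = []
for label in record["result"]:
    span = doc.char_span(
        label["value"]["start"],
        label["value"]["end"],
        label=label["value"]["labels"][0],
    )
    if span is not None:  # skip annotations that do not align with token boundaries
        entities.append(span)
doc.ents = entities

doc_bin = DocBin()
doc_bin.add(doc)
doc_bin.to_disk("train.spacy")  # same artifact name the spacy train command points to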