diff --git a/src/entity_extraction/training/spacy_ner/spacy_evaluate.py b/src/entity_extraction/training/spacy_ner/spacy_evaluate.py index 8935fd2..537a757 100644 --- a/src/entity_extraction/training/spacy_ner/spacy_evaluate.py +++ b/src/entity_extraction/training/spacy_ner/spacy_evaluate.py @@ -38,9 +38,6 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) -opt = docopt(__doc__) - - def get_spacy_token_labels(labelled_entities, raw_text): """ Returns a list of labels per token in the raw text from spacy generated labels. @@ -93,7 +90,7 @@ def get_spacy_token_labels(labelled_entities, raw_text): return split_text, token_labels -def load_ner_model_pipeline(model_path: str, gpu: bool = False): +def load_ner_model_pipeline(model_path: str, gpu: str = "False"): """ Loads a spacy named entity recognition model. @@ -206,6 +203,7 @@ def get_labels(ner_model, data): def main(): + opt = docopt(__doc__) # load the model model = load_ner_model_pipeline(opt["--model_path"], opt["--gpu"]) all_predicted_labels = [] diff --git a/tests/entity_extraction/test_spacy_entity_extraction.py b/tests/entity_extraction/test_spacy_entity_extraction.py new file mode 100644 index 0000000..503e037 --- /dev/null +++ b/tests/entity_extraction/test_spacy_entity_extraction.py @@ -0,0 +1,46 @@ +# Author: Jenit Jain +# Date: June 28, 2023 + +import os +import sys +import spacy +import pytest + +# ensure that the parent directory is on the path for relative imports +sys.path.append(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)) + +from src.entity_extraction.spacy_entity_extraction import spacy_extract_all + +@pytest.fixture +def load_empty_model(): + return spacy.blank("en") + +@pytest.fixture +def load_null_model(): + return None + +def test_spacy_extract_all(load_empty_model): + + text = "Sample text with a site Lake Garibaldi" + + entities = spacy_extract_all(text, load_empty_model) + + assert isinstance(entities, list) + assert len(entities) == 0 + +def 
test_spacy_extract_all_with_null_model(load_null_model): + + text = "Sample text with a site Lake Garibaldi" + + entities = spacy_extract_all(text, load_null_model) + + assert isinstance(entities, list) + + try: + # If the default model was not installed. + assert len(entities) == 0 + except AssertionError: + assert entities[0]["start"] == 24 + assert entities[0]["end"] == 38 + assert entities[0]["labels"] == ["SITE"] + assert entities[0]["text"] == "Lake Garibaldi" \ No newline at end of file diff --git a/tests/entity_extraction/test_spacy_evaluate.py b/tests/entity_extraction/test_spacy_evaluate.py new file mode 100644 index 0000000..7929bfb --- /dev/null +++ b/tests/entity_extraction/test_spacy_evaluate.py @@ -0,0 +1,97 @@ +import os +import sys +import pytest +import logging +import spacy +import pandas as pd + +logger = logging.getLogger(__name__) + +# ensure that the parent directory is on the path for relative imports +sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) + +from src.entity_extraction.training.spacy_ner.spacy_evaluate import ( + get_spacy_token_labels, + load_evaluation_data, + load_ner_model_pipeline, + get_labels +) + + +@pytest.fixture +def example_token_labels(): + labels = { + "start": 24, + "end": 38, + "entity_group": "SITE", + "entity_text": "Lake Garibaldi", + } + text = "Sample text with a site Lake Garibaldi" + + return labels, text + + +@pytest.fixture +def data_file_path(): + + return os.path.join( + os.path.dirname(__file__), + "test_labelling_data_split", + "test-label-1.txt" + ) + + +@pytest.fixture +def sample_ner_model(): + nlp = spacy.blank("en") + data = { + "task": { + "data": { + "text": "Sample text with a site Lake Garibaldi", + } + }, + "result": [ + { + "value": { + "start": 24, + "end": 38, + "labels": ["PERSON"], + "text": "Lake Garibaldi" + }, + } + ] + } + return nlp, data + +def test_get_spacy_token_labels(example_token_labels): + labelled_entities, raw_text = example_token_labels + + expected_split_text = 
["Sample", "text", "with", "a", "site", "Lake", "Garibaldi"] + expected_token_labels = ["O", "O", "O", "O", "O", "B-SITE", "I-SITE"] + + assert (expected_split_text, expected_token_labels) == get_spacy_token_labels(labelled_entities, raw_text) + + +def test_load_evaluation_data(data_file_path): + + data = load_evaluation_data(data_file_path) + + assert isinstance(data, dict) + + +def test_load_ner_model_pipeline(): + ner_model_name = "random_model" + + with pytest.raises(OSError): + ner_pipe = load_ner_model_pipeline(ner_model_name) + +def test_get_labels(sample_ner_model): + ner_model, data = sample_ner_model + + expected_predicted_labels = ['O', 'O', 'O', 'O', 'O', 'O', 'O'] + expected_tagged_labels = ['O', 'O', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON'] + + predicted_labels, tagged_labels = get_labels(ner_model, data) + + assert expected_predicted_labels == predicted_labels + assert expected_tagged_labels == tagged_labels diff --git a/tests/preprocessing/test_labelling_preprocessing.py b/tests/preprocessing/test_labelling_preprocessing.py new file mode 100644 index 0000000..f02e1c5 --- /dev/null +++ b/tests/preprocessing/test_labelling_preprocessing.py @@ -0,0 +1,79 @@ +# Author: Jenit Jain +# Date: June 23 2023 + +import os +import sys + +import pytest +import pandas as pd +from collections import namedtuple + +# ensure that the parent directory is on the path for relative imports +sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) + +from src.preprocessing.labelling_preprocessing import ( + clean_words, + return_json, + chunk_text +) + + +@pytest.fixture +def sample_words(): + words = ["This", "is", "a", "test", "sentence", "."] + return words + +@pytest.fixture +def json_args(): + args = { + 'nlp': None, + 'chunk': 'This is sample text with a site Lake Garibaldi and taxa Pinus', + 'chunk_local': 1, + 'chunk_global': 1, + 'chunk_subsection': 'Sample Subsection', + 'gdd': 'sample xDD ID', + 'doi': 'Sample DOI', + 'article_hash_code': 
'3567c4fb5ecd02be', + 'model_version': 'v1', + } + + return args + + +@pytest.fixture +def chunking_args(): + args = { + '--char_len': '4000', + '--min_len': '500', + } + text = "This is a test chunk text" + return args, text + + +def test_clean_words(sample_words): + words = sample_words + + expected_cleaned_words = ["This", "is", "a", "test", "sentence."] + + assert expected_cleaned_words == clean_words(words) + + +def test_return_json(json_args): + + labeled_json = return_json(**json_args) + + # Check entity text + assert isinstance(labeled_json['predictions'][0]['result'], list) + +def test_chunk_text(chunking_args): + args, text = chunking_args + + expected_chunks = [ + 'This is a test chunk text.' + ] + expected_local_index = [0] + expected_subsection = ["This"] + + assert chunk_text(args, text) == (expected_chunks, + expected_local_index, + expected_subsection) \ No newline at end of file