Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tests for preprocessing scripts #90

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions src/entity_extraction/training/spacy_ner/spacy_evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,6 @@
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

opt = docopt(__doc__)


def get_spacy_token_labels(labelled_entities, raw_text):
"""
Returns a list of labels per token in the raw text from spacy generated labels.
Expand Down Expand Up @@ -93,7 +90,7 @@
return split_text, token_labels


def load_ner_model_pipeline(model_path: str, gpu: bool = False):
def load_ner_model_pipeline(model_path: str, gpu: str = "False"):
"""
Loads a spacy named entity recognition model.

Expand Down Expand Up @@ -206,6 +203,7 @@


def main():
opt = docopt(__doc__)

Check warning on line 206 in src/entity_extraction/training/spacy_ner/spacy_evaluate.py

View check run for this annotation

Codecov / codecov/patch

src/entity_extraction/training/spacy_ner/spacy_evaluate.py#L206

Added line #L206 was not covered by tests
# load the model
model = load_ner_model_pipeline(opt["--model_path"], opt["--gpu"])
all_predicted_labels = []
Expand Down
46 changes: 46 additions & 0 deletions tests/entity_extraction/test_spacy_entity_extraction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Author: Jenit Jain
# Date: June 28, 2023

import os
import sys
import spacy
import pytest

# ensure that the parent directory is on the path for relative imports
sys.path.append(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir))

from src.entity_extraction.spacy_entity_extraction import spacy_extract_all

@pytest.fixture
def load_empty_model():
    """Provide a blank English spaCy pipeline with no trained components."""
    blank_nlp = spacy.blank("en")
    return blank_nlp

@pytest.fixture
def load_null_model():
    """Provide ``None`` in place of a model to exercise the fallback path."""
    return None

def test_spacy_extract_all(load_empty_model):
    """A blank pipeline has no NER component, so nothing is extracted."""
    sample_text = "Sample text with a site Lake Garibaldi"

    extracted = spacy_extract_all(sample_text, load_empty_model)

    assert isinstance(extracted, list)
    assert not extracted

def test_spacy_extract_all_with_null_model(load_null_model):
    """With no model supplied, spacy_extract_all falls back to a default model.

    If that default model is not installed, an empty list is expected;
    otherwise the SITE entity in the sample text should be found.
    """
    text = "Sample text with a site Lake Garibaldi"

    entities = spacy_extract_all(text, load_null_model)

    assert isinstance(entities, list)

    # The original wrapped the asserts in a bare try/except, which silently
    # swallowed AssertionError from the first branch (and any other error).
    # Branch on the actual result instead so each path is checked explicitly.
    if entities:
        assert entities[0]["start"] == 24
        assert entities[0]["end"] == 38
        assert entities[0]["labels"] == ["SITE"]
        assert entities[0]["text"] == "Lake Garibaldi"
    else:
        # Default model was not installed, so nothing could be extracted.
        assert entities == []
97 changes: 97 additions & 0 deletions tests/entity_extraction/test_spacy_evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import os
import sys
import pytest
import logging
import spacy
import pandas as pd

logger = logging.getLogger(__name__)

# ensure that the parent directory is on the path for relative imports
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))

from src.entity_extraction.training.spacy_ner.spacy_evaluate import (
get_spacy_token_labels,
load_evaluation_data,
load_ner_model_pipeline,
get_labels
)


@pytest.fixture
def example_token_labels():
    """Return one labelled entity span plus the raw text it was taken from."""
    raw_text = "Sample text with a site Lake Garibaldi"
    entity = {
        "start": 24,
        "end": 38,
        "entity_group": "SITE",
        "entity_text": "Lake Garibaldi",
    }
    return entity, raw_text


@pytest.fixture
def data_file_path():
    """Path to a sample labelled-data file shipped alongside the tests."""
    test_dir = os.path.dirname(__file__)
    return os.path.join(test_dir, "test_labelling_data_split", "test-label-1.txt")


@pytest.fixture
def sample_ner_model():
    """Return a blank pipeline and one labelled example in label-studio form."""
    text = "Sample text with a site Lake Garibaldi"
    span = {
        "start": 24,
        "end": 38,
        "labels": ["PERSON"],
        "text": "Lake Garibaldi",
    }
    data = {
        "task": {"data": {"text": text}},
        "result": [{"value": span}],
    }
    return spacy.blank("en"), data

def test_get_spacy_token_labels(example_token_labels):
    """Tokens inside the labelled span get B-/I- tags; all others get O."""
    labelled_entities, raw_text = example_token_labels

    expected_split_text = ["Sample", "text", "with", "a", "site", "Lake", "Garibaldi"]
    expected_token_labels = ["O", "O", "O", "O", "O", "B-SITE", "I-SITE"]

    # The original `assert a, b == f(...)` only asserted the truthiness of
    # `expected_split_text`, using the comparison as the assert *message*,
    # so it could never fail.  Compare both returned values explicitly.
    split_text, token_labels = get_spacy_token_labels(labelled_entities, raw_text)
    assert split_text == expected_split_text
    assert token_labels == expected_token_labels


def test_load_evaluation_data(data_file_path):
    """Loading a labelled-data file yields a dictionary."""
    loaded = load_evaluation_data(data_file_path)

    assert isinstance(loaded, dict)


def test_load_ner_model_pipeline():
    """A nonexistent model name must raise OSError from spaCy's loader."""
    with pytest.raises(OSError):
        load_ner_model_pipeline("random_model")

def test_get_labels(sample_ner_model):
    """An untrained model predicts all O; tagged labels follow the annotation."""
    ner_model, data = sample_ner_model

    predicted_labels, tagged_labels = get_labels(ner_model, data)

    assert predicted_labels == ["O"] * 7
    assert tagged_labels == ["O", "O", "O", "O", "O", "B-PERSON", "I-PERSON"]
79 changes: 79 additions & 0 deletions tests/preprocessing/test_labelling_preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Author: Jenit Jain
# Date: June 23 2023

import os
import sys

import pytest
import pandas as pd
from collections import namedtuple

# ensure that the parent directory is on the path for relative imports
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))

from src.preprocessing.labelling_preprocessing import (
clean_words,
return_json,
chunk_text
)


@pytest.fixture
def sample_words():
    """Tokenised sentence with the final punctuation as its own token."""
    return ["This", "is", "a", "test", "sentence", "."]

@pytest.fixture
def json_args():
    """Keyword arguments for return_json describing one labelled text chunk."""
    return {
        "nlp": None,
        "chunk": "This is sample text with a site Lake Garibaldi and taxa Pinus",
        "chunk_local": 1,
        "chunk_global": 1,
        "chunk_subsection": "Sample Subsection",
        "gdd": "sample xDD ID",
        "doi": "Sample DOI",
        "article_hash_code": "3567c4fb5ecd02be",
        "model_version": "v1",
    }


@pytest.fixture
def chunking_args():
    """CLI-style chunking options plus a short text to be chunked."""
    options = {"--char_len": "4000", "--min_len": "500"}
    sample_text = "This is a test chunk text"
    return options, sample_text


def test_clean_words(sample_words):
    """A standalone punctuation token is merged onto the preceding word."""
    cleaned = clean_words(sample_words)

    assert cleaned == ["This", "is", "a", "test", "sentence."]


def test_return_json(json_args):
    """return_json produces a label-studio-style payload with a result list."""
    labeled_json = return_json(**json_args)

    # Check the structure holding the entity annotations.
    assert isinstance(labeled_json["predictions"][0]["result"], list)

def test_chunk_text(chunking_args):
    """A text shorter than --char_len becomes a single chunk at local index 0,
    with its first word reported as the subsection."""
    args, text = chunking_args

    chunks, local_indices, subsections = chunk_text(args, text)

    assert chunks == ["This is a test chunk text."]
    assert local_indices == [0]
    assert subsections == ["This"]