Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tests for preprocessing scripts #90

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions src/entity_extraction/training/spacy_ner/spacy_evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,6 @@
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

opt = docopt(__doc__)


def get_spacy_token_labels(labelled_entities, raw_text):
"""
Returns a list of labels per token in the raw text from spacy generated labels.
Expand Down Expand Up @@ -93,7 +90,7 @@
return split_text, token_labels


def load_ner_model_pipeline(model_path: str, gpu: bool = False):
def load_ner_model_pipeline(model_path: str, gpu: str = "False"):
"""
Loads a spacy named entity recognition model.

Expand Down Expand Up @@ -206,6 +203,7 @@


def main():
opt = docopt(__doc__)

Check warning on line 206 in src/entity_extraction/training/spacy_ner/spacy_evaluate.py

View check run for this annotation

Codecov / codecov/patch

src/entity_extraction/training/spacy_ner/spacy_evaluate.py#L206

Added line #L206 was not covered by tests
# load the model
model = load_ner_model_pipeline(opt["--model_path"], opt["--gpu"])
all_predicted_labels = []
Expand Down
46 changes: 46 additions & 0 deletions tests/entity_extraction/test_spacy_entity_extraction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Author: Jenit Jain
# Date: June 28, 2023

import os
import sys
import spacy
import pytest

# ensure that the parent directory is on the path for relative imports
sys.path.append(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir))

from src.entity_extraction.spacy_entity_extraction import spacy_extract_all

@pytest.fixture
def load_empty_model():
    """Provide a blank English spaCy pipeline with no trained components."""
    blank_nlp = spacy.blank("en")
    return blank_nlp

@pytest.fixture
def load_null_model():
    """Provide ``None`` in place of a model to exercise the fallback path."""
    return None

def test_spacy_extract_all(load_empty_model):
    """A blank pipeline has no NER component, so nothing is extracted."""
    sample_text = "Sample text with a site Lake Garibaldi"

    extracted = spacy_extract_all(sample_text, load_empty_model)

    assert isinstance(extracted, list)
    assert not extracted

def test_spacy_extract_all_with_null_model(load_null_model):
    """With no model supplied, spacy_extract_all falls back to a default model.

    If that default model is not installed, an empty list is expected;
    otherwise the SITE entity in the sample text should be found.
    """
    text = "Sample text with a site Lake Garibaldi"

    entities = spacy_extract_all(text, load_null_model)

    assert isinstance(entities, list)

    # The original wrapped the asserts in a bare try/except, which silently
    # swallowed AssertionError from the first branch (and any other error).
    # Branch on the actual result instead so each path is checked explicitly.
    if entities:
        assert entities[0]["start"] == 24
        assert entities[0]["end"] == 38
        assert entities[0]["labels"] == ["SITE"]
        assert entities[0]["text"] == "Lake Garibaldi"
    else:
        # Default model was not installed, so nothing could be extracted.
        assert entities == []
97 changes: 97 additions & 0 deletions tests/entity_extraction/test_spacy_evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import os
import sys
import pytest
import logging
import spacy
import pandas as pd

logger = logging.getLogger(__name__)

# ensure that the parent directory is on the path for relative imports
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))

from src.entity_extraction.training.spacy_ner.spacy_evaluate import (
get_spacy_token_labels,
load_evaluation_data,
load_ner_model_pipeline,
get_labels
)


@pytest.fixture
def example_token_labels():
    """Return one labelled entity span plus the raw text it was taken from."""
    raw_text = "Sample text with a site Lake Garibaldi"
    entity = {
        "start": 24,
        "end": 38,
        "entity_group": "SITE",
        "entity_text": "Lake Garibaldi",
    }
    return entity, raw_text


@pytest.fixture
def data_file_path():
    """Path to a sample labelled-data file shipped alongside the tests."""
    test_dir = os.path.dirname(__file__)
    return os.path.join(test_dir, "test_labelling_data_split", "test-label-1.txt")


@pytest.fixture
def sample_ner_model():
    """Return a blank pipeline and one labelled example in label-studio form."""
    text = "Sample text with a site Lake Garibaldi"
    span = {
        "start": 24,
        "end": 38,
        "labels": ["PERSON"],
        "text": "Lake Garibaldi",
    }
    data = {
        "task": {"data": {"text": text}},
        "result": [{"value": span}],
    }
    return spacy.blank("en"), data

def test_get_spacy_token_labels(example_token_labels):
    """Tokens inside the labelled span get B-/I- tags; all others get O."""
    labelled_entities, raw_text = example_token_labels

    expected_split_text = ["Sample", "text", "with", "a", "site", "Lake", "Garibaldi"]
    expected_token_labels = ["O", "O", "O", "O", "O", "B-SITE", "I-SITE"]

    # The original `assert a, b == f(...)` only asserted the truthiness of
    # `expected_split_text`, using the comparison as the assert *message*,
    # so it could never fail.  Compare both returned values explicitly.
    split_text, token_labels = get_spacy_token_labels(labelled_entities, raw_text)
    assert split_text == expected_split_text
    assert token_labels == expected_token_labels


def test_load_evaluation_data(data_file_path):
    """Loading a labelled-data file yields a dictionary."""
    loaded = load_evaluation_data(data_file_path)

    assert isinstance(loaded, dict)


def test_load_ner_model_pipeline():
    """A nonexistent model name must raise OSError from spaCy's loader."""
    with pytest.raises(OSError):
        load_ner_model_pipeline("random_model")

def test_get_labels(sample_ner_model):
    """An untrained model predicts all O; tagged labels follow the annotation."""
    ner_model, data = sample_ner_model

    predicted_labels, tagged_labels = get_labels(ner_model, data)

    assert predicted_labels == ["O"] * 7
    assert tagged_labels == ["O", "O", "O", "O", "O", "B-PERSON", "I-PERSON"]
79 changes: 79 additions & 0 deletions tests/preprocessing/test_labelling_preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Author: Jenit Jain
# Date: June 23 2023

import os
import sys

import pytest
import pandas as pd
from collections import namedtuple

# ensure that the parent directory is on the path for relative imports
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))

from src.preprocessing.labelling_preprocessing import (
clean_words,
return_json,
chunk_text
)


@pytest.fixture
def sample_words():
    """Tokenised sentence with the final punctuation as its own token."""
    return ["This", "is", "a", "test", "sentence", "."]

@pytest.fixture
def json_args():
    """Keyword arguments for return_json describing one labelled text chunk."""
    return {
        "nlp": None,
        "chunk": "This is sample text with a site Lake Garibaldi and taxa Pinus",
        "chunk_local": 1,
        "chunk_global": 1,
        "chunk_subsection": "Sample Subsection",
        "gdd": "sample xDD ID",
        "doi": "Sample DOI",
        "article_hash_code": "3567c4fb5ecd02be",
        "model_version": "v1",
    }


@pytest.fixture
def chunking_args():
    """CLI-style chunking options plus a short text to be chunked."""
    options = {"--char_len": "4000", "--min_len": "500"}
    sample_text = "This is a test chunk text"
    return options, sample_text


def test_clean_words(sample_words):
    """A standalone punctuation token is merged onto the preceding word."""
    cleaned = clean_words(sample_words)

    assert cleaned == ["This", "is", "a", "test", "sentence."]


def test_return_json(json_args):
    """return_json produces a label-studio-style payload with a result list."""
    labeled_json = return_json(**json_args)

    # Check the structure holding the entity annotations.
    assert isinstance(labeled_json["predictions"][0]["result"], list)

def test_chunk_text(chunking_args):
    """A text shorter than --char_len becomes a single chunk at local index 0,
    with its first word reported as the subsection."""
    args, text = chunking_args

    chunks, local_indices, subsections = chunk_text(args, text)

    assert chunks == ["This is a test chunk text."]
    assert local_indices == [0]
    assert subsections == ["This"]