from warnings import warn

import numpy as np
import pandas as pd

# Default kernel scale used when the caller does not supply ``variance``.
VARIANCE = 0.1


def _rbf_weights(scale, distances):
    """Map distances to weights with ``exp(-(scale * distance) ** 2)``.

    A distance of 0 yields a weight of exactly 1; the weight decays towards 0
    as ``scale * distance`` grows.
    """
    return np.exp(-((scale * distances) ** 2))


class Contextualization:
    """
    Use context information to calculate a weight for each content.

    Every content is tagged with a list of contexts. Given the contexts of an
    incoming message, each content receives a weight that depends on how close
    its contexts are to the message contexts according to `distance_matrix`.
    These weights can then be applied to the contents while scoring.

    Parameters
    ----------
    contents_dict : Dict[Hashable, List[str]]
        Dictionary of contents with the content ID as key and the list of
        contexts of that content as value.
    distance_matrix : pandas.DataFrame
        A square matrix with the contexts as both columns and index and the
        distance between each pair of contexts as values. Only the column
        labels are read to build the list of known contexts; rows are looked
        up by position, so the index is assumed to be in the same order as
        the columns.
    variance : float
        Scale of the radial basis function. Despite its name, it *multiplies*
        the distance inside the kernel (``exp(-(variance * distance) ** 2)``),
        so larger values make the weights fall off faster with distance.

    Attributes
    ----------
    contexts : list
        Known contexts, in the column order of `distance_matrix`.
    contents_id : list
        Content IDs, in the iteration order of `contents_dict` at
        construction time.
    binarizer : sklearn.preprocessing.MultiLabelBinarizer
        Binarizer used to encode the contexts of each content.
    variance : float
        Kernel scale, see above.
    """

    def __init__(self, contents_dict, distance_matrix, variance=VARIANCE):
        """Validate the inputs and precompute the content/context matrix."""
        # Imported here so that importing this module (e.g. to use
        # `get_ordered_distance_matrix`) does not itself require scikit-learn.
        from sklearn.preprocessing import MultiLabelBinarizer

        self.check_inputs(contents_dict, distance_matrix)
        self.contexts = list(distance_matrix.columns)
        # Snapshot the IDs: a live `dict.keys()` view would change if the
        # caller later mutates `contents_dict`, while `_context_matrix` below
        # is frozen at construction time.
        self.contents_id = list(contents_dict)
        self.binarizer = MultiLabelBinarizer(classes=self.contexts)
        self._context_matrix = self._get_context_matrix(list(contents_dict.values()))
        self._distance_matrix = distance_matrix.values

        self.variance = variance

    def check_inputs(self, contents_dict, distance_matrix):
        """
        Validate the constructor arguments.

        Parameters
        ----------
        contents_dict : Dict[Hashable, List[str]]
            Content ID to list of contexts.
        distance_matrix : pandas.DataFrame
            Candidate distance matrix.

        Raises
        ------
        AssertionError
            If `distance_matrix` is empty or is not square.
        ValueError
            If a content uses a context that is not a column of
            `distance_matrix`.

        Warns
        -----
        UserWarning
            If `contents_dict` is empty.
        """
        # Raised explicitly instead of via `assert` so the checks still run
        # under `python -O`; the exception type is kept for existing callers.
        if len(distance_matrix) == 0:
            raise AssertionError(
                "Empty dataframe, please provided a distance matrix as a dataframe"
            )
        shape = distance_matrix.shape
        if len(shape) != 2 or shape[0] != shape[1]:
            raise AssertionError("Distance matrix is not a square matrix")

        if len(contents_dict) < 1:
            warn("No faqs detected, No weight will be calculated.")
            return

        # Plain set arithmetic: no dtype coercion of the labels, and contents
        # with an empty context list are handled naturally.
        known = set(distance_matrix.columns)
        used = {
            context for contexts in contents_dict.values() for context in contexts
        }
        invalid = sorted(used - known, key=str)
        if invalid:
            raise ValueError(
                f"contexts {str(invalid)} cannot be found in 'distance_matrix'"
            )

    def _get_context_matrix(self, content_contexts):
        """Convert the context lists of the contents into a binary matrix.

        The binarizer is created with ``classes=self.contexts``, so the result
        has one row per content and one column per known context.
        """
        return self.binarizer.fit_transform(content_contexts)

    def _message_context_vector(self, message_context):
        """Convert the message contexts into their positions in `self.contexts`.

        Raises
        ------
        ValueError
            If `message_context` is empty or contains an unknown context.
        """
        if len(message_context) < 1:
            raise ValueError("Message context cannot be empty")

        invalid = [value for value in message_context if value not in self.contexts]
        if invalid:
            raise ValueError(f"Unknown contexts : {str(invalid)} ")

        return [self.contexts.index(value) for value in message_context]

    def get_context_weights(self, message_context):
        """
        Get the weight of each content given the contexts of a message.

        Parameters
        ----------
        message_context : List[str]
            Contexts of the message. Must be non-empty and only contain
            contexts known to the distance matrix.

        Returns
        -------
        content_weights : dict
            Maps each content ID to its weight. A content sharing a context
            with the message gets a weight of 1 when that context's
            self-distance is 0; a content without any context gets 0.

        Raises
        ------
        ValueError
            If `message_context` is empty or contains an unknown context.
        """
        message_vector = self._message_context_vector(message_context)

        # For every known context, the distance to the closest message context.
        nearest_distances = self._distance_matrix[message_vector].min(axis=0)
        kernel_weights = _rbf_weights(self.variance, nearest_distances)

        # Mask the kernel weights with the contexts of each content (one row
        # per content) and keep the best matching context of each content.
        weights = (kernel_weights * self._context_matrix).max(axis=1)
        return dict(zip(self.contents_id, weights))


def get_ordered_distance_matrix(context_list):
    """Create a distance matrix assuming adjacent contexts are 1 apart.

    The distance between two contexts is the absolute difference of their
    positions in `context_list`.

    Parameters
    ----------
    context_list : List[str]
        Ordered list of contexts.

    Returns
    -------
    pandas.DataFrame
        Square integer dataframe with `context_list` as both index and columns.
    """
    positions = np.arange(len(context_list))
    # Broadcasting a column against a row gives |i - j| for every pair.
    distances = np.abs(positions[:, None] - positions[None, :])
    return pd.DataFrame(
        distances, columns=context_list, index=context_list, dtype=int
    )
return_spell_corrected=False, weights=None, **kwargs + ): """ Scores contents and applies weighting if `self.weighting_method` is not None @@ -123,6 +125,9 @@ def score_contents(self, message, return_spell_corrected=False, **kwargs): ---------- message : str return_spell_corrected : bool, default=False + weights: List[str] or None + Weight of each FAQ, will override content_weights if added + kwargs : additional keyword arguments to pass. e.g. for StepwiseKeyedVectorsScorer, `return_tag_scores=True` will @@ -144,6 +149,11 @@ def score_contents(self, message, return_spell_corrected=False, **kwargs): "Contents have not been set. Set contents with " "`self.set_contents()`" ) + if weights is not None and self.content_weights is not None: + warn( + "`weights` parameter is provided. This will override the `content_weights` set during intialization. " + ) + message_tokens = self.tokenizer(message) message_vectors, spell_corrected = self.model_search(message_tokens) @@ -159,9 +169,15 @@ def score_contents(self, message, return_spell_corrected=False, **kwargs): result = self._score_contents(message_vectors, spell_corrected, **kwargs) if self.weighting_method is not None and self.content_weights is not None: + if weights: + content_weights = weights + else: + content_weights = self.content_weights + weighted_scores = self.weighting_method( - result["overall_scores"], self.content_weights, **self.weighting_kwargs + result["overall_scores"], content_weights, **self.weighting_kwargs ) + result["overall_scores"] = weighted_scores if return_spell_corrected: diff --git a/requirements.txt b/requirements.txt index feee708..549bd82 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,7 @@ nltk>=3.7 numpy>=1.22.2 -gensim@git+https://github.com/IDinsight/gensim@4.1.3#egg=gensim +gensim==4.3.0 POT==0.8.2 cyhunspell==2.0.2 +scikit-learn==1.1.2 +pandas==1.5.3 \ No newline at end of file diff --git a/requirements_dev.txt b/requirements_dev.txt index db51a86..2d9d149 100644 --- 
a/requirements_dev.txt +++ b/requirements_dev.txt @@ -5,7 +5,8 @@ pre-commit>=2.11.1 pytest>=6.2.2 coverage>=6.3.2 smart-open[s3]>=5.0.0 +sphinx==4.5.0 +sphinx-material==0.0.35 furo==2022.12.7 -scikit-learn==1.1.2 numpydoc==1.2 - +joblib==1.2.0 \ No newline at end of file diff --git a/tests/data/contextualization.yaml b/tests/data/contextualization.yaml new file mode 100644 index 0000000..418d1de --- /dev/null +++ b/tests/data/contextualization.yaml @@ -0,0 +1 @@ +contexts_list: ["design", "code", "test", "deploy", "maintain"] \ No newline at end of file diff --git a/tests/data/tag_test_data.yaml b/tests/data/tag_test_data.yaml index c5e8297..6bc5ec7 100644 --- a/tests/data/tag_test_data.yaml +++ b/tests/data/tag_test_data.yaml @@ -1,8 +1,9 @@ tags_refresh_data: - id: 1 - tags: ["rock", "guitar", "melody", "chord"] - title: "music" + tags: ["rock", "guitar", "melody", "chord"] content_to_send: "Where words fail, music speaks" + context: ["design","test"] + title: "music" example_questions: - "I can't describe in words how I feel." - "What do you think of music?" @@ -10,6 +11,7 @@ tags_refresh_data: - id: 2 tags: ["cheese", "tomato", "bread", "mustard"] content_to_send: "We finish each other's sandwiches" + context: ["code","test"] title: "sandwich" example_questions: - "What should we do with these sandwiches?" @@ -17,6 +19,7 @@ tags_refresh_data: - id: 3 tags: ["rock", "lake", "mountain", "sky"] content_to_send: "Let's do it. But outside." + context: ["deploy"] title: "nature" example_questions: - "should we go climbing?" 
"""Tests for `Contextualization` and `get_ordered_distance_matrix`."""
from functools import partial
from pathlib import Path

import numpy as np
import pandas as pd
import pytest
import yaml
from faqt import Contextualization
from faqt.model.faq_matching.contextualization import get_ordered_distance_matrix

# Directory holding the YAML fixtures shared by the test-suite.
DATA_DIR = Path(__file__).parents[1] / "data"

# (contents_dict, context list) pairs where every content context is known.
VALID_CONTENT_CASES = [
    (
        {
            0: ["word", "music"],
            1: ["beat", "album"],
            2: ["beat", "music"],
        },
        ["word", "beat", "music", "album"],
    ),
    (
        {
            0: ["jump", "run"],
            1: ["shoot", "score"],
            2: ["sprint", "jump", "shoot"],
            3: ["run", "sprint", "jump", "shoot", "score"],
        },
        ["run", "sprint", "jump", "shoot", "score"],
    ),
]

# Same shape as above, but one content uses a context missing from the list.
UNKNOWN_CONTENT_CASES = [
    (
        {
            0: ["word", "music"],
            1: ["beat", "album"],
            2: ["beat", "music", "single"],
        },
        ["word", "beat", "music", "album"],
    ),
    (
        {
            0: ["jump", "run"],
            1: ["shoot", "score"],
            2: ["sprint", "jump", "shoot", "danse"],
            3: ["run", "sprint", "jump", "shoot", "score"],
        },
        ["run", "sprint", "jump", "shoot", "score"],
    ),
]

# Valid message contexts for the default fixtures.
VALID_MESSAGE_CONTEXTS = [
    ["deploy", "maintain"],
    ["design", "deploy"],
]


def _load_yaml(file_name):
    """Load a YAML fixture from the test data directory.

    `safe_load` is enough here: the fixtures only contain plain scalars,
    lists and mappings.
    """
    with open(DATA_DIR / file_name, encoding="utf-8") as file:
        return yaml.safe_load(file)


@pytest.fixture(scope="module")
def default_content_dict():
    """Content ID -> contexts mapping built from `tag_test_data.yaml`."""
    tagsets_data = _load_yaml("tag_test_data.yaml")["tags_refresh_data"]
    return {tagset_data["id"]: tagset_data["context"] for tagset_data in tagsets_data}


@pytest.fixture(scope="module")
def default_distance_matrix():
    """Ordered distance matrix over the contexts of `contextualization.yaml`."""
    contexts = _load_yaml("contextualization.yaml")["contexts_list"]
    return get_ordered_distance_matrix(contexts)


@pytest.fixture(scope="module")
def default_contents_id():
    """Content IDs of `tag_test_data.yaml`, in file order."""
    tagsets_data = _load_yaml("tag_test_data.yaml")["tags_refresh_data"]
    return [tagset_data["id"] for tagset_data in tagsets_data]


class TestContextualization:
    """Input validation and output shape of `Contextualization`."""

    @pytest.mark.parametrize("contents_dict,context", VALID_CONTENT_CASES)
    def test_context_matrix_shape(self, contents_dict, context):
        distance_matrix = get_ordered_distance_matrix(context_list=context)
        contextualizer = Contextualization(
            contents_dict=contents_dict,
            distance_matrix=distance_matrix,
        )
        assert contextualizer._context_matrix.shape == (
            len(contents_dict),
            len(distance_matrix.columns),
        )

    def test_empty_distance_matrix_return_error(self, default_content_dict):
        distance_matrix = pd.DataFrame()
        with pytest.raises(AssertionError):
            Contextualization(
                contents_dict=default_content_dict,
                distance_matrix=distance_matrix,
            )

    @pytest.mark.parametrize("contents_dict,context", VALID_CONTENT_CASES)
    def test_context_matrix_only_have_0_and_1(self, contents_dict, context):
        distance_matrix = get_ordered_distance_matrix(context_list=context)
        contextualizer = Contextualization(
            contents_dict=contents_dict,
            distance_matrix=distance_matrix,
        )
        unique_values = set(contextualizer._context_matrix.flatten())
        assert unique_values <= {0, 1}

    @pytest.mark.parametrize("contents_dict,context", UNKNOWN_CONTENT_CASES)
    def test_unknown_content_context_return_error(self, contents_dict, context):
        distance_matrix = get_ordered_distance_matrix(context_list=context)
        with pytest.raises(ValueError):
            Contextualization(
                contents_dict=contents_dict,
                distance_matrix=distance_matrix,
            )

    @pytest.mark.filterwarnings("ignore::UserWarning")
    def test_empty_contents_return_empty_list(self, default_distance_matrix):
        message_context = ["code"]
        contextualizer = Contextualization(
            contents_dict=dict(), distance_matrix=default_distance_matrix
        )
        weights = contextualizer.get_context_weights(message_context)
        assert len(weights) == 0

    def test_empty_message_context_throws_error(
        self, default_content_dict, default_distance_matrix
    ):
        contextualizer = Contextualization(
            contents_dict=default_content_dict,
            distance_matrix=default_distance_matrix,
        )
        with pytest.raises(ValueError):
            contextualizer.get_context_weights([])

    @pytest.mark.parametrize(
        "message_context",
        [
            # "code " (trailing space) is deliberately not a known context.
            ["design", "code ", "appreciation"],
            ["design", "test", "deploy", "maintain", "musik"],
        ],
    )
    def test_unknown_message_context_returns_error(
        self,
        default_content_dict,
        default_distance_matrix,
        message_context,
    ):
        contextualizer = Contextualization(
            contents_dict=default_content_dict,
            distance_matrix=default_distance_matrix,
        )
        with pytest.raises(ValueError):
            contextualizer.get_context_weights(message_context)

    @pytest.mark.parametrize("message_context", VALID_MESSAGE_CONTEXTS)
    def test_length_weights_vector(
        self,
        default_content_dict,
        default_distance_matrix,
        message_context,
    ):
        contextualizer = Contextualization(
            contents_dict=default_content_dict,
            distance_matrix=default_distance_matrix,
        )
        weights = contextualizer.get_context_weights(message_context)
        assert len(weights.values()) == len(default_content_dict)

    @pytest.mark.parametrize("message_context", VALID_MESSAGE_CONTEXTS)
    def test_weights_are_int_or_float(
        self,
        default_content_dict,
        default_distance_matrix,
        message_context,
    ):
        contextualizer = Contextualization(
            contents_dict=default_content_dict,
            distance_matrix=default_distance_matrix,
        )
        weights = contextualizer.get_context_weights(message_context)
        assert np.array(list(weights.values())).dtype in (float, int)

    @pytest.mark.parametrize(
        "context_list",
        [
            ["morning", "night"],
            ["breakfast", "lunch", "supper", "dinner"],
        ],
    )
    def test_distance_matrix_is_square_matrix(self, context_list):
        distance_matrix = get_ordered_distance_matrix(context_list=context_list)
        size = len(context_list)
        assert distance_matrix.shape == (size, size)


class TestContextualizationAlgorithm:
    """Behaviour of the weights for known message contexts."""

    @pytest.fixture(scope="class")
    def contextualizer(self, default_distance_matrix, default_content_dict):
        return Contextualization(
            contents_dict=default_content_dict, distance_matrix=default_distance_matrix
        )

    @pytest.fixture(scope="class")
    def contextualizer_1(self, default_distance_matrix, default_content_dict):
        return Contextualization(
            contents_dict=default_content_dict,
            distance_matrix=default_distance_matrix,
            variance=1,
        )

    @pytest.fixture(scope="class")
    def contextualizer_10(self, default_distance_matrix, default_content_dict):
        return Contextualization(
            contents_dict=default_content_dict,
            distance_matrix=default_distance_matrix,
            variance=10,
        )

    @pytest.mark.parametrize(
        "message_context,expected_key_max,expected_key_min",
        [(["code", "test"], [1, 2, 4], 3), (["code", "maintain"], [2, 4, 6], 1)],
    )
    def test_two_context_in_message(
        self, contextualizer, message_context, expected_key_max, expected_key_min
    ):
        weights = contextualizer.get_context_weights(message_context)
        # `max` returns the first key reaching the maximum weight.
        key_max = max(weights, key=weights.get)
        assert key_max == expected_key_max[0]
        assert weights[expected_key_min] < weights[expected_key_max[0]]
        # Every expected key must tie for the maximum, whatever their number
        # (the previous check indexed positions 1 and 2 unconditionally).
        top_weight = weights[key_max]
        assert all(weights[key] == top_weight for key in expected_key_max)

    @pytest.mark.parametrize(
        "message_context,expected_key_min",
        [(["test"], 3), (["maintain"], 3), (["deploy"], 1)],
    )
    def test_contextualisation_variance(
        self,
        contextualizer,
        contextualizer_1,
        contextualizer_10,
        message_context,
        expected_key_min,
    ):
        weights_0_1 = contextualizer.get_context_weights(message_context)
        weights_1 = contextualizer_1.get_context_weights(message_context)
        weights_10 = contextualizer_10.get_context_weights(message_context)
        # The best content does not depend on the kernel scale ...
        assert (
            max(weights_0_1, key=weights_0_1.get)
            == max(weights_1, key=weights_1.get)
            == max(weights_10, key=weights_10.get)
        )
        # ... but a larger scale penalises distant contents more strongly.
        assert (
            weights_10[expected_key_min]
            < weights_1[expected_key_min]
            < weights_0_1[expected_key_min]
        )