Implement contextualisation in Faqt (#24)

* Implement contextualisation in faqt * Add contextualisation tests * Add test cases for coverage * Minor changes to use contextualization instead of contextualisation * Add content id to return dictionary rather than list * Add content id to return dictionary rather than list * Add content id to return dictionary rather than list * Add contents as dictionary * Add ordered contexts * Add ordered contexts * Add ordered contexts * Add test cases for algorithm * Add test cases for algorithm * Add test cases for algorithm
IDinsight · Feb 21, 2023 · 699ea74 · 699ea74
1 parent e0899ba
commit 699ea74
Show file tree

Hide file tree

Showing 10 changed files with 469 additions and 9 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,8 +1,8 @@
 # data folder
 data/
 !tests/data
-!tests/data/*.yaml
 tests/data/*
+!tests/data/*.yaml
 # google new binary
 GoogleNews-vectors-negative300-prenorm.bin
 

diff --git a/faqt/__init__.py b/faqt/__init__.py
@@ -6,6 +6,7 @@
     QuestionAnswerBERTScorer,
     StepwiseKeyedVectorsScorer,
     WMDScorer,
+    Contextualization,
 )
 from .preprocessing import (
     preprocess_text_for_keyword_rule,

diff --git a/faqt/model/__init__.py b/faqt/model/__init__.py
@@ -2,6 +2,7 @@
     StepwiseKeyedVectorsScorer,
     WMDScorer,
 )
+from faqt.model.faq_matching.contextualization import Contextualization
 from faqt.model.faq_matching.bert import QuestionAnswerBERTScorer
 
 from faqt.model.urgency_detection.urgency_detection_base import KeywordRule, RuleBasedUD
diff --git a/faqt/model/faq_matching/contextualization.py b/faqt/model/faq_matching/contextualization.py
@@ -0,0 +1,125 @@
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import MultiLabelBinarizer
+from warnings import warn
+
+# Default value of the `variance` argument of `Contextualization`.
+VARIANCE = 0.1
+
+
+class Contextualization:
+    """
+    Compute per-content weights from context information.
+
+    Every content is tagged with one or more contexts. Given the contexts of
+    an incoming message, a weight is derived for each content from the
+    distance between the message contexts and that content's contexts. The
+    weights can then be attributed to each content while scoring.
+
+    Parameters
+    ----------
+    contents_dict : Dict[str, List[str]]
+        Dictionary of contents with the content ID as key and the list of
+        contexts of that content as value. Every context must be a column of
+        `distance_matrix`.
+    distance_matrix : pandas.DataFrame
+        A square matrix in the form of a pandas dataframe with the contexts
+        list as both columns and index, and the distance between each pair of
+        contexts as values. Rows are looked up by position, so the row order
+        is expected to match the column order.
+    variance : float
+        Scale parameter of the kernel. The weight given to a distance ``d``
+        is ``exp(-(variance * d) ** 2)``, so a larger value makes the weight
+        decay faster with distance.
+
+    """
+
+    def __init__(self, contents_dict, distance_matrix, variance=VARIANCE):
+        """Validate the inputs and precompute the content/context matrix."""
+
+        self.check_inputs(contents_dict, distance_matrix)
+        self.contexts = list(distance_matrix.columns)
+        # Snapshot the IDs: `_context_matrix` below is computed once, so a
+        # live `dict.keys()` view could get out of step with its rows if the
+        # caller mutates `contents_dict` afterwards.
+        self.contents_id = list(contents_dict.keys())
+        self.binarizer = MultiLabelBinarizer(classes=self.contexts)
+        self._context_matrix = self._get_context_matrix(list(contents_dict.values()))
+        self._distance_matrix = distance_matrix.values
+
+        self.variance = variance
+
+    def check_inputs(self, contents_dict, distance_matrix):
+        """
+        Validate the constructor arguments.
+
+        Parameters
+        ----------
+        contents_dict : Dict[str, List[str]]
+            Content ID to list of contexts.
+        distance_matrix : pandas.DataFrame
+            Candidate distance matrix.
+
+        Raises
+        ------
+        AssertionError
+            If `distance_matrix` is empty or is not a square matrix.
+        ValueError
+            If a content uses a context that is not a column of
+            `distance_matrix`.
+
+        Warns
+        -----
+        UserWarning
+            If `contents_dict` is empty.
+        """
+        # Raised explicitly instead of through `assert` so that the checks
+        # are still enforced when Python runs with -O; the exception type is
+        # kept for callers that already catch AssertionError.
+        if not len(distance_matrix) > 0:
+            raise AssertionError(
+                "Empty dataframe, please provided a distance matrix as a dataframe"
+            )
+        if not (
+            len(distance_matrix.shape) == 2
+            and distance_matrix.shape[0] == distance_matrix.shape[1]
+        ):
+            raise AssertionError("Distance matrix is not a square matrix")
+
+        if len(contents_dict) < 1:
+            warn("No faqs detected, No weight will be calculated.")
+            return
+
+        flattened_contexts = np.hstack(list(contents_dict.values()))
+        unique_values = np.unique(flattened_contexts)
+        invalid = np.setdiff1d(unique_values, distance_matrix.columns)
+        if len(invalid) > 0:
+            raise ValueError(
+                f"contexts {str(invalid)} cannot be found in 'distance_matrix'"
+            )
+
+    def _get_context_matrix(self, content_contexts):
+        """Convert contexts provided as list of strings into a binary vector representation"""
+        return self.binarizer.fit_transform(content_contexts)
+
+    def _message_context_vector(self, message_context):
+        """
+        Convert the message contexts into their positions in `self.contexts`.
+
+        Raises
+        ------
+        ValueError
+            If `message_context` is empty or contains an unknown context.
+        """
+
+        if len(message_context) < 1:
+            raise ValueError("Message context cannot be empty")
+
+        invalid = [value for value in message_context if value not in self.contexts]
+        if invalid:
+            raise ValueError(f"Unknown contexts : {str(invalid)} ")
+
+        return [self.contexts.index(value) for value in message_context]
+
+    def get_context_weights(self, message_context):
+        """
+        Get context weights from the message contexts.
+
+        Parameters
+        ----------
+        message_context : List[str]
+            List of contexts of the message. Must be non-empty and every
+            entry must be a known context.
+
+        Returns
+        -------
+        content_weights : dict
+            Dictionary with the content ID as key and the weight of that
+            content as value.
+
+        Raises
+        ------
+        ValueError
+            If `message_context` is empty or contains an unknown context.
+        """
+
+        message_vector = self._message_context_vector(message_context)
+
+        # For every known context, the distance to the closest message context.
+        distance_vectors = self._distance_matrix[message_vector].min(axis=0)
+
+        rbf_weights = np.exp(-((self.variance * distance_vectors) ** 2))
+        # The weight of a content is the largest kernel value among the
+        # contexts it is tagged with.
+        weights = (rbf_weights * self._context_matrix).max(axis=1)
+        return dict(zip(self.contents_id, weights))
+
+
+def get_ordered_distance_matrix(context_list):
+    """
+    Build a distance matrix for an ordered list of contexts.
+
+    Adjacent contexts are assumed to be at distance 1 from each other, so the
+    distance between the contexts at positions ``i`` and ``j`` is ``|i - j|``.
+
+    Parameters
+    ----------
+    context_list : List[str]
+        Contexts, in order.
+
+    Returns
+    -------
+    pandas.DataFrame
+        Square integer dataframe with `context_list` as both index and columns.
+    """
+    positions = np.arange(len(context_list))
+    # Broadcast a column of positions against a row of positions to get |i - j|.
+    gaps = np.abs(positions[:, np.newaxis] - positions[np.newaxis, :])
+    return pd.DataFrame(gaps, index=context_list, columns=context_list, dtype=int)
diff --git a/faqt/model/faq_matching/keyed_vectors_scoring.py b/faqt/model/faq_matching/keyed_vectors_scoring.py
@@ -114,7 +114,9 @@ def set_contents(self, contents, weights=None):
         optionally saves word-vectors to `self.content_vectors`."""
         raise NotImplementedError
 
-    def score_contents(self, message, return_spell_corrected=False, **kwargs):
+    def score_contents(
+        self, message, return_spell_corrected=False, weights=None, **kwargs
+    ):
         """
         Scores contents and applies weighting if `self.weighting_method` is
         not None
@@ -123,6 +125,9 @@ def score_contents(self, message, return_spell_corrected=False, **kwargs):
         ----------
         message : str
         return_spell_corrected : bool, default=False
+        weights: List[str] or None
+            Weight of each FAQ, will override content_weights if added
+
         kwargs :
             additional keyword arguments to pass.
             e.g. for StepwiseKeyedVectorsScorer, `return_tag_scores=True` will
@@ -144,6 +149,11 @@ def score_contents(self, message, return_spell_corrected=False, **kwargs):
                 "Contents have not been set. Set contents with " "`self.set_contents()`"
             )
 
+        if weights is not None and self.content_weights is not None:
+            warn(
+                "`weights` parameter is provided. This will override the `content_weights` set during intialization. "
+            )
+
         message_tokens = self.tokenizer(message)
         message_vectors, spell_corrected = self.model_search(message_tokens)
 
@@ -159,9 +169,15 @@ def score_contents(self, message, return_spell_corrected=False, **kwargs):
         result = self._score_contents(message_vectors, spell_corrected, **kwargs)
 
         if self.weighting_method is not None and self.content_weights is not None:
+            if weights:
+                content_weights = weights
+            else:
+                content_weights = self.content_weights
+
             weighted_scores = self.weighting_method(
-                result["overall_scores"], self.content_weights, **self.weighting_kwargs
+                result["overall_scores"], content_weights, **self.weighting_kwargs
             )
+
             result["overall_scores"] = weighted_scores
 
         if return_spell_corrected:

diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,7 @@
 nltk>=3.7
 numpy>=1.22.2
-gensim@git+https://github.com/IDinsight/gensim@4.1.3#egg=gensim
+gensim==4.3.0
 POT==0.8.2
 cyhunspell==2.0.2
+scikit-learn==1.1.2
+pandas==1.5.3
diff --git a/requirements_dev.txt b/requirements_dev.txt
@@ -5,7 +5,8 @@ pre-commit>=2.11.1
 pytest>=6.2.2
 coverage>=6.3.2
 smart-open[s3]>=5.0.0
+sphinx==4.5.0
+sphinx-material==0.0.35
 furo==2022.12.7
-scikit-learn==1.1.2
 numpydoc==1.2
-
+joblib==1.2.0
diff --git a/tests/data/contextualization.yaml b/tests/data/contextualization.yaml
@@ -0,0 +1 @@
+contexts_list: ["design", "code", "test", "deploy", "maintain"]
diff --git a/tests/data/tag_test_data.yaml b/tests/data/tag_test_data.yaml
@@ -1,22 +1,25 @@
 tags_refresh_data:
   - id: 1
-    tags: ["rock", "guitar", "melody", "chord"]
-    title: "music"
+    tags: ["rock", "guitar", "melody", "chord"] 
     content_to_send: "Where words fail, music speaks"
+    context: ["design","test"]
+    title: "music"
     example_questions:
       - "I can't describe in words how I feel."
       - "What do you think of music?"
     weight: 1
   - id: 2
     tags: ["cheese", "tomato", "bread", "mustard"]
     content_to_send: "We finish each other's sandwiches"
+    context: ["code","test"]
     title: "sandwich"
     example_questions:
       - "What should we do with these sandwiches?"
     weight: 3
   - id: 3
     tags: ["rock", "lake", "mountain", "sky"]
     content_to_send: "Let's do it. But outside."
+    context: ["deploy"]
     title: "nature"
     example_questions:
       - "should we go climbing?"
@@ -25,6 +28,7 @@ tags_refresh_data:
   - id: 4
     tags: ["trace", "vector", "length", "angle"]
     content_to_send: "Whenever you can, count"
+    context: ["code","maintain"]
     title: "math"
     example_questions:
       - "How can I know the cardinality of everything"
@@ -34,15 +38,17 @@ tags_refresh_data:
     content_to_send: >
       When I feel like exercising,
       I just lie down until the feeling goes away
+    context: ["design","maintain"]
     title: "hobby"
     example_questions:
       - "Do you like to exercise?"
     weight: 1
-  - id: 5
+  - id: 6
     tags: ["digest", "eat", "chew", "expel"]
     content_to_send: >
       Nothing would be more tiresome than eating and drinking
       if God had not made them a pleasure as well as a necessity.
+    context: ["test","code","maintain"]
     title: "life"
     example_questions:
       - "Do you like to eat and drink"
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		contexts_list: ["design", "code", "test", "deploy", "maintain"]