Skip to content

Commit

Permalink
Implement contextualisation in Faqt (#24)
Browse files Browse the repository at this point in the history
* Implement contextualisation in faqt

* Add contextualisation tests

* Add test cases for coverage

* Minor changes to use contextualization instead of contextualisation

* Add content id to return  dictionary rather than list

* Add content id to return  dictionary rather than list

* Add content id to return  dictionary rather than list

* Add contents as dictionary

* Add ordered contexts

* Add ordered contexts

* Add ordered contexts

* Add test cases for algorithm

* Add test cases for algorithm

* Add test cases for algorithm
  • Loading branch information
lickem22 authored Feb 21, 2023
1 parent e0899ba commit 699ea74
Show file tree
Hide file tree
Showing 10 changed files with 469 additions and 9 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# data folder
data/
!tests/data
!tests/data/*.yaml
tests/data/*
!tests/data/*.yaml
# google new binary
GoogleNews-vectors-negative300-prenorm.bin

Expand Down
1 change: 1 addition & 0 deletions faqt/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
QuestionAnswerBERTScorer,
StepwiseKeyedVectorsScorer,
WMDScorer,
Contextualization,
)
from .preprocessing import (
preprocess_text_for_keyword_rule,
Expand Down
1 change: 1 addition & 0 deletions faqt/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
StepwiseKeyedVectorsScorer,
WMDScorer,
)
from faqt.model.faq_matching.contextualization import Contextualization
from faqt.model.faq_matching.bert import QuestionAnswerBERTScorer

from faqt.model.urgency_detection.urgency_detection_base import KeywordRule, RuleBasedUD
125 changes: 125 additions & 0 deletions faqt/model/faq_matching/contextualization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from warnings import warn

# Default value of the `variance` argument of `Contextualization`.
VARIANCE = 0.1


class Contextualization:
    """
    Use context information to compute a weight for each content.

    The weights can be attributed to each content while scoring. A content's
    weight is derived from the distances between the contexts attached to that
    content and the contexts of the incoming message.

    Parameters
    ----------
    contents_dict : Dict[str, List[str]]
        Dictionary of contents with the content ID as key and the list of
        contexts of that content as value.
    distance_matrix : pandas.DataFrame
        A square matrix in the form of a pandas dataframe with the contexts
        list as both columns and index and the distance between each pair of
        contexts as values. Rows are addressed by position, following the
        order of the columns.
    variance : float
        Parameter of the radial basis function kernel. Distances are
        multiplied by this value before being squared, so a larger value
        makes the weights decay faster with distance.
    """

    def __init__(self, contents_dict, distance_matrix, variance=VARIANCE):
        """Validate the inputs and precompute the content/context matrix."""

        self.check_inputs(contents_dict, distance_matrix)
        self.contexts = list(distance_matrix.columns)
        # Snapshot the IDs: a live ``dict.keys()`` view would follow later
        # mutations of ``contents_dict`` and drift out of step with the rows
        # of ``_context_matrix``, which is computed once, here.
        self.contents_id = list(contents_dict)
        self.binarizer = MultiLabelBinarizer(classes=self.contexts)
        self._context_matrix = self._get_context_matrix(list(contents_dict.values()))
        self._distance_matrix = distance_matrix.values

        self.variance = variance

    def check_inputs(self, contents_dict, distance_matrix):
        """
        Validate the constructor arguments.

        Parameters
        ----------
        contents_dict : Dict[str, List[str]]
            Contents to validate; an empty dictionary only triggers a warning.
        distance_matrix : pandas.DataFrame
            Distance matrix to validate.

        Raises
        ------
        AssertionError
            If `distance_matrix` is empty or is not a square matrix.
        ValueError
            If a content uses a context that is not a column of
            `distance_matrix`.
        """
        # The checks raise AssertionError explicitly instead of using
        # ``assert`` so that they are still enforced under ``python -O``
        # while callers catching AssertionError keep working.
        if len(distance_matrix) < 1:
            raise AssertionError(
                "Empty dataframe, please provided a distance matrix as a dataframe"
            )
        shape = distance_matrix.shape
        if not (len(shape) == 2 and shape[0] == shape[1]):
            raise AssertionError("Distance matrix is not a square matrix")

        if len(contents_dict) < 1:
            warn("No faqs detected, No weight will be calculated.")
            return

        flattened_contexts = np.hstack(list(contents_dict.values()))
        unique_values = np.unique(flattened_contexts)
        invalid = np.setdiff1d(unique_values, distance_matrix.columns)
        if len(invalid) > 0:
            raise ValueError(
                f"contexts {str(invalid)} cannot be found in 'distance_matrix'"
            )

    def _get_context_matrix(self, content_contexts):
        """Convert contexts provided as list of strings into a binary vector representation"""
        return self.binarizer.fit_transform(content_contexts)

    def _message_context_vector(self, message_context):
        """
        Convert the message contexts into their positions in `self.contexts`.

        Raises
        ------
        ValueError
            If `message_context` is empty or contains a context that is not
            in `self.contexts`.
        """

        if len(message_context) < 1:
            raise ValueError("Message context cannot be empty")

        invalid = [value for value in message_context if value not in self.contexts]
        if invalid:
            raise ValueError(f"Unknown contexts : {str(invalid)} ")

        return [self.contexts.index(value) for value in message_context]

    def get_context_weights(self, message_context):
        """
        Get the weight of each content given the contexts of a message.

        Parameters
        ----------
        message_context : List[str]
            Contexts of the message. Must be non-empty and every entry must
            be one of the known contexts.

        Returns
        -------
        content_weights : Dict
            Dictionary with the content ID as key and the weight of that
            content as value.

        Raises
        ------
        ValueError
            If `message_context` is empty or contains an unknown context.
        """

        message_vector = self._message_context_vector(message_context)

        # For every known context, keep the distance to the closest of the
        # message contexts.
        distance_vectors = self._distance_matrix[message_vector].min(axis=0)

        # Radial basis function kernel: a distance of 0 gives a weight of 1.
        rbf_weights = np.exp(-((self.variance * distance_vectors) ** 2))

        # A content takes the best weight among the contexts it is tagged with.
        weights = (rbf_weights * self._context_matrix).max(axis=1)
        return dict(zip(self.contents_id, weights))


def get_ordered_distance_matrix(context_list):
    """
    Build a distance matrix for an ordered list of contexts.

    Adjacent contexts are assumed to be at distance 1 from each other, so the
    distance between the contexts at positions ``i`` and ``j`` is ``|i - j|``.

    Parameters
    ----------
    context_list : List[str]
        Contexts, in order.

    Returns
    -------
    pandas.DataFrame
        Square integer dataframe using `context_list` as both index and
        columns.
    """
    positions = np.arange(len(context_list))
    # Broadcasting a column of positions against a row of positions yields
    # every pairwise gap in one step.
    pairwise_gaps = np.abs(positions[:, np.newaxis] - positions[np.newaxis, :])
    return pd.DataFrame(
        pairwise_gaps, index=context_list, columns=context_list, dtype=int
    )
20 changes: 18 additions & 2 deletions faqt/model/faq_matching/keyed_vectors_scoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,9 @@ def set_contents(self, contents, weights=None):
optionally saves word-vectors to `self.content_vectors`."""
raise NotImplementedError

def score_contents(self, message, return_spell_corrected=False, **kwargs):
def score_contents(
self, message, return_spell_corrected=False, weights=None, **kwargs
):
"""
Scores contents and applies weighting if `self.weighting_method` is
not None
Expand All @@ -123,6 +125,9 @@ def score_contents(self, message, return_spell_corrected=False, **kwargs):
----------
message : str
return_spell_corrected : bool, default=False
weights: List[str] or None
Weight of each FAQ, will override content_weights if added
kwargs :
additional keyword arguments to pass.
e.g. for StepwiseKeyedVectorsScorer, `return_tag_scores=True` will
Expand All @@ -144,6 +149,11 @@ def score_contents(self, message, return_spell_corrected=False, **kwargs):
"Contents have not been set. Set contents with " "`self.set_contents()`"
)

if weights is not None and self.content_weights is not None:
warn(
"`weights` parameter is provided. This will override the `content_weights` set during intialization. "
)

message_tokens = self.tokenizer(message)
message_vectors, spell_corrected = self.model_search(message_tokens)

Expand All @@ -159,9 +169,15 @@ def score_contents(self, message, return_spell_corrected=False, **kwargs):
result = self._score_contents(message_vectors, spell_corrected, **kwargs)

if self.weighting_method is not None and self.content_weights is not None:
if weights:
content_weights = weights
else:
content_weights = self.content_weights

weighted_scores = self.weighting_method(
result["overall_scores"], self.content_weights, **self.weighting_kwargs
result["overall_scores"], content_weights, **self.weighting_kwargs
)

result["overall_scores"] = weighted_scores

if return_spell_corrected:
Expand Down
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
nltk>=3.7
numpy>=1.22.2
gensim@git+https://github.com/IDinsight/gensim@4.1.3#egg=gensim
gensim==4.3.0
POT==0.8.2
cyhunspell==2.0.2
scikit-learn==1.1.2
pandas==1.5.3
5 changes: 3 additions & 2 deletions requirements_dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ pre-commit>=2.11.1
pytest>=6.2.2
coverage>=6.3.2
smart-open[s3]>=5.0.0
sphinx==4.5.0
sphinx-material==0.0.35
furo==2022.12.7
scikit-learn==1.1.2
numpydoc==1.2

joblib==1.2.0
1 change: 1 addition & 0 deletions tests/data/contextualization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
contexts_list: ["design", "code", "test", "deploy", "maintain"]
12 changes: 9 additions & 3 deletions tests/data/tag_test_data.yaml
Original file line number Diff line number Diff line change
@@ -1,22 +1,25 @@
tags_refresh_data:
- id: 1
tags: ["rock", "guitar", "melody", "chord"]
title: "music"
tags: ["rock", "guitar", "melody", "chord"]
content_to_send: "Where words fail, music speaks"
context: ["design","test"]
title: "music"
example_questions:
- "I can't describe in words how I feel."
- "What do you think of music?"
weight: 1
- id: 2
tags: ["cheese", "tomato", "bread", "mustard"]
content_to_send: "We finish each other's sandwiches"
context: ["code","test"]
title: "sandwich"
example_questions:
- "What should we do with these sandwiches?"
weight: 3
- id: 3
tags: ["rock", "lake", "mountain", "sky"]
content_to_send: "Let's do it. But outside."
context: ["deploy"]
title: "nature"
example_questions:
- "should we go climbing?"
Expand All @@ -25,6 +28,7 @@ tags_refresh_data:
- id: 4
tags: ["trace", "vector", "length", "angle"]
content_to_send: "Whenever you can, count"
context: ["code","maintain"]
title: "math"
example_questions:
- "How can I know the cardinality of everything"
Expand All @@ -34,15 +38,17 @@ tags_refresh_data:
content_to_send: >
When I feel like exercising,
I just lie down until the feeling goes away
context: ["design","maintain"]
title: "hobby"
example_questions:
- "Do you like to exercise?"
weight: 1
- id: 5
- id: 6
tags: ["digest", "eat", "chew", "expel"]
content_to_send: >
Nothing would be more tiresome than eating and drinking
if God had not made them a pleasure as well as a necessity.
context: ["test","code","maintain"]
title: "life"
example_questions:
- "Do you like to eat and drink"
Expand Down
Loading

0 comments on commit 699ea74

Please sign in to comment.