Skip to content

Commit

Permalink
[PNC-719] Update prod codebase with ML framework (#19)
Browse files Browse the repository at this point in the history
* [PNC-719] Update prod codebase with ML framework

* [PNC-719] Update prod codebase with ML framework

* [PNC-719] Update prod codebase with ML framework

* [PNC-719] Update prod codebase with ML framework

* [PNC-719]Update prod codebase with ML framework

* Update faqt/model/urgency_detection/urgency_detection_base.py

Co-authored-by: Suzin You <suzinyou.sy@gmail.com>

* Update tests/test_models/test_ml_ud_scorer.py

Co-authored-by: Suzin You <suzinyou.sy@gmail.com>

* Update tests/test_models/test_keyword_rule_scorer.py

Co-authored-by: Suzin You <suzinyou.sy@gmail.com>

* Update faqt/model/urgency_detection/urgency_detection_base.py

Co-authored-by: Suzin You <suzinyou.sy@gmail.com>

* [PNC-719] Update prod codebase with ML framework

* [PNC-719] Update prod codebase with ML framework

Co-authored-by: Carlos Samey <carlossamey@Carloss-MacBook-Pro.local>
Co-authored-by: Suzin You <suzinyou.sy@gmail.com>
  • Loading branch information
3 people authored Dec 12, 2022
1 parent a450ec5 commit 0477477
Show file tree
Hide file tree
Showing 5 changed files with 185 additions and 10 deletions.
12 changes: 12 additions & 0 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ jobs:
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws-region: ${{ secrets.AWS_REGION }}

- name: Make huggingface model dir
run: mkdir -p ./tests/data/sequence_classification_models/huggingface_model
- uses: keithweaver/aws-s3-github-action@v1.0.0 # Verifies the recursive flag
Expand All @@ -48,6 +49,17 @@ jobs:
tar -xvf ./tests/data/sequence_classification_models/huggingface_model.tar.gz -C ./tests/data/sequence_classification_models/huggingface_model
rm ./tests/data/sequence_classification_models/huggingface_model.tar.gz
echo "model saved to ./tests/data/sequence_classification_models/huggingface_model"
- name: Make ud_ml_model dir
run: mkdir -p ./tests/data/ud_ml_models
- uses: keithweaver/aws-s3-github-action@v1.0.0 # Copies the single test model file from S3 (no recursive flag is used)
name: Download Urgency Detection ML model
with:
command: cp
source: s3://${{ secrets.WORD2VEC_BINARY_BUCKET }}/model_test.joblib
destination: ./tests/data/ud_ml_models/model_test.joblib
aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
aws_region: ${{ secrets.AWS_REGION }}
- name: Run Unit Tests
env:
WORD2VEC_BINARY_BUCKET: ${{secrets.WORD2VEC_BINARY_BUCKET}}
Expand Down
66 changes: 57 additions & 9 deletions faqt/model/urgency_detection/urgency_detection_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,33 +67,38 @@ class UrgencyDetectionBase(ABC):

def __init__(self, model, preprocessor):
    """
    Store the model (whether it is rule based or not) and the preprocessor.

    Parameters
    ----------
    model : sklearn.pipeline.Pipeline or List[faqt.model.urgency_detection.KeywordRule]
        Model to use for predictions.
    preprocessor : function
        Function to preprocess the message
    """
    # NOTE(review): the preprocessing function is bound under two attribute
    # names, `preprocess` and `preprocessor`. This looks like an old/new
    # pair left over from a rename -- confirm which one callers should use.
    self.preprocess = preprocessor
    self.preprocessor = preprocessor
    self.model = model

@abstractmethod
def predict(self, messages):
    """
    Make a prediction on the text.

    This is an abstract hook: the base implementation only raises, so every
    concrete urgency-detection class has to provide its own version.

    Parameters
    ----------
    messages : str
        Text to score.

    Raises
    ------
    NotImplementedError
        Always, when the base implementation is called directly.
    """
    raise NotImplementedError()

@abstractmethod
def is_set(self):
    """
    Check if the model is set.

    This is an abstract hook: the base implementation only raises, so every
    concrete urgency-detection class has to provide its own version.

    Raises
    ------
    NotImplementedError
        Always, when the base implementation is called directly.
    """
    raise NotImplementedError()


class RuleBasedUD(UrgencyDetectionBase):
"""Rule-based model"""

def __init__(self, model, preprocessor):
    """
    Set up a rule-based urgency detector.

    Both arguments are handed unchanged to the parent initializer.

    Parameters
    ----------
    model : List[faqt.model.urgency_detection.KeywordRule]
        List of KeywordRule objects to use for predictions.
    preprocessor : function
        Function to preprocess the message
    """
    super().__init__(model, preprocessor)

def is_set(self):
Expand Down Expand Up @@ -139,10 +144,53 @@ def predict_scores(self, message):

if not self.is_set():
raise ValueError("Rules have not been added")
preprocessed_message = self.preprocess(message)
preprocessed_message = self.preprocessor(message)
evaluations = [
evaluate_keyword_rule(preprocessed_message, rule) for rule in self.model
]
scores = list(map(float, evaluations))

return scores


class MLBasedUD(UrgencyDetectionBase):
    """Machine Learning based model.

    Pairs a prediction model with a preprocessing function: each message is
    preprocessed, its tokens are joined with single spaces, and the resulting
    string is passed to the model's ``predict`` method.
    """

    def __init__(self, model, preprocessor):
        """
        Setting model (ML based models)

        Parameters
        ----------
        model : sklearn.pipeline.Pipeline
            Machine Learning model to use for predictions.
        preprocessor : function
            Function to preprocess the message. During prediction, the raw
            text will be preprocessed using this function, and then passed
            to the `model`'s predict function.
        """
        super().__init__(model, preprocessor)

    def is_set(self):
        """
        Check if the model is set.

        Implemented because `UrgencyDetectionBase` declares `is_set` as an
        abstract method; without it this class cannot be instantiated.

        Returns
        -------
        bool
            True if a model object has been provided, False if it is None.
        """
        return self.model is not None

    def predict(self, message):
        """
        Return the final urgency score for one message.

        Parameters
        ----------
        message : str
            Raw message text. It is passed through `self.preprocessor`, and
            the resulting tokens are joined with spaces before prediction.

        Returns
        -------
        float: urgency_score
        """
        tokens = self.preprocessor(message)
        preprocessed_message = " ".join(tokens)
        # The model receives a one-element batch, so the result holds exactly
        # one prediction. Index it explicitly: calling float() on a whole
        # array with ndim > 0 is deprecated in recent NumPy releases.
        prediction = self.model.predict([preprocessed_message])
        return float(prediction[0])

    def get_model(self):
        """
        Return the prediction model.

        Returns
        -------
        sklearn.pipeline.Pipeline: model
        """
        return self.model
1 change: 1 addition & 0 deletions requirements_dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ coverage>=6.3.2
smart-open[s3]>=5.0.0
sphinx==4.5.0
sphinx-material==0.0.35
scikit-learn==1.1.2
numpydoc==1.2
54 changes: 53 additions & 1 deletion tests/test_models/test_keyword_rule_scorer.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from functools import partial

from pathlib import Path
import joblib
import pytest
from faqt.model.urgency_detection.urgency_detection_base import (
KeywordRule,
evaluate_keyword_rule,
RuleBasedUD,
MLBasedUD,
)
from faqt.preprocessing import preprocess_text_for_keyword_rule
from hunspell import Hunspell
Expand Down Expand Up @@ -110,6 +112,56 @@ def test_rule_true(self, preprocess_func, keyword_rules, rule_id, message):
msg = preprocess_func(message)
assert evaluate_keyword_rule(msg, keyword_rules[rule_id]) is True

def test_rules_and_preprocessing_set_correctly(
    self, keyword_rules, preprocess_func
):
    """RuleBasedUD keeps the rules and the preprocessor it was built with."""
    detector = RuleBasedUD(model=keyword_rules, preprocessor=preprocess_func)
    # Two separate assertions evaluate in the same order as the combined
    # `and` expression, but point at the attribute that is wrong on failure.
    assert detector.model == keyword_rules
    assert detector.preprocessor == preprocess_func

@pytest.mark.parametrize(
    "message",
    [
        "I have a headache, feel dizzy, and everything looks blurry",
        "My back pain is killing me :(",
        "hi hi",
    ],
)
def test_rule_based_ud_predict_score_return_array(
    self, keyword_rules, preprocess_func, message
):
    """`predict_scores` returns a list with one float score per rule."""
    predictor = RuleBasedUD(model=keyword_rules, preprocessor=preprocess_func)
    scores = predictor.predict_scores(message)
    assert isinstance(scores, list)
    assert len(scores) == len(keyword_rules)
    # Each rule evaluation is converted with float() before being returned.
    assert all(isinstance(score, float) for score in scores)

@pytest.mark.parametrize(
    "message",
    [
        "I have a headache, feel dizzy, and everything looks blurry",
        "My back pain is killing me :(",
        "hi hi",
    ],
)
def test_rule_based_ud_predict_return_float(
    self, keyword_rules, preprocess_func, message
):
    """`predict` returns a single float score for a message."""
    predictor = RuleBasedUD(model=keyword_rules, preprocessor=preprocess_func)
    score = predictor.predict(message)
    assert isinstance(score, float)

@pytest.mark.parametrize(
"rule_id, message, expected",
[
Expand Down
62 changes: 62 additions & 0 deletions tests/test_models/test_ml_ud_scorer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from functools import partial
from pathlib import Path

import pytest

from faqt.preprocessing import preprocess_text_for_keyword_rule
from faqt.model.urgency_detection.urgency_detection_base import MLBasedUD

from hunspell import Hunspell
from nltk.stem import PorterStemmer
import joblib


class TestMLBasedUD:
    """Tests for the ML-based urgency detection wrapper, `MLBasedUD`."""

    @pytest.fixture(scope="class")
    def ml_model(self):
        """Load the serialized test model once for the whole test class."""
        full_path = Path(__file__).parents[1] / "data/ud_ml_models/model_test.joblib"
        # NOTE: joblib.load unpickles the file, so it must only ever be
        # pointed at a trusted artifact.
        return joblib.load(full_path)

    @pytest.fixture(scope="class")
    def preprocess_func(self):
        """Build the text preprocessing function handed to `MLBasedUD`."""
        huns = Hunspell()
        stemmer = PorterStemmer()
        return partial(
            preprocess_text_for_keyword_rule,
            n_min_dashed_words_url=3,
            stem_func=stemmer.stem,
            spell_checker=huns,
            reincluded_stop_words=["what", "not", "how", "much", "where", "me"],
            ngram_min=1,
            ngram_max=2,
        )

    def test_model_and_preprocessing_set_correctly(self, ml_model, preprocess_func):
        """The constructor stores the exact objects it was given."""
        predictor = MLBasedUD(model=ml_model, preprocessor=preprocess_func)
        assert predictor.model is ml_model
        assert predictor.preprocessor is preprocess_func

    def test_get_model_returns_model(self, ml_model, preprocess_func):
        """`get_model` hands back the model passed to the constructor."""
        predictor = MLBasedUD(model=ml_model, preprocessor=preprocess_func)
        assert predictor.get_model() is ml_model

    @pytest.mark.parametrize(
        "message",
        [
            "I have a headache, feel dizzy, and everything looks blurry",
            "My back pain is killing me :(",
        ],
    )
    def test_model_returns_prediction(self, ml_model, preprocess_func, message):
        """`predict` returns a float for a raw text message."""
        predictor = MLBasedUD(model=ml_model, preprocessor=preprocess_func)
        assert isinstance(predictor.predict(message), float)

0 comments on commit 0477477

Please sign in to comment.