From 0477477a9c19724ca8ce39ea7eedab6b451413ea Mon Sep 17 00:00:00 2001 From: lickem22 <44327443+lickem22@users.noreply.github.com> Date: Mon, 12 Dec 2022 13:00:24 +0300 Subject: [PATCH] [PNC-719] Update prod codebase with ML framework (#19) * [PNC-719] Update prod codebase with ML framework * [PNC-719] Update prod codebase with ML framework * [PNC-719] Update prod codebase with ML framework * [PNC-719] Update prod codebase with ML framework * [PNC-719]Update prod codebase with ML framework * Update faqt/model/urgency_detection/urgency_detection_base.py Co-authored-by: Suzin You * Update tests/test_models/test_ml_ud_scorer.py Co-authored-by: Suzin You * Update tests/test_models/test_keyword_rule_scorer.py Co-authored-by: Suzin You * Update faqt/model/urgency_detection/urgency_detection_base.py Co-authored-by: Suzin You * [PNC-719] Update prod codebase with ML framework * [PNC-719] Update prod codebase with ML framework Co-authored-by: Carlos Samey Co-authored-by: Suzin You --- .github/workflows/tests.yml | 12 ++++ .../urgency_detection_base.py | 66 ++++++++++++++++--- requirements_dev.txt | 1 + tests/test_models/test_keyword_rule_scorer.py | 54 ++++++++++++++- tests/test_models/test_ml_ud_scorer.py | 62 +++++++++++++++++ 5 files changed, 185 insertions(+), 10 deletions(-) create mode 100644 tests/test_models/test_ml_ud_scorer.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b31fec5..68175ff 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -32,6 +32,7 @@ jobs: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} aws-region: ${{ secrets.AWS_REGION }} + - name: Make huggingface model dir run: mkdir -p ./tests/data/sequence_classification_models/huggingface_model - uses: keithweaver/aws-s3-github-action@v1.0.0 # Verifies the recursive flag @@ -48,6 +49,17 @@ jobs: tar -xvf ./tests/data/sequence_classification_models/huggingface_model.tar.gz -C ./tests/data/sequence_classification_models/huggingface_model rm ./tests/data/sequence_classification_models/huggingface_model.tar.gz echo "model saved to ./tests/data/sequence_classification_models/huggingface_model" + - name: Make ud_ml_model dir + run: mkdir -p ./tests/data/ud_ml_models + - uses: keithweaver/aws-s3-github-action@v1.0.0 # Verifies the recursive flag + name: Download Urgency Detection ML model + with: + command: cp + source: s3://${{ secrets.WORD2VEC_BINARY_BUCKET }}/model_test.joblib + destination: ./tests/data/ud_ml_models/model_test.joblib + aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws_region: ${{ secrets.AWS_REGION }} - name: Run Unit Tests env: WORD2VEC_BINARY_BUCKET: ${{secrets.WORD2VEC_BINARY_BUCKET}} diff --git a/faqt/model/urgency_detection/urgency_detection_base.py b/faqt/model/urgency_detection/urgency_detection_base.py index 6bb6e8e..66281f3 100644 --- a/faqt/model/urgency_detection/urgency_detection_base.py +++ b/faqt/model/urgency_detection/urgency_detection_base.py @@ -67,16 +67,16 @@ class UrgencyDetectionBase(ABC): def __init__(self, model, preprocessor): """ - Setting model (whether it is rule based or not + Setting model (whether it is rule based or not) Parameters ----------- - model : sklearn.models.Pipeline or faqt.model.urgency_detection.KeywordRule + model : sklearn.pipeline.Pipeline or List[faqt.model.urgency_detection.KeywordRule] Model to use for predictions. preprocessor : function Function to preprocess the message """ - self.preprocess = preprocessor + self.preprocessor = preprocessor self.model = model @abstractmethod @@ -84,16 +84,21 @@ def predict(self, messages): """make prediction on the text""" raise NotImplementedError - @abstractmethod - def is_set(self): - """Check if the model is set.""" - raise NotImplementedError - class RuleBasedUD(UrgencyDetectionBase): """Rule-based model""" def __init__(self, model, preprocessor): + """ + Setting model (rule based models) + + Parameters + ----------- + model : List[faqt.model.urgency_detection.KeywordRule] + List of KeywordRule objects to use for predictions. + preprocessor : function + Function to preprocess the message + """ super(RuleBasedUD, self).__init__(model, preprocessor) def is_set(self): @@ -139,10 +144,53 @@ def predict_scores(self, message): if not self.is_set(): raise ValueError("Rules have not been added") - preprocessed_message = self.preprocess(message) + preprocessed_message = self.preprocessor(message) evaluations = [ evaluate_keyword_rule(preprocessed_message, rule) for rule in self.model ] scores = list(map(float, evaluations)) return scores + + +class MLBasedUD(UrgencyDetectionBase): + """Machine Learning based model""" + + def __init__(self, model, preprocessor): + """ + Setting model (ML based models) + + Parameters + ----------- + model : sklearn.models.Pipeline + Machine Learning model to use for predictions. + preprocessor : function + Function to preprocess the message. During prediction, the raw text will be preprocessed using this function, and then passed to the `model`'s predict function. + """ + super(MLBasedUD, self).__init__(model, preprocessor) + + def predict(self, message): + """ + return final urgency score. + Parameters + ---------- + message : str + A string or a list of pre-processed tokens to classify as urgent or not. + Returns + ------- + float: urgency_score + + """ + + preprocessed_message = " ".join(self.preprocessor(message)) + prediction = self.model.predict([preprocessed_message]) + return float(prediction) + + def get_model(self): + """ + return prediction model + Returns + ------- + sklearn.models.Pipeline: model + """ + return self.model diff --git a/requirements_dev.txt b/requirements_dev.txt index c8e37cc..f496583 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -7,4 +7,5 @@ coverage>=6.3.2 smart-open[s3]>=5.0.0 sphinx==4.5.0 sphinx-material==0.0.35 +scikit-learn==1.1.2 numpydoc==1.2 diff --git a/tests/test_models/test_keyword_rule_scorer.py b/tests/test_models/test_keyword_rule_scorer.py index 345ec1e..9db2f00 100644 --- a/tests/test_models/test_keyword_rule_scorer.py +++ b/tests/test_models/test_keyword_rule_scorer.py @@ -1,10 +1,12 @@ from functools import partial - +from pathlib import Path +import joblib import pytest from faqt.model.urgency_detection.urgency_detection_base import ( KeywordRule, evaluate_keyword_rule, RuleBasedUD, + MLBasedUD, ) from faqt.preprocessing import preprocess_text_for_keyword_rule from hunspell import Hunspell @@ -110,6 +112,56 @@ def test_rule_true(self, preprocess_func, keyword_rules, rule_id, message): msg = preprocess_func(message) assert evaluate_keyword_rule(msg, keyword_rules[rule_id]) is True + def test_rules_and_preprocessing_set_correctly( + self, keyword_rules, preprocess_func + ): + predictor = RuleBasedUD(model=keyword_rules, preprocessor=preprocess_func) + assert ( + predictor.model == keyword_rules + and predictor.preprocessor == preprocess_func + ) + + @pytest.mark.parametrize( + "rule_id, message", + [ + ( + 0, + "I have a headache, feel dizzy, and everything looks blurry", + ), + # True because it contains all included keywords + (1, "My back pain is killing me :("), # True, because it + # includes all included keywords and no excluded keyword + (2, "hi hi"), # True, because it doesn't have excluded keyword + ], + ) + def test_rule_based_ud_predict_score_return_array( + self, keyword_rules, preprocess_func, rule_id, message + ): + predictor = RuleBasedUD(model=keyword_rules, preprocessor=preprocess_func) + scores = predictor.predict_scores(message) + assert isinstance(scores, list) + assert len(scores) == len(keyword_rules) + + @pytest.mark.parametrize( + "rule_id, message", + [ + ( + 0, + "I have a headache, feel dizzy, and everything looks blurry", + ), + # True because it contains all included keywords + (1, "My back pain is killing me :("), # True, because it + # includes all included keywords and no excluded keyword + (2, "hi hi"), # True, because it doesn't have excluded keyword + ], + ) + def test_rule_based_ud_predict_return_float( + self, keyword_rules, preprocess_func, rule_id, message + ): + predictor = RuleBasedUD(model=keyword_rules, preprocessor=preprocess_func) + scores = predictor.predict(message) + assert isinstance(scores, float) + @pytest.mark.parametrize( "rule_id, message, expected", [ diff --git a/tests/test_models/test_ml_ud_scorer.py b/tests/test_models/test_ml_ud_scorer.py new file mode 100644 index 0000000..0db146d --- /dev/null +++ b/tests/test_models/test_ml_ud_scorer.py @@ -0,0 +1,62 @@ +from functools import partial +from pathlib import Path + +import pytest + +from faqt.preprocessing import preprocess_text_for_keyword_rule +from faqt.model.urgency_detection.urgency_detection_base import MLBasedUD + +from hunspell import Hunspell +from nltk.stem import PorterStemmer +import joblib + + +class TestMLBasedUD: + @pytest.fixture + def ml_model(self): + full_path = Path(__file__).parents[1] / "data/ud_ml_models/model_test.joblib" + return joblib.load(full_path) + + @pytest.fixture(scope="class") + def preprocess_func(self): + huns = Hunspell() + stemmer = PorterStemmer() + preprocess_func = partial( + preprocess_text_for_keyword_rule, + n_min_dashed_words_url=3, + stem_func=stemmer.stem, + spell_checker=huns, + reincluded_stop_words=["what", "not", "how", "much", "where", "me"], + ngram_min=1, + ngram_max=2, + ) + return preprocess_func + + def test_model_and_preprocessing_set_correctly(self, ml_model, preprocess_func): + predictor = MLBasedUD(model=ml_model, preprocessor=preprocess_func) + assert predictor.model == ml_model + assert predictor.preprocessor == preprocess_func + + def test_get_model_returns_model(self, ml_model, preprocess_func): + predictor = MLBasedUD(model=ml_model, preprocessor=preprocess_func) + returned_model = predictor.get_model() + assert ml_model == returned_model + + @pytest.mark.parametrize( + "rule_id, message", + [ + ( + 0, + "I have a headache, feel dizzy, and everything looks blurry", + ), + # True because it contains all included keywords + (1, "My back pain is killing me :("), # True, because it + # includes all included keywords and no excluded keyword + # True, because it doesn't have excluded keyword + ], + ) + def test_model_returns_prediction( + self, ml_model, preprocess_func, rule_id, message + ): + predictor = MLBasedUD(model=ml_model, preprocessor=preprocess_func) + assert isinstance(predictor.predict(message), float)