[PNC-719] Update prod codebase with ML framework (#19)

* [PNC-719] Update prod codebase with ML framework * [PNC-719] Update prod codebase with ML framework * [PNC-719] Update prod codebase with ML framework * [PNC-719] Update prod codebase with ML framework * [PNC-719] Update prod codebase with ML framework * Update faqt/model/urgency_detection/urgency_detection_base.py Co-authored-by: Suzin You <suzinyou.sy@gmail.com> * Update tests/test_models/test_ml_ud_scorer.py Co-authored-by: Suzin You <suzinyou.sy@gmail.com> * Update tests/test_models/test_keyword_rule_scorer.py Co-authored-by: Suzin You <suzinyou.sy@gmail.com> * Update faqt/model/urgency_detection/urgency_detection_base.py Co-authored-by: Suzin You <suzinyou.sy@gmail.com> * [PNC-719] Update prod codebase with ML framework * [PNC-719] Update prod codebase with ML framework Co-authored-by: Carlos Samey <carlossamey@Carloss-MacBook-Pro.local> Co-authored-by: Suzin You <suzinyou.sy@gmail.com>
IDinsight · Dec 12, 2022 · 0477477 · 0477477
1 parent a450ec5
commit 0477477
Show file tree

Hide file tree

Showing 5 changed files with 185 additions and 10 deletions.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -32,6 +32,7 @@ jobs:
           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
           aws-region: ${{ secrets.AWS_REGION }}
+
       - name: Make huggingface model dir
         run: mkdir -p ./tests/data/sequence_classification_models/huggingface_model
       - uses: keithweaver/aws-s3-github-action@v1.0.0 # Verifies the recursive flag
@@ -48,6 +49,17 @@ jobs:
           tar -xvf ./tests/data/sequence_classification_models/huggingface_model.tar.gz -C ./tests/data/sequence_classification_models/huggingface_model
           rm ./tests/data/sequence_classification_models/huggingface_model.tar.gz
           echo "model saved to ./tests/data/sequence_classification_models/huggingface_model"
+      - name: Make ud_ml_model dir
+        run: mkdir -p ./tests/data/ud_ml_models
+      - uses: keithweaver/aws-s3-github-action@v1.0.0 # Copies a single model file from S3
+        name: Download Urgency Detection ML model
+        with:
+          command: cp
+          source: s3://${{ secrets.WORD2VEC_BINARY_BUCKET }}/model_test.joblib
+          destination: ./tests/data/ud_ml_models/model_test.joblib
+          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws_region: ${{ secrets.AWS_REGION }}
       - name: Run Unit Tests
         env:
           WORD2VEC_BINARY_BUCKET: ${{secrets.WORD2VEC_BINARY_BUCKET}}

diff --git a/faqt/model/urgency_detection/urgency_detection_base.py b/faqt/model/urgency_detection/urgency_detection_base.py
@@ -67,33 +67,38 @@ class UrgencyDetectionBase(ABC):
 
     def __init__(self, model, preprocessor):
         """
-        Setting model (whether it is rule based or not
+        Setting model (whether it is rule based or not)
 
         Parameters
         -----------
-        model : sklearn.models.Pipeline or faqt.model.urgency_detection.KeywordRule
+        model : sklearn.pipeline.Pipeline or List[faqt.model.urgency_detection.KeywordRule]
             Model to use for predictions.
         preprocessor : function
             Function to preprocess the message
         """
-        self.preprocess = preprocessor
+        self.preprocessor = preprocessor
         self.model = model
 
     @abstractmethod
     def predict(self, messages):
         """make prediction on the text"""
         raise NotImplementedError
 
-    @abstractmethod
-    def is_set(self):
-        """Check if the model is set."""
-        raise NotImplementedError
-
 
 class RuleBasedUD(UrgencyDetectionBase):
     """Rule-based  model"""
 
     def __init__(self, model, preprocessor):
+        """
+        Setting model (rule based models)
+
+        Parameters
+        -----------
+        model : List[faqt.model.urgency_detection.KeywordRule]
+            List of KeywordRule objects to use for predictions.
+        preprocessor : function
+            Function to preprocess the message
+        """
         super(RuleBasedUD, self).__init__(model, preprocessor)
 
     def is_set(self):
@@ -139,10 +144,53 @@ def predict_scores(self, message):
 
         if not self.is_set():
             raise ValueError("Rules have not been added")
-        preprocessed_message = self.preprocess(message)
+        preprocessed_message = self.preprocessor(message)
         evaluations = [
             evaluate_keyword_rule(preprocessed_message, rule) for rule in self.model
         ]
         scores = list(map(float, evaluations))
 
         return scores
+
+
+class MLBasedUD(UrgencyDetectionBase):
+    """Machine-learning-based urgency detection model"""
+
+    def __init__(self, model, preprocessor):
+        """
+        Setting model (ML based models)
+
+        Parameters
+        -----------
+        model : sklearn.pipeline.Pipeline
+            Machine Learning model to use for predictions.
+        preprocessor : function
+            Function to preprocess the message. During prediction, the raw text will be preprocessed using this function, and then passed to the `model`'s predict function.
+        """
+        super(MLBasedUD, self).__init__(model, preprocessor)
+
+    def predict(self, message):
+        """
+        Return the final urgency score.
+        Parameters
+        ----------
+        message : str
+            Raw message text to classify as urgent or not; it is passed through `self.preprocessor` before prediction.
+        Returns
+        -------
+        float: urgency_score
+
+        """
+
+        preprocessed_message = " ".join(self.preprocessor(message))
+        prediction = self.model.predict([preprocessed_message])
+        return float(prediction)
+
+    def get_model(self):
+        """
+        Return the prediction model.
+        Returns
+        -------
+        sklearn.pipeline.Pipeline: model
+        """
+        return self.model
diff --git a/requirements_dev.txt b/requirements_dev.txt
@@ -7,4 +7,5 @@ coverage>=6.3.2
 smart-open[s3]>=5.0.0
 sphinx==4.5.0
 sphinx-material==0.0.35
+scikit-learn==1.1.2
 numpydoc==1.2
diff --git a/tests/test_models/test_keyword_rule_scorer.py b/tests/test_models/test_keyword_rule_scorer.py
@@ -1,10 +1,12 @@
 from functools import partial
-
+from pathlib import Path
+import joblib
 import pytest
 from faqt.model.urgency_detection.urgency_detection_base import (
     KeywordRule,
     evaluate_keyword_rule,
     RuleBasedUD,
+    MLBasedUD,
 )
 from faqt.preprocessing import preprocess_text_for_keyword_rule
 from hunspell import Hunspell
@@ -110,6 +112,56 @@ def test_rule_true(self, preprocess_func, keyword_rules, rule_id, message):
         msg = preprocess_func(message)
         assert evaluate_keyword_rule(msg, keyword_rules[rule_id]) is True
 
+    def test_rules_and_preprocessing_set_correctly(
+        self, keyword_rules, preprocess_func
+    ):
+        predictor = RuleBasedUD(model=keyword_rules, preprocessor=preprocess_func)
+        assert (
+            predictor.model == keyword_rules
+            and predictor.preprocessor == preprocess_func
+        )
+
+    @pytest.mark.parametrize(
+        "rule_id, message",
+        [
+            (
+                0,
+                "I have a headache, feel dizzy, and everything looks blurry",
+            ),
+            # True because it contains all included keywords
+            (1, "My back pain is killing me :("),  # True, because it
+            # includes all included keywords and no excluded keyword
+            (2, "hi hi"),  # True, because it doesn't have excluded keyword
+        ],
+    )
+    def test_rule_based_ud_predict_score_return_array(
+        self, keyword_rules, preprocess_func, rule_id, message
+    ):
+        predictor = RuleBasedUD(model=keyword_rules, preprocessor=preprocess_func)
+        scores = predictor.predict_scores(message)
+        assert isinstance(scores, list)
+        assert len(scores) == len(keyword_rules)
+
+    @pytest.mark.parametrize(
+        "rule_id, message",
+        [
+            (
+                0,
+                "I have a headache, feel dizzy, and everything looks blurry",
+            ),
+            # True because it contains all included keywords
+            (1, "My back pain is killing me :("),  # True, because it
+            # includes all included keywords and no excluded keyword
+            (2, "hi hi"),  # True, because it doesn't have excluded keyword
+        ],
+    )
+    def test_rule_based_ud_predict_return_float(
+        self, keyword_rules, preprocess_func, rule_id, message
+    ):
+        predictor = RuleBasedUD(model=keyword_rules, preprocessor=preprocess_func)
+        scores = predictor.predict(message)
+        assert isinstance(scores, float)
+
     @pytest.mark.parametrize(
         "rule_id, message, expected",
         [

diff --git a/tests/test_models/test_ml_ud_scorer.py b/tests/test_models/test_ml_ud_scorer.py
@@ -0,0 +1,62 @@
+from functools import partial
+from pathlib import Path
+
+import pytest
+
+from faqt.preprocessing import preprocess_text_for_keyword_rule
+from faqt.model.urgency_detection.urgency_detection_base import MLBasedUD
+
+from hunspell import Hunspell
+from nltk.stem import PorterStemmer
+import joblib
+
+
+class TestMLBasedUD:
+    @pytest.fixture
+    def ml_model(self):
+        full_path = Path(__file__).parents[1] / "data/ud_ml_models/model_test.joblib"
+        return joblib.load(full_path)
+
+    @pytest.fixture(scope="class")
+    def preprocess_func(self):
+        huns = Hunspell()
+        stemmer = PorterStemmer()
+        preprocess_func = partial(
+            preprocess_text_for_keyword_rule,
+            n_min_dashed_words_url=3,
+            stem_func=stemmer.stem,
+            spell_checker=huns,
+            reincluded_stop_words=["what", "not", "how", "much", "where", "me"],
+            ngram_min=1,
+            ngram_max=2,
+        )
+        return preprocess_func
+
+    def test_model_and_preprocessing_set_correctly(self, ml_model, preprocess_func):
+        predictor = MLBasedUD(model=ml_model, preprocessor=preprocess_func)
+        assert predictor.model == ml_model
+        assert predictor.preprocessor == preprocess_func
+
+    def test_get_model_returns_model(self, ml_model, preprocess_func):
+        predictor = MLBasedUD(model=ml_model, preprocessor=preprocess_func)
+        returned_model = predictor.get_model()
+        assert ml_model == returned_model
+
+    @pytest.mark.parametrize(
+        "rule_id, message",
+        [
+            (
+                0,
+                "I have a headache, feel dizzy, and everything looks blurry",
+            ),
+            # `rule_id` is unused below; only the message is fed to predict().
+            (1, "My back pain is killing me :("),
+            # The test asserts only the return type (float), not the
+            # predicted label.
+        ],
+    )
+    def test_model_returns_prediction(
+        self, ml_model, preprocess_func, rule_id, message
+    ):
+        predictor = MLBasedUD(model=ml_model, preprocessor=preprocess_func)
+        assert isinstance(predictor.predict(message), float)