Merge pull request #55 from kiraplenkin/check_min_pcn_group_in_modeling
Added min_pcn_group_check
kiraplenkin authored Oct 19, 2022
2 parents 8733c81 + c155c89 commit 88bb03b
Showing 4 changed files with 34 additions and 6 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "woe_scoring"
version = "0.7.7"
version = "0.7.8"
description = "Weight Of Evidence Transformer and LogisticRegression model with scikit-learn API"
authors = ["Stroganov Kirill <kiraplenkin@gmail.com>"]
license = "MIT"
2 changes: 1 addition & 1 deletion woe_scoring/__init__.py
@@ -1,3 +1,3 @@
from woe_scoring.core import CreateModel, WOETransformer

__version__ = "0.7.7"
__version__ = "0.7.8"
2 changes: 2 additions & 0 deletions woe_scoring/core/main.py
@@ -183,6 +183,7 @@ def __init__(
gini_threshold: float = 5.0,
iv_threshold: float = 0.05,
corr_threshold: float = 0.5,
min_pct_group: float = 0.05,
random_state: int = None,
class_weight: str = None,
direction: str = "forward",
@@ -200,6 +201,7 @@ def __init__(
self.gini_threshold = gini_threshold
self.iv_threshold = iv_threshold
self.corr_threshold = corr_threshold
self.min_pct_group = min_pct_group
self.random_state = random_state
self.class_weight = class_weight
self.direction = direction
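For context, a minimal usage sketch of the new option follows. CreateModel and the keyword arguments shown mirror the constructor parameters visible in this diff; the dataset and the concrete values are placeholders, not taken from the repository.

# Hypothetical usage sketch (not part of the commit): the keyword
# arguments mirror the __init__ parameters shown in the diff above.
import pandas as pd
from woe_scoring import CreateModel

df = pd.read_csv("train.csv")  # placeholder training data

model = CreateModel(
    gini_threshold=5.0,
    iv_threshold=0.05,
    corr_threshold=0.5,
    min_pct_group=0.05,  # new in 0.7.8: threshold for the rarest group of a feature
    random_state=42,
    class_weight="balanced",
    direction="forward",
)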
34 changes: 30 additions & 4 deletions woe_scoring/core/model/functions.py
@@ -87,7 +87,7 @@ def _check_correlation_threshold(
for var_a, var_b in iter:
if (var_a != var_b) and (var_a in feature_names) and (var_b in feature_names) and abs(
x[feature_names].corr()[var_a][var_b]
) >= corr_threshold:
) >= corr_threshold:
if _calc_score(
x,
y,
@@ -115,6 +115,17 @@ def _check_correlation_threshold(
return feature_names


def _check_min_pct_group(
x: [pd.DataFrame, np.ndarray],
feature_names: List[str],
min_pct_group: float,
) -> List[str]:
to_drop = [
feature_name for feature_name in feature_names if x[feature_name].value_counts().min() < min_pct_group
]
return [var for var in feature_names if var not in to_drop]


def _feature_selector(
x: [pd.DataFrame, np.ndarray],
y: Union[pd.Series, np.ndarray],
@@ -151,6 +162,7 @@ def sequential_feature_select(
feature_names: List[str],
gini_threshold: float,
corr_threshold: float,
min_pct_group: float,
random_state: int,
class_weight: str,
max_vars: Union[int, float],
@@ -160,6 +172,12 @@
scoring: str,
n_jobs: int,
) -> List[str]:
feature_names = _check_min_pct_group(
x,
feature_names=feature_names,
min_pct_group=min_pct_group,
)

feature_names = _check_features_gini_threshold(
x, y,
feature_names=feature_names,
@@ -218,6 +236,7 @@ def iv_feature_select(
max_vars: int,
n_jobs: int,
corr_threshold: float,
min_pct_group: float,
random_state: int,
class_weight: str,
cv: int,
@@ -233,6 +252,13 @@

feature_names = [feature for feature in dict(sorted(res_dict.items(), key=itemgetter(1), reverse=True)) if
res_dict[feature] >= iv_threshold][:max_vars]

feature_names = _check_min_pct_group(
x,
feature_names=feature_names,
min_pct_group=min_pct_group,
)

feature_names = _check_correlation_threshold(
x, y,
feature_names=feature_names,
@@ -312,9 +338,9 @@ def generate_sql(
f" WHEN {var.replace('WOE_', '')} in {bin['bin']} THEN {bin['woe']}".replace(
"[", "("
)
.replace("]", ")")
.replace(", -1", "")
.replace(", Missing", "")
.replace("]", ")")
.replace(", -1", "")
.replace(", Missing", "")
for bin in encoder.woe_iv_dict[i][
var.replace("WOE_", "")
]
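A note on the new check: as committed, _check_min_pct_group compares x[feature_name].value_counts().min(), a raw count of the rarest value, against min_pct_group, whose default of 0.05 reads like a fraction of rows. The sketch below expresses the check as a share of rows; this is only an assumption about the intended semantics, not what the commit implements.

# Sketch under the assumption that min_pct_group is meant as a fraction of rows.
# The committed function compares raw value counts against min_pct_group instead.
from typing import List

import pandas as pd


def _check_min_pct_group_normalized(
    x: pd.DataFrame,
    feature_names: List[str],
    min_pct_group: float,
) -> List[str]:
    # Drop any feature whose rarest value covers less than min_pct_group of rows.
    to_drop = [
        name for name in feature_names
        if x[name].value_counts(normalize=True).min() < min_pct_group
    ]
    return [name for name in feature_names if name not in to_drop]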
