Merge pull request #55 from kiraplenkin/check_min_pcn_group_in_modeling
Added min_pcn_group_check
kiraplenkin authored Oct 19, 2022
2 parents 8733c81 + c155c89 commit 88bb03b
Showing 4 changed files with 34 additions and 6 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "woe_scoring"
version = "0.7.7"
version = "0.7.8"
description = "Weight Of Evidence Transformer and LogisticRegression model with scikit-learn API"
authors = ["Stroganov Kirill <kiraplenkin@gmail.com>"]
license = "MIT"
2 changes: 1 addition & 1 deletion woe_scoring/__init__.py
@@ -1,3 +1,3 @@
from woe_scoring.core import CreateModel, WOETransformer

__version__ = "0.7.7"
__version__ = "0.7.8"
2 changes: 2 additions & 0 deletions woe_scoring/core/main.py
@@ -183,6 +183,7 @@ def __init__(
gini_threshold: float = 5.0,
iv_threshold: float = 0.05,
corr_threshold: float = 0.5,
min_pct_group: float = 0.05,
random_state: int = None,
class_weight: str = None,
direction: str = "forward",
@@ -200,6 +201,7 @@ def __init__(
self.gini_threshold = gini_threshold
self.iv_threshold = iv_threshold
self.corr_threshold = corr_threshold
self.min_pct_group = min_pct_group
self.random_state = random_state
self.class_weight = class_weight
self.direction = direction
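For context, a minimal usage sketch of the new option follows. CreateModel and the keyword arguments shown mirror the constructor parameters visible in this diff; the dataset and the concrete values are placeholders, not taken from the repository.

# Hypothetical usage sketch (not part of the commit): the keyword
# arguments mirror the __init__ parameters shown in the diff above.
import pandas as pd
from woe_scoring import CreateModel

df = pd.read_csv("train.csv")  # placeholder training data

model = CreateModel(
    gini_threshold=5.0,
    iv_threshold=0.05,
    corr_threshold=0.5,
    min_pct_group=0.05,  # new in 0.7.8: threshold for the rarest group of a feature
    random_state=42,
    class_weight="balanced",
    direction="forward",
)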
34 changes: 30 additions & 4 deletions woe_scoring/core/model/functions.py
@@ -87,7 +87,7 @@ def _check_correlation_threshold(
for var_a, var_b in iter:
if (var_a != var_b) and (var_a in feature_names) and (var_b in feature_names) and abs(
x[feature_names].corr()[var_a][var_b]
) >= corr_threshold:
) >= corr_threshold:
if _calc_score(
x,
y,
@@ -115,6 +115,17 @@ def _check_correlation_threshold(
return feature_names


def _check_min_pct_group(
x: [pd.DataFrame, np.ndarray],
feature_names: List[str],
min_pct_group: float,
) -> List[str]:
to_drop = [
feature_name for feature_name in feature_names if x[feature_name].value_counts().min() < min_pct_group
]
return [var for var in feature_names if var not in to_drop]


def _feature_selector(
x: [pd.DataFrame, np.ndarray],
y: Union[pd.Series, np.ndarray],
@@ -151,6 +162,7 @@ def sequential_feature_select(
feature_names: List[str],
gini_threshold: float,
corr_threshold: float,
min_pct_group: float,
random_state: int,
class_weight: str,
max_vars: Union[int, float],
@@ -160,6 +172,12 @@
scoring: str,
n_jobs: int,
) -> List[str]:
feature_names = _check_min_pct_group(
x,
feature_names=feature_names,
min_pct_group=min_pct_group,
)

feature_names = _check_features_gini_threshold(
x, y,
feature_names=feature_names,
@@ -218,6 +236,7 @@ def iv_feature_select(
max_vars: int,
n_jobs: int,
corr_threshold: float,
min_pct_group: float,
random_state: int,
class_weight: str,
cv: int,
@@ -233,6 +252,13 @@

feature_names = [feature for feature in dict(sorted(res_dict.items(), key=itemgetter(1), reverse=True)) if
res_dict[feature] >= iv_threshold][:max_vars]

feature_names = _check_min_pct_group(
x,
feature_names=feature_names,
min_pct_group=min_pct_group,
)

feature_names = _check_correlation_threshold(
x, y,
feature_names=feature_names,
@@ -312,9 +338,9 @@ def generate_sql(
f" WHEN {var.replace('WOE_', '')} in {bin['bin']} THEN {bin['woe']}".replace(
"[", "("
)
.replace("]", ")")
.replace(", -1", "")
.replace(", Missing", "")
.replace("]", ")")
.replace(", -1", "")
.replace(", Missing", "")
for bin in encoder.woe_iv_dict[i][
var.replace("WOE_", "")
]
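A note on the new check: as committed, _check_min_pct_group compares x[feature_name].value_counts().min(), a raw count of the rarest value, against min_pct_group, whose default of 0.05 reads like a fraction of rows. The sketch below expresses the check as a share of rows; this is only an assumption about the intended semantics, not what the commit implements.

# Sketch under the assumption that min_pct_group is meant as a fraction of rows.
# The committed function compares raw value counts against min_pct_group instead.
from typing import List

import pandas as pd


def _check_min_pct_group_normalized(
    x: pd.DataFrame,
    feature_names: List[str],
    min_pct_group: float,
) -> List[str]:
    # Drop any feature whose rarest value covers less than min_pct_group of rows.
    to_drop = [
        name for name in feature_names
        if x[name].value_counts(normalize=True).min() < min_pct_group
    ]
    return [name for name in feature_names if name not in to_drop]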
