[Feature] introduce scikit-learn class (#17)

RektPunk · Sep 22, 2024 · c9c22d6 · c9c22d6
1 parent 2dc8a63
commit c9c22d6
Show file tree

Hide file tree

Showing 5 changed files with 111 additions and 18 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -11,7 +11,7 @@ repos:
     - id: check-merge-conflict
 
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.6.5
+  rev: v0.6.7
   hooks:
     - id: ruff
       args: [ --fix ]

diff --git a/imlightgbm/__init__.py b/imlightgbm/__init__.py
@@ -1,4 +1,5 @@
 # ruff: noqa
 from imlightgbm.engine import cv, train
+from imlightgbm.sklearn import ImbalancedLGBMClassifier
 
-__version__ = "0.0.2"
+__version__ = "0.0.3"
diff --git a/imlightgbm/objective.py b/imlightgbm/objective.py
@@ -32,36 +32,57 @@ def _sigmoid(x: np.ndarray) -> np.ndarray:
     return 1 / (1 + np.exp(-x))
 
 
-def binary_focal_objective(
-    pred: np.ndarray, train_data: Dataset, gamma: float
+def sklearn_binary_focal_objective(
+    y_true: np.ndarray, y_pred: np.ndarray, gamma: float
 ) -> tuple[np.ndarray, np.ndarray]:
     """Return grad, hess for binary focal objective."""
-    label = train_data.get_label()
-    pred_prob = _sigmoid(pred)
+    pred_prob = _sigmoid(y_pred)
 
     # gradient
     g1 = pred_prob * (1 - pred_prob)
-    g2 = label + ((-1) ** label) * pred_prob
-    g3 = pred_prob + label - 1
-    g4 = 1 - label - ((-1) ** label) * pred_prob
-    g5 = label + ((-1) ** label) * pred_prob
-    grad = gamma * g3 * _power(g2, gamma) * _log(g4) + ((-1) ** label) * _power(
+    g2 = y_true + ((-1) ** y_true) * pred_prob
+    g3 = pred_prob + y_true - 1
+    g4 = 1 - y_true - ((-1) ** y_true) * pred_prob
+    g5 = y_true + ((-1) ** y_true) * pred_prob
+    grad = gamma * g3 * _power(g2, gamma) * _log(g4) + ((-1) ** y_true) * _power(
         g5, (gamma + 1)
     )
-
     # hess
-    h1 = _power(g2, gamma) + gamma * ((-1) ** label) * g3 * _power(g2, (gamma - 1))
-    h2 = ((-1) ** label) * g3 * _power(g2, gamma) / g4
+    h1 = _power(g2, gamma) + gamma * ((-1) ** y_true) * g3 * _power(g2, (gamma - 1))
+    h2 = ((-1) ** y_true) * g3 * _power(g2, gamma) / g4
     hess = ((h1 * _log(g4) - h2) * gamma + (gamma + 1) * _power(g5, gamma)) * g1
     return grad, hess
 
 
+def sklearn_binary_weighted_objective(
+    y_true: np.ndarray, y_pred: np.ndarray, alpha: float
+) -> tuple[np.ndarray, np.ndarray]:
+    """Return grad, hess for binary weighted objective."""
+    pred_prob = _sigmoid(y_pred)
+    grad = -(alpha**y_true) * (y_true - pred_prob)
+    hess = (alpha**y_true) * pred_prob * (1.0 - pred_prob)
+    return grad, hess
+
+
+def binary_focal_objective(
+    pred: np.ndarray, train_data: Dataset, gamma: float
+) -> tuple[np.ndarray, np.ndarray]:
+    """Return grad, hess for binary focal objective."""
+    label = train_data.get_label()
+    grad, hess = sklearn_binary_focal_objective(
+        y_true=label,
+        y_pred=pred,
+        gamma=gamma,
+    )
+    return grad, hess
+
+
 def binary_weighted_objective(pred: np.ndarray, train_data: Dataset, alpha: float):
     """Return grad, hess for binary weighted objective."""
     label = train_data.get_label()
-    pred_prob = _sigmoid(pred)
-    grad = -(alpha**label) * (label - pred_prob)
-    hess = (alpha**label) * pred_prob * (1.0 - pred_prob)
+    grad, hess = sklearn_binary_weighted_objective(
+        y_true=label, y_pred=pred, alpha=alpha
+    )
     return grad, hess
 
 

diff --git a/imlightgbm/sklearn.py b/imlightgbm/sklearn.py
@@ -0,0 +1,71 @@
+from typing import Callable, Literal
+
+import numpy as np
+from lightgbm import LGBMClassifier
+
+from imlightgbm.objective import (
+    sklearn_binary_focal_objective,
+    sklearn_binary_weighted_objective,
+)
+
+_Objective = Literal["binary_focal", "binary_weighted"]
+
+
+class ImbalancedLGBMClassifier(LGBMClassifier):
+    def __init__(
+        self,
+        objective: _Objective,
+        boosting_type: str = "gbdt",
+        num_leaves: int = 31,
+        max_depth: int = -1,
+        learning_rate: float = 0.1,
+        n_estimators: int = 100,
+        subsample_for_bin: int = 200000,
+        class_weight: dict | str | None = None,
+        min_split_gain: float = 0.0,
+        min_child_weight: float = 1e-3,
+        min_child_samples: int = 20,
+        subsample: float = 1.0,
+        subsample_freq: int = 0,
+        colsample_bytree: float = 1.0,
+        reg_alpha: float = 0.0,
+        reg_lambda: float = 0.0,
+        random_state: int | np.random.RandomState | np.random.Generator | None = None,
+        n_jobs: int | None = None,
+        importance_type: str = "split",
+        alpha: float = 0.25,
+        gamma: float = 2.0,
+    ) -> None:
+        self.alpha = alpha
+        self.gamma = gamma
+        _OBJECTIVE_MAPPER: dict[
+            str, Callable[[np.ndarray, np.ndarray], tuple[np.ndarray, np.ndarray]]
+        ] = {
+            "binary_focal": lambda y_true, y_pred: sklearn_binary_focal_objective(
+                y_true, y_pred, gamma=gamma
+            ),
+            "binary_weighted": lambda y_true, y_pred: sklearn_binary_weighted_objective(
+                y_true, y_pred, alpha=alpha
+            ),
+        }
+        super().__init__(
+            boosting_type=boosting_type,
+            num_leaves=num_leaves,
+            max_depth=max_depth,
+            learning_rate=learning_rate,
+            n_estimators=n_estimators,
+            subsample_for_bin=subsample_for_bin,
+            objective=_OBJECTIVE_MAPPER[objective],
+            class_weight=class_weight,
+            min_split_gain=min_split_gain,
+            min_child_weight=min_child_weight,
+            min_child_samples=min_child_samples,
+            subsample=subsample,
+            subsample_freq=subsample_freq,
+            colsample_bytree=colsample_bytree,
+            reg_alpha=reg_alpha,
+            reg_lambda=reg_lambda,
+            random_state=random_state,
+            n_jobs=n_jobs,
+            importance_type=importance_type,
+        )
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "imlightgbm"
-version = "0.0.2"
+version = "0.0.3"
 description = "LightGBM for label-imbalanced data with focal and weighted loss function"
 authors = ["RektPunk <rektpunk@gmail.com>"]
 license = "MIT"