Skip to content

[Feature] add fit and predict logic for multiclass in sklearn #24

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Sep 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions examples/multiclass_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,5 +77,6 @@
# Evaluate models
print("\nClassification Report for Standard:")
print(classification_report(y_test, y_pred_standard_label))

print("\nClassification Report for Imbalanced:")
print(classification_report(y_test, y_pred_focal_label))
42 changes: 42 additions & 0 deletions examples/multiclass_sklearn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import imlightgbm as imlgb

# Generate an imbalanced three-class dataset: the classes make up roughly
# 5%, 15% and 80% of the 5000 samples (see `weights`).
X, y = make_classification(
n_samples=5000,
n_features=10,
n_classes=3,
n_informative=5,
weights=[0.05, 0.15, 0.8],
flip_y=0,
random_state=42,
)

# Hold out 20% of the samples as the test set
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)

# Initialize the ImbalancedLGBMClassifier using multiclass focal loss.
# num_class is passed explicitly and matches n_classes used above.
clf = imlgb.ImbalancedLGBMClassifier(
objective="multiclass_focal", # alternative objective: "multiclass_weighted"
gamma=2.0, # pass alpha instead when using "multiclass_weighted"
num_class=3,
learning_rate=0.05,
num_leaves=31,
)

# Train the classifier on the training data
clf.fit(X=X_train, y=y_train)

# Make predictions on the test data
y_pred_focal = clf.predict(X_test)


# Evaluate the model: print the classification report comparing the
# held-out labels with the predictions.
print("\nClassification Report:")
print(classification_report(y_test, y_pred_focal))
2 changes: 1 addition & 1 deletion imlightgbm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
from imlightgbm.engine import cv, train
from imlightgbm.sklearn import ImbalancedLGBMClassifier

__version__ = "0.0.4"
__version__ = "0.1.0"
30 changes: 13 additions & 17 deletions imlightgbm/objective/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from scipy.special import expit, softmax


def _safe_power(num_base: np.ndarray, num_pow: float):
def _safe_power(num_base: np.ndarray, num_pow: float) -> np.ndarray:
"""Safe power."""
return np.sign(num_base) * (np.abs(num_base)) ** (num_pow)

Expand Down Expand Up @@ -54,21 +54,19 @@ def binary_focal_objective(
) -> tuple[np.ndarray, np.ndarray]:
"""Return grad, hess for binary focal objective for engine."""
label = train_data.get_label()
grad, hess = sklearn_binary_focal_objective(
return sklearn_binary_focal_objective(
y_true=label,
y_pred=pred,
gamma=gamma,
)
return grad, hess


def binary_weighted_objective(pred: np.ndarray, train_data: Dataset, alpha: float):
def binary_weighted_objective(
pred: np.ndarray, train_data: Dataset, alpha: float
) -> tuple[np.ndarray, np.ndarray]:
"""Return grad, hess for binary weighted objective for engine."""
label = train_data.get_label()
grad, hess = sklearn_binary_weighted_objective(
y_true=label, y_pred=pred, alpha=alpha
)
return grad, hess
return sklearn_binary_weighted_objective(y_true=label, y_pred=pred, alpha=alpha)


def sklearn_multiclass_focal_objective(
Expand All @@ -79,7 +77,7 @@ def sklearn_multiclass_focal_objective(
) -> tuple[np.ndarray, np.ndarray]:
"""Return grad, hess for multiclass focal objective for sklearn API."""
pred_prob = softmax(y_pred, axis=1)
y_true_onehot = np.eye(num_class)[y_true]
y_true_onehot = np.eye(num_class)[y_true.astype(int)]

# gradient
g1 = pred_prob * (1 - pred_prob)
Expand Down Expand Up @@ -110,7 +108,7 @@ def sklearn_multiclass_weighted_objective(
) -> tuple[np.ndarray, np.ndarray]:
"""Return grad, hess for multiclass weighted objective for sklearn API."""
pred_prob = softmax(y_pred, axis=1)
y_true_onehot = np.eye(num_class)[y_true]
y_true_onehot = np.eye(num_class)[y_true.astype(int)]
grad = -(alpha**y_true_onehot) * (y_true_onehot - pred_prob)
hess = (alpha**y_true_onehot) * pred_prob * (1.0 - pred_prob)
return grad, hess
Expand All @@ -123,28 +121,26 @@ def multiclass_focal_objective(
num_class: int,
) -> tuple[np.ndarray, np.ndarray]:
"""Return grad, hess for multiclass focal objective for engine."""
label = train_data.get_label().astype(int)
grad, hess = sklearn_multiclass_focal_objective(
label = train_data.get_label()
return sklearn_multiclass_focal_objective(
y_true=label,
y_pred=pred,
gamma=gamma,
num_class=num_class,
)
return grad, hess


def multiclass_weighted_objective(
pred: np.ndarray,
train_data: Dataset,
alpha: float,
num_class: int,
) -> tuple[str, float, bool]:
) -> tuple[np.ndarray, np.ndarray]:
"""Return grad, hess for multiclass weighted objective for engine."""
label = train_data.get_label().astype(int)
grad, hess = sklearn_multiclass_weighted_objective(
label = train_data.get_label()
return sklearn_multiclass_weighted_objective(
y_true=label,
y_pred=pred,
alpha=alpha,
num_class=num_class,
)
return grad, hess
54 changes: 42 additions & 12 deletions imlightgbm/sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
from imlightgbm.objective.core import (
sklearn_binary_focal_objective,
sklearn_binary_weighted_objective,
sklearn_multiclass_focal_objective,
sklearn_multiclass_weighted_objective,
)
from imlightgbm.utils import validate_positive_number

Expand Down Expand Up @@ -44,6 +46,7 @@ def __init__(
random_state: int | np.random.RandomState | np.random.Generator | None = None,
n_jobs: int | None = None,
importance_type: str = "split",
num_class: int | None = None,
) -> None:
"""Construct a gradient boosting model.

Expand All @@ -52,20 +55,42 @@ def __init__(
objective : str
Specify the learning objective. Options are 'binary_focal', 'binary_weighted', 'multiclass_focal' and 'multiclass_weighted'.
alpha: float
For the 'binary_weighted' and 'multiclass_weighted' objectives.
gamma: float
Check http://lightgbm.readthedocs.io/en/latest/Parameters.html for more other parameters.
For the 'binary_focal' and 'multiclass_focal' objectives.
other parameters:
Check http://lightgbm.readthedocs.io/en/latest/Parameters.html for more details.
"""
validate_positive_number(alpha)
validate_positive_number(gamma)

self.alpha = alpha
self.gamma = gamma
self.num_class = num_class
_objective = Objective.get(objective)
_OBJECTIVE_MAPPER: dict[Objective, _SklearnObjLike] = {
if _objective in {
Objective.multiclass_focal,
Objective.multiclass_weighted,
} and not isinstance(num_class, int):
raise ValueError("num_class must be provided")

_objective_mapper: dict[Objective, _SklearnObjLike] = {
Objective.binary_focal: lambda y_true,
y_pred: sklearn_binary_focal_objective(y_true, y_pred, gamma=gamma),
y_pred: sklearn_binary_focal_objective(
y_true=y_true, y_pred=y_pred, gamma=gamma
),
Objective.binary_weighted: lambda y_true,
y_pred: sklearn_binary_weighted_objective(y_true, y_pred, alpha=alpha),
y_pred: sklearn_binary_weighted_objective(
y_true=y_true, y_pred=y_pred, alpha=alpha
),
Objective.multiclass_focal: lambda y_true,
y_pred: sklearn_multiclass_focal_objective(
y_true=y_true, y_pred=y_pred, gamma=gamma, num_class=num_class
),
Objective.multiclass_weighted: lambda y_true,
y_pred: sklearn_multiclass_weighted_objective(
y_true=y_true, y_pred=y_pred, alpha=alpha, num_class=num_class
),
}
super().__init__(
boosting_type=boosting_type,
Expand All @@ -74,7 +99,7 @@ def __init__(
learning_rate=learning_rate,
n_estimators=n_estimators,
subsample_for_bin=subsample_for_bin,
objective=_OBJECTIVE_MAPPER[_objective],
objective=_objective_mapper[_objective],
class_weight=class_weight,
min_split_gain=min_split_gain,
min_child_weight=min_child_weight,
Expand Down Expand Up @@ -102,7 +127,7 @@ def predict(
**kwargs: Any,
) -> np.ndarray | spmatrix | list[spmatrix]:
"""Docstring is inherited from the LGBMClassifier."""
result = super().predict(
_predict = super().predict(
X=X,
raw_score=raw_score,
start_iteration=start_iteration,
Expand All @@ -112,13 +137,18 @@ def predict(
validate_features=validate_features,
**kwargs,
)
if raw_score or pred_leaf or pred_contrib:
return result
if (
raw_score
or pred_leaf
or pred_contrib
or isinstance(_predict, spmatrix | list)
):
return _predict

if self._LGBMClassifier__is_multiclass: # TODO: multiclass
class_index = np.argmax(result, axis=1)
return self._LGBMClassifier_le.inverse_transform(class_index)
if self._LGBMClassifier__is_multiclass:
class_index = np.argmax(_predict, axis=1)
return self._le.inverse_transform(class_index)
else:
return expit(result)
return expit(_predict)

predict.__doc__ = LGBMClassifier.predict.__doc__
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "imlightgbm"
version = "0.0.4"
version = "0.1.0"
description = "LightGBM for label-imbalanced data with focal and weighted loss function"
authors = ["RektPunk <rektpunk@gmail.com>"]
license = "MIT"
Expand Down