[Feature] introduce weighted loss #8

Merged · 8 commits · Sep 18, 2024
22 changes: 16 additions & 6 deletions experiments/basic.py
@@ -31,21 +31,33 @@
"early_stopping_rounds": 10,
}

# # Train standard LightGBM model
# Train standard LightGBM model
bst_standard = lgb.train(
params_standard, train_data, num_boost_round=100, valid_sets=[test_data]
)

# Parameters for Imbalanced LightGBM model
params_imbalanced = {
"objective": "weighted", # focal
"metric": "binary_logloss", # auc
"learning_rate": 0.05,
"num_leaves": 31,
"feature_fraction": 0.9,
"bagging_fraction": 0.8,
"bagging_freq": 5,
"seed": 42,
"early_stopping_rounds": 10,
}

bst_focal = imlgb.train(
params_standard, train_data, num_boost_round=100, valid_sets=[test_data]
params_imbalanced, train_data, num_boost_round=100, valid_sets=[test_data]
)

# Predict using the standard LightGBM model
# Predict using standard LightGBM model
y_pred_standard = bst_standard.predict(X_test)
y_pred_standard_binary = (y_pred_standard > 0.5).astype(int)

# Predict using the focal loss model
# Predict using Imbalanced LightGBM model
y_pred_focal = bst_focal.predict(X_test)
y_pred_focal_binary = (y_pred_focal > 0.5).astype(int)

@@ -64,5 +76,3 @@
 print(
     f"LightGBM with Focal Loss - Accuracy: {accuracy_focal:.4f}, Log Loss: {logloss_focal:.4f}, rocauc: {rocauc_focal:.4f}"
 )
-# Standard LightGBM - Accuracy: 0.9737, Log Loss: 0.1029, rocauc: 0.9931
-# LightGBM with Focal Loss - Accuracy: 0.8158, Log Loss: 0.6955, rocauc: 0.9843
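For reviewers who want to try the branch, a minimal end-to-end sketch of how the new objective is selected purely through `params`. The synthetic dataset is an assumption for illustration; only `imlgb.train` and the `objective`/`alpha`/`metric` keys come from this diff:

```python
import lightgbm as lgb
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

import imlightgbm as imlgb  # import alias assumed, as in experiments/basic.py

# Imbalanced toy data (assumption: 95/5 class split, for illustration only)
X, y = make_classification(n_samples=5000, weights=[0.95, 0.05], random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

params = {
    "objective": "weighted",  # new weighted cross-entropy; "focal" also supported
    "metric": "binary_logloss",
    "alpha": 0.25,  # class-weight parameter, popped by set_params (ALPHA_DEFAULT)
    "learning_rate": 0.05,
    "seed": 42,
}
# set_params() swaps the string objective/metric for the matching fobj/feval pair.
bst = imlgb.train(params, train_data, num_boost_round=100, valid_sets=[test_data])
print(bst.predict(X_test)[:5])
```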
2 changes: 1 addition & 1 deletion imlightgbm/__init__.py
@@ -1,2 +1,2 @@
 # ruff: noqa
-from imlightgbm.engine import cv, optimize, train
+from imlightgbm.engine import cv, train
35 changes: 35 additions & 0 deletions imlightgbm/base.py
@@ -0,0 +1,35 @@
+from enum import Enum
+
+
+class BaseEnum(str, Enum):
+    @classmethod
+    def get(cls, text: str) -> Enum:
+        cls.__check_valid(text)
+        return cls[text]
+
+    @classmethod
+    def __check_valid(cls, text: str) -> None:
+        if text not in cls._member_map_.keys():
+            valid_members = ", ".join(list(cls._member_map_.keys()))
+            raise ValueError(
+                f"Invalid value: '{text}'. Expected one of: {valid_members}."
+            )
+
+
+class SupportedTask(BaseEnum):
+    binary: str = "binary"
+    multiclass: str = "multiclass"
+
+
+class Metric(BaseEnum):
+    auc: str = "auc"
+    binary_logloss: str = "binary_logloss"
+    binary_error: str = "binary_error"
+    auc_mu: str = "auc_mu"
+    multi_logloss: str = "multi_logloss"
+    multi_error: str = "multi_error"
+
+
+class Objective(BaseEnum):
+    focal: str = "focal"
+    weighted: str = "weighted"
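A quick illustration of the validation behavior these enums centralize (the session below is illustrative, not part of the diff):

```python
from imlightgbm.base import Metric, Objective

assert Objective.get("weighted") is Objective.weighted
assert Metric.get("auc").value == "auc"

# Unknown names fail fast, listing the valid members:
Metric.get("logloss")  # raises:
# ValueError: Invalid value: 'logloss'. Expected one of: auc, binary_logloss, binary_error, auc_mu, multi_logloss, multi_error.
```

Because `BaseEnum` subclasses `str`, members also compare equal to their raw values (`SupportedTask.binary == "binary"`), which keeps `SupportedTask.get(...)` compatible with the strings returned by `type_of_target`.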
23 changes: 1 addition & 22 deletions imlightgbm/docstring.py
@@ -11,16 +11,13 @@
"nfold": f"int, optional (default=5){_space}Number of folds in CV.",
"stratified": f"bool, optional (default=True){_space}Whether to perform stratified sampling.",
"shuffle": f"bool, optional (default=True){_space}Whether to shuffle before splitting data.",
"metrics": f"str, list of str, or None, optional (default=None){_space}Evaluation metrics to be monitored while CV.",
"init_model": f"str, pathlib.Path, Booster or None, optional (default=None){_space}Filename of LightGBM model or Booster instance used for continue training.",
"fpreproc": f"callable or None, optional (default=None){_space}Preprocessing function that takes (dtrain, dtest, params) and returns transformed versions of those.",
"seed": f"int, optional (default=0){_space}Seed used to generate the folds (passed to numpy.random.seed).",
"keep_training_booster": f"bool, optional (default=False){_space}Whether the returned Booster will be used to keep training.{_space}If False, the returned value will be converted into _InnerPredictor before returning.{_space}This means you won't be able to use ``eval``, ``eval_train`` or ``eval_valid`` methods of the returned Booster.{_space}When your model is very large and cause the memory error,{_space}you can try to set this param to ``True`` to avoid the model conversion performed during the internal call of ``model_to_string``.{_space}You can still use _InnerPredictor as ``init_model`` for future continue training.",
"callbacks": f"list of callable, or None, optional (default=None){_space}List of callback functions that are applied at each iteration.{_space}See Callbacks in Python API for more information.",
"eval_train_metric": f"bool, optional (default=False){_space}Whether to display the train metric in progress.",
"return_cvbooster": f"bool, optional (default=False){_space}Whether to return Booster models trained on each fold through ``CVBooster``.",
"keep_training_booster": f"bool, optional (default=False){_space}Whether the returned Booster will be used to keep training.{_space}If False, the returned value will be converted into _InnerPredictor before returning.{_space}This means you won't be able to use ``eval``, ``eval_train`` or ``eval_valid`` methods of the returned Booster.{_space}When your model is very large and cause the memory error,{_space}you can try to set this param to ``True`` to avoid the model conversion performed during the internal call of ``model_to_string``.{_space}You can still use _InnerPredictor as ``init_model`` for future continue training.",
"num_trials": f"int, optional (default=10){_space}Number of hyperparameter tuning trials.",
"get_params": 'callable, optional (default=get_params)\n Number of hyperparameter tuning trials.\n def get_params(trial: optuna.Trial):\n return {\n "alpha": trial.suggest_float("alpha", 0.25, 0.75),\n "gamma": trial.suggest_float("gamma", 0.0, 3.0),\n "num_leaves": trial.suggest_int("num_leaves", 20, 150),\n "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1),\n "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),\n "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),\n "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),\n }',
}


@@ -58,24 +55,6 @@
         ],
         "return_description": "eval_results: dict\n History of evaluation results of each metric.\n The dictionary has the following format:\n {'valid metric1-mean': [values], 'valid metric1-stdv': [values],\n 'valid metric2-mean': [values], 'valid metric2-stdv': [values],\n ...}.\n If ``return_cvbooster=True``, also returns trained boosters wrapped in a ``CVBooster`` object via ``cvbooster`` key.\n If ``eval_train_metric=True``, also returns the train metric history.\n In this case, the dictionary has the following format:\n {'train metric1-mean': [values], 'valid metric1-mean': [values],\n 'train metric2-mean': [values], 'valid metric2-mean': [values],\n ...}.",
     },
-    "optimize": {
-        "description": "Perform the hyperparameter tuning with optuna.",
-        "selected_params": [
-            "train_set",
-            "num_trials",
-            "num_boost_round",
-            "folds",
-            "nfold",
-            "stratified",
-            "shuffle",
-            "get_params",
-            "init_model",
-            "fpreproc",
-            "seed",
-            "callbacks",
-        ],
-        "return_description": f"study: optuna.Study{_space}study.best_params{_space}study.best_value",
-    },
 }


47 changes: 1 addition & 46 deletions imlightgbm/engine.py
@@ -3,11 +3,10 @@

 import lightgbm as lgb
 import numpy as np
-import optuna
 from sklearn.model_selection import BaseCrossValidator

 from imlightgbm.docstring import add_docstring
-from imlightgbm.objective import get_params, set_params
+from imlightgbm.objective import set_params


 @add_docstring("train")
@@ -70,47 +69,3 @@ def cv(
         eval_train_metric=eval_train_metric,
         return_cvbooster=return_cvbooster,
     )
-
-
-@add_docstring("optimize")
-def optimize(
-    train_set: lgb.Dataset,
-    num_trials: int = 10,
-    num_boost_round: int = 100,
-    folds: Iterable[tuple[np.ndarray, np.ndarray]] | BaseCrossValidator | None = None,
-    nfold: int = 5,
-    stratified: bool = True,
-    shuffle: bool = True,
-    get_params: Callable[[optuna.Trial], dict[str, Any]] = get_params,
-    init_model: str | lgb.Path | lgb.Booster | None = None,
-    fpreproc: Callable[
-        [lgb.Dataset, lgb.Dataset, dict[str, Any]],
-        tuple[lgb.Dataset, lgb.Dataset, dict[str, Any]],
-    ]
-    | None = None,
-    seed: int = 0,
-    callbacks: list[Callable] | None = None,
-) -> optuna.Study:
-    def _objective(trial: optuna.Trial):
-        """Optuna objective function."""
-        params = get_params(trial)
-        cv_results = cv(
-            params=params,
-            train_set=train_set,
-            num_boost_round=num_boost_round,
-            folds=folds,
-            nfold=nfold,
-            stratified=stratified,
-            shuffle=shuffle,
-            init_model=init_model,
-            fpreproc=fpreproc,
-            seed=seed,
-            callbacks=callbacks,
-        )
-        _keys = [_ for _ in cv_results.keys() if _.endswith("mean")]
-        assert len(_keys) == 1
-        return min(cv_results[_keys[0]])
-
-    study = optuna.create_study(direction="minimize")
-    study.optimize(_objective, n_trials=num_trials)
-    return study
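With `optimize` removed, the package no longer imports `optuna`; callers who relied on it can recreate the loop on top of the public `cv` in a few lines. A minimal sketch, assuming `optuna` is installed separately — the `tune` helper and its search space are illustrative, not package API:

```python
import lightgbm as lgb
import optuna

from imlightgbm.engine import cv


def tune(train_set: lgb.Dataset, num_trials: int = 10) -> optuna.Study:
    def _objective(trial: optuna.Trial) -> float:
        params = {
            "objective": "weighted",
            # binary_logloss so that "minimize" below is the right direction;
            # the post-PR default metric for binary targets is auc.
            "metric": "binary_logloss",
            "alpha": trial.suggest_float("alpha", 0.25, 0.75),
            "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1),
        }
        results = cv(params=params, train_set=train_set, num_boost_round=100)
        mean_keys = [k for k in results if k.endswith("mean")]
        return min(results[mean_keys[0]])  # best mean validation loss

    study = optuna.create_study(direction="minimize")
    study.optimize(_objective, n_trials=num_trials)
    return study
```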
130 changes: 78 additions & 52 deletions imlightgbm/objective.py
@@ -3,13 +3,11 @@
 from typing import Any, Callable

 import numpy as np
-import optuna
 from lightgbm import Dataset
 from sklearn.utils.multiclass import type_of_target

-from imlightgbm.utils import logger
+from imlightgbm.base import Metric, Objective, SupportedTask

-EvalLike = Callable[[np.ndarray, Dataset], tuple[str, float, bool]]
 ObjLike = Callable[[np.ndarray, Dataset], tuple[np.ndarray, np.ndarray]]
 ALPHA_DEFAULT: float = 0.25
 GAMMA_DEFAULT: float = 2.0
@@ -37,7 +35,7 @@ def _sigmoid(x: np.ndarray) -> np.ndarray:
 def binary_focal_objective(
     pred: np.ndarray, train_data: Dataset, gamma: float
 ) -> tuple[np.ndarray, np.ndarray]:
-    """Return binary focal objective."""
+    """Return grad, hess for binary focal objective."""
     label = train_data.get_label()
     pred_prob = _sigmoid(pred)

@@ -58,17 +56,13 @@
     return grad, hess


-def binary_focal_eval(
-    pred: np.ndarray, train_data: Dataset, alpha: float, gamma: float
-) -> tuple[str, float, bool]:
-    """Return binary focal eval."""
+def binary_weighted_objective(pred: np.ndarray, train_data: Dataset, alpha: float):
+    """Return grad, hess for binary weighted objective."""
     label = train_data.get_label()
     pred_prob = _sigmoid(pred)
-    p_t = np.where(label == 1, pred_prob, 1 - pred_prob)
-    loss = -alpha * ((1 - p_t) ** gamma) * _log(p_t, True)
-
-    focal_loss = np.mean(loss)
-    return "focal", focal_loss, IS_HIGHER_BETTER
+    grad = -(alpha**label) * (label - pred_prob)
+    hess = (alpha**label) * pred_prob * (1.0 - pred_prob)
+    return grad, hess


 def multiclass_focal_objective(
@@ -78,59 +72,91 @@
     return


-def multiclass_focal_eval(
+def multiclass_weighted_objective(
     pred: np.ndarray, train_data: Dataset, alpha: float, gamma: float
 ) -> tuple[str, float, bool]:
     # TODO
     return


-def _set_fobj_feval(
-    train_set: Dataset, alpha: float, gamma: float
-) -> tuple[ObjLike, EvalLike]:
-    """Return obj and eval with respect to task type."""
-    inferred_task = type_of_target(train_set.get_label())
-    if inferred_task not in {"binary", "multiclass"}:
-        raise ValueError(
-            f"Invalid target type: {inferred_task}. Supported types are 'binary' or 'multiclass'."
-        )
-    objective_mapper: dict[str, ObjLike] = {
-        "binary": partial(binary_focal_objective, gamma=gamma),
-        "multiclass": partial(multiclass_focal_objective, alpha=alpha, gamma=gamma),
+def _get_metric(task_enum: SupportedTask, metric: str | None) -> str:
+    """Retrieve the appropriate metric function based on task."""
+    metric_mapper: dict[SupportedTask, list[Metric]] = {
+        SupportedTask.binary: [Metric.auc, Metric.binary_error, Metric.binary_logloss],
+        SupportedTask.multiclass: [
+            Metric.auc_mu,
+            Metric.multi_logloss,
+            Metric.multi_error,
+        ],
     }
-    eval_mapper: dict[str, EvalLike] = {
-        "binary": "binary_logloss",
-        "multiclass": "multi_logloss",
+    if metric:
+        metric_enum = Metric.get(metric)
+        metric_enums = metric_mapper[task_enum]
+        if metric_enum not in metric_enums:
+            valid_metrics = ", ".join([m.value for m in metric_enums])
+            raise ValueError(f"Invalid metric: Supported metrics are {valid_metrics}")
+        return metric_enum.value
+
+    return metric_mapper[task_enum][0].value
+
+
+def _get_objective(
+    task_enum: SupportedTask, objective: str | None, alpha: float, gamma: float
+) -> ObjLike:
+    """Retrieve the appropriate objective function based on task and objective type."""
+    objective_mapper: dict[SupportedTask, dict[Objective, ObjLike]] = {
+        SupportedTask.binary: {
+            Objective.focal: partial(binary_focal_objective, gamma=gamma),
+            Objective.weighted: partial(binary_weighted_objective, alpha=alpha),
+        },
+        SupportedTask.multiclass: {
+            Objective.focal: partial(
+                multiclass_focal_objective, alpha=alpha, gamma=gamma
+            ),
+            Objective.weighted: partial(
+                multiclass_weighted_objective, alpha=alpha, gamma=gamma
+            ),
+        },
     }
-    fobj = objective_mapper[inferred_task]
-    feval = eval_mapper[inferred_task]
+    if objective:
+        objective_enum = Objective.get(objective)
+        return objective_mapper[task_enum][objective_enum]
+
+    return objective_mapper[task_enum][Objective.focal]
+
+
+def _get_fobj_feval(
+    train_set: Dataset,
+    alpha: float,
+    gamma: float,
+    objective: str | None,
+    metric: str | None,
+) -> tuple[ObjLike, str]:
+    """Return obj and eval with respect to task type."""
+    _task = type_of_target(train_set.get_label())
+    task_enum = SupportedTask.get(_task)
+    feval = _get_metric(task_enum=task_enum, metric=metric)
+    fobj = _get_objective(
+        task_enum=task_enum, objective=objective, alpha=alpha, gamma=gamma
+    )
     return fobj, feval


 def set_params(params: dict[str, Any], train_set: Dataset) -> dict[str, Any]:
     """Set eval function and objective in params."""
     _params = deepcopy(params)
-    if OBJECTIVE_STR in _params:
-        logger.warning(f"'{OBJECTIVE_STR}' exists in params will not used.")
-        del _params[OBJECTIVE_STR]
-
-    _alpha = _params.pop("alpha", ALPHA_DEFAULT)
-    _gamma = _params.pop("gamma", GAMMA_DEFAULT)
-
-    fobj, feval = _set_fobj_feval(train_set=train_set, alpha=_alpha, gamma=_gamma)
+    _objective = _params.pop(OBJECTIVE_STR, None)
+    _metric = _params.pop(METRIC_STR, None)
+
+    if _metric and not isinstance(_metric, str):
+        raise ValueError("metric must be str")
+
+    fobj, feval = _get_fobj_feval(
+        train_set=train_set,
+        alpha=_params.pop("alpha", ALPHA_DEFAULT),
+        gamma=_params.pop("gamma", GAMMA_DEFAULT),
+        objective=_objective,
+        metric=_metric,
+    )
     _params.update({OBJECTIVE_STR: fobj, METRIC_STR: feval})
     return _params
-
-
-def get_params(trial: optuna.Trial) -> dict[str, Any]:
-    """Get default params."""
-    return {
-        "alpha": trial.suggest_float("alpha", 0.25, 0.75),
-        "gamma": trial.suggest_float("gamma", 0.0, 3.0),
-        "num_leaves": trial.suggest_int("num_leaves", 20, 150),
-        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1),
-        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
-        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
-        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
-    }
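One way to review the new `binary_weighted_objective` is to note that it is the gradient/hessian pair of the loss `-(alpha**y) * (y*log(p) + (1-y)*log(1-p))`, i.e. plain cross-entropy with the positive class rescaled by `alpha` (so the default `alpha=0.25` down-weights positives; values above 1 would up-weight them). A standalone finite-difference check of that claim — this script is illustrative and not part of the PR:

```python
import numpy as np


def _sigmoid(x: np.ndarray) -> np.ndarray:
    return 1.0 / (1.0 + np.exp(-x))


def weighted_loss(pred: np.ndarray, label: np.ndarray, alpha: float) -> np.ndarray:
    # Per-sample loss implied by binary_weighted_objective.
    prob = _sigmoid(pred)
    return -(alpha**label) * (label * np.log(prob) + (1 - label) * np.log(1 - prob))


rng = np.random.default_rng(42)
pred = rng.normal(size=1000)                    # raw scores, pre-sigmoid
label = (rng.random(1000) < 0.1).astype(float)  # imbalanced labels
alpha, eps = 0.25, 1e-4

# Analytic grad/hess, as in the PR but on raw arrays instead of a lgb.Dataset
prob = _sigmoid(pred)
grad = -(alpha**label) * (label - prob)
hess = (alpha**label) * prob * (1.0 - prob)

# Central finite differences with respect to the raw score
f_plus = weighted_loss(pred + eps, label, alpha)
f_minus = weighted_loss(pred - eps, label, alpha)
num_grad = (f_plus - f_minus) / (2 * eps)
num_hess = (f_plus - 2 * weighted_loss(pred, label, alpha) + f_minus) / eps**2

assert np.allclose(grad, num_grad, atol=1e-6)
assert np.allclose(hess, num_hess, atol=1e-4)
print("grad/hess match finite differences")
```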
10 changes: 0 additions & 10 deletions imlightgbm/utils.py

This file was deleted.
