[Feature] introduce weighted loss #8

Merged · 8 commits · Sep 18, 2024
22 changes: 16 additions & 6 deletions experiments/basic.py
@@ -31,21 +31,33 @@
"early_stopping_rounds": 10,
}

# # Train standard LightGBM model
# Train standard LightGBM model
bst_standard = lgb.train(
params_standard, train_data, num_boost_round=100, valid_sets=[test_data]
)

# Parameters for Imbalanced LightGBM model
params_imbalanced = {
"objective": "weighted", # focal
"metric": "binary_logloss", # auc
"learning_rate": 0.05,
"num_leaves": 31,
"feature_fraction": 0.9,
"bagging_fraction": 0.8,
"bagging_freq": 5,
"seed": 42,
"early_stopping_rounds": 10,
}

bst_focal = imlgb.train(
params_standard, train_data, num_boost_round=100, valid_sets=[test_data]
params_imbalanced, train_data, num_boost_round=100, valid_sets=[test_data]
)

# Predict using the standard LightGBM model
# Predict using standard LightGBM model
y_pred_standard = bst_standard.predict(X_test)
y_pred_standard_binary = (y_pred_standard > 0.5).astype(int)

# Predict using the focal loss model
# Predict using Imbalanced LightGBM model
y_pred_focal = bst_focal.predict(X_test)
y_pred_focal_binary = (y_pred_focal > 0.5).astype(int)

@@ -64,5 +76,3 @@
 print(
     f"LightGBM with Focal Loss - Accuracy: {accuracy_focal:.4f}, Log Loss: {logloss_focal:.4f}, rocauc: {rocauc_focal:.4f}"
 )
-# Standard LightGBM - Accuracy: 0.9737, Log Loss: 0.1029, rocauc: 0.9931
-# LightGBM with Focal Loss - Accuracy: 0.8158, Log Loss: 0.6955, rocauc: 0.9843
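For reviewers who want to try the branch, a minimal end-to-end sketch of how the new objective is selected purely through `params`. The synthetic dataset is an assumption for illustration; only `imlgb.train` and the `objective`/`alpha`/`metric` keys come from this diff:

```python
import lightgbm as lgb
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

import imlightgbm as imlgb  # import alias assumed, as in experiments/basic.py

# Imbalanced toy data (assumption: 95/5 class split, for illustration only)
X, y = make_classification(n_samples=5000, weights=[0.95, 0.05], random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

params = {
    "objective": "weighted",  # new weighted cross-entropy; "focal" also supported
    "metric": "binary_logloss",
    "alpha": 0.25,  # class-weight parameter, popped by set_params (ALPHA_DEFAULT)
    "learning_rate": 0.05,
    "seed": 42,
}
# set_params() swaps the string objective/metric for the matching fobj/feval pair.
bst = imlgb.train(params, train_data, num_boost_round=100, valid_sets=[test_data])
print(bst.predict(X_test)[:5])
```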
2 changes: 1 addition & 1 deletion imlightgbm/__init__.py
@@ -1,2 +1,2 @@
 # ruff: noqa
-from imlightgbm.engine import cv, optimize, train
+from imlightgbm.engine import cv, train
35 changes: 35 additions & 0 deletions imlightgbm/base.py
@@ -0,0 +1,35 @@
+from enum import Enum
+
+
+class BaseEnum(str, Enum):
+    @classmethod
+    def get(cls, text: str) -> Enum:
+        cls.__check_valid(text)
+        return cls[text]
+
+    @classmethod
+    def __check_valid(cls, text: str) -> None:
+        if text not in cls._member_map_.keys():
+            valid_members = ", ".join(list(cls._member_map_.keys()))
+            raise ValueError(
+                f"Invalid value: '{text}'. Expected one of: {valid_members}."
+            )
+
+
+class SupportedTask(BaseEnum):
+    binary: str = "binary"
+    multiclass: str = "multiclass"
+
+
+class Metric(BaseEnum):
+    auc: str = "auc"
+    binary_logloss: str = "binary_logloss"
+    binary_error: str = "binary_error"
+    auc_mu: str = "auc_mu"
+    multi_logloss: str = "multi_logloss"
+    multi_error: str = "multi_error"
+
+
+class Objective(BaseEnum):
+    focal: str = "focal"
+    weighted: str = "weighted"
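A quick illustration of the validation behavior these enums centralize (the session below is illustrative, not part of the diff):

```python
from imlightgbm.base import Metric, Objective

assert Objective.get("weighted") is Objective.weighted
assert Metric.get("auc").value == "auc"

# Unknown names fail fast, listing the valid members:
Metric.get("logloss")  # raises:
# ValueError: Invalid value: 'logloss'. Expected one of: auc, binary_logloss, binary_error, auc_mu, multi_logloss, multi_error.
```

Because `BaseEnum` subclasses `str`, members also compare equal to their raw values (`SupportedTask.binary == "binary"`), which keeps `SupportedTask.get(...)` compatible with the strings returned by `type_of_target`.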
23 changes: 1 addition & 22 deletions imlightgbm/docstring.py
@@ -11,16 +11,13 @@
"nfold": f"int, optional (default=5){_space}Number of folds in CV.",
"stratified": f"bool, optional (default=True){_space}Whether to perform stratified sampling.",
"shuffle": f"bool, optional (default=True){_space}Whether to shuffle before splitting data.",
"metrics": f"str, list of str, or None, optional (default=None){_space}Evaluation metrics to be monitored while CV.",
"init_model": f"str, pathlib.Path, Booster or None, optional (default=None){_space}Filename of LightGBM model or Booster instance used for continue training.",
"fpreproc": f"callable or None, optional (default=None){_space}Preprocessing function that takes (dtrain, dtest, params) and returns transformed versions of those.",
"seed": f"int, optional (default=0){_space}Seed used to generate the folds (passed to numpy.random.seed).",
"keep_training_booster": f"bool, optional (default=False){_space}Whether the returned Booster will be used to keep training.{_space}If False, the returned value will be converted into _InnerPredictor before returning.{_space}This means you won't be able to use ``eval``, ``eval_train`` or ``eval_valid`` methods of the returned Booster.{_space}When your model is very large and cause the memory error,{_space}you can try to set this param to ``True`` to avoid the model conversion performed during the internal call of ``model_to_string``.{_space}You can still use _InnerPredictor as ``init_model`` for future continue training.",
"callbacks": f"list of callable, or None, optional (default=None){_space}List of callback functions that are applied at each iteration.{_space}See Callbacks in Python API for more information.",
"eval_train_metric": f"bool, optional (default=False){_space}Whether to display the train metric in progress.",
"return_cvbooster": f"bool, optional (default=False){_space}Whether to return Booster models trained on each fold through ``CVBooster``.",
"keep_training_booster": f"bool, optional (default=False){_space}Whether the returned Booster will be used to keep training.{_space}If False, the returned value will be converted into _InnerPredictor before returning.{_space}This means you won't be able to use ``eval``, ``eval_train`` or ``eval_valid`` methods of the returned Booster.{_space}When your model is very large and cause the memory error,{_space}you can try to set this param to ``True`` to avoid the model conversion performed during the internal call of ``model_to_string``.{_space}You can still use _InnerPredictor as ``init_model`` for future continue training.",
"num_trials": f"int, optional (default=10){_space}Number of hyperparameter tuning trials.",
"get_params": 'callable, optional (default=get_params)\n Number of hyperparameter tuning trials.\n def get_params(trial: optuna.Trial):\n return {\n "alpha": trial.suggest_float("alpha", 0.25, 0.75),\n "gamma": trial.suggest_float("gamma", 0.0, 3.0),\n "num_leaves": trial.suggest_int("num_leaves", 20, 150),\n "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1),\n "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),\n "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),\n "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),\n }',
}


@@ -58,24 +55,6 @@
         ],
         "return_description": "eval_results: dict\n History of evaluation results of each metric.\n The dictionary has the following format:\n {'valid metric1-mean': [values], 'valid metric1-stdv': [values],\n 'valid metric2-mean': [values], 'valid metric2-stdv': [values],\n ...}.\n If ``return_cvbooster=True``, also returns trained boosters wrapped in a ``CVBooster`` object via ``cvbooster`` key.\n If ``eval_train_metric=True``, also returns the train metric history.\n In this case, the dictionary has the following format:\n {'train metric1-mean': [values], 'valid metric1-mean': [values],\n 'train metric2-mean': [values], 'valid metric2-mean': [values],\n ...}.",
     },
-    "optimize": {
-        "description": "Perform the hyperparameter tuning with optuna.",
-        "selected_params": [
-            "train_set",
-            "num_trials",
-            "num_boost_round",
-            "folds",
-            "nfold",
-            "stratified",
-            "shuffle",
-            "get_params",
-            "init_model",
-            "fpreproc",
-            "seed",
-            "callbacks",
-        ],
-        "return_description": f"study: optuna.Study{_space}study.best_params{_space}study.best_value",
-    },
 }


47 changes: 1 addition & 46 deletions imlightgbm/engine.py
@@ -3,11 +3,10 @@

 import lightgbm as lgb
 import numpy as np
-import optuna
 from sklearn.model_selection import BaseCrossValidator

 from imlightgbm.docstring import add_docstring
-from imlightgbm.objective import get_params, set_params
+from imlightgbm.objective import set_params


 @add_docstring("train")
@@ -70,47 +69,3 @@ def cv(
         eval_train_metric=eval_train_metric,
         return_cvbooster=return_cvbooster,
     )
-
-
-@add_docstring("optimize")
-def optimize(
-    train_set: lgb.Dataset,
-    num_trials: int = 10,
-    num_boost_round: int = 100,
-    folds: Iterable[tuple[np.ndarray, np.ndarray]] | BaseCrossValidator | None = None,
-    nfold: int = 5,
-    stratified: bool = True,
-    shuffle: bool = True,
-    get_params: Callable[[optuna.Trial], dict[str, Any]] = get_params,
-    init_model: str | lgb.Path | lgb.Booster | None = None,
-    fpreproc: Callable[
-        [lgb.Dataset, lgb.Dataset, dict[str, Any]],
-        tuple[lgb.Dataset, lgb.Dataset, dict[str, Any]],
-    ]
-    | None = None,
-    seed: int = 0,
-    callbacks: list[Callable] | None = None,
-) -> optuna.Study:
-    def _objective(trial: optuna.Trial):
-        """Optuna objective function."""
-        params = get_params(trial)
-        cv_results = cv(
-            params=params,
-            train_set=train_set,
-            num_boost_round=num_boost_round,
-            folds=folds,
-            nfold=nfold,
-            stratified=stratified,
-            shuffle=shuffle,
-            init_model=init_model,
-            fpreproc=fpreproc,
-            seed=seed,
-            callbacks=callbacks,
-        )
-        _keys = [_ for _ in cv_results.keys() if _.endswith("mean")]
-        assert len(_keys) == 1
-        return min(cv_results[_keys[0]])
-
-    study = optuna.create_study(direction="minimize")
-    study.optimize(_objective, n_trials=num_trials)
-    return study
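With `optimize` removed, the package no longer imports `optuna`; callers who relied on it can recreate the loop on top of the public `cv` in a few lines. A minimal sketch, assuming `optuna` is installed separately — the `tune` helper and its search space are illustrative, not package API:

```python
import lightgbm as lgb
import optuna

from imlightgbm.engine import cv


def tune(train_set: lgb.Dataset, num_trials: int = 10) -> optuna.Study:
    def _objective(trial: optuna.Trial) -> float:
        params = {
            "objective": "weighted",
            # binary_logloss so that "minimize" below is the right direction;
            # the post-PR default metric for binary targets is auc.
            "metric": "binary_logloss",
            "alpha": trial.suggest_float("alpha", 0.25, 0.75),
            "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1),
        }
        results = cv(params=params, train_set=train_set, num_boost_round=100)
        mean_keys = [k for k in results if k.endswith("mean")]
        return min(results[mean_keys[0]])  # best mean validation loss

    study = optuna.create_study(direction="minimize")
    study.optimize(_objective, n_trials=num_trials)
    return study
```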
130 changes: 78 additions & 52 deletions imlightgbm/objective.py
@@ -3,13 +3,11 @@
 from typing import Any, Callable

 import numpy as np
-import optuna
 from lightgbm import Dataset
 from sklearn.utils.multiclass import type_of_target

-from imlightgbm.utils import logger
+from imlightgbm.base import Metric, Objective, SupportedTask

-EvalLike = Callable[[np.ndarray, Dataset], tuple[str, float, bool]]
 ObjLike = Callable[[np.ndarray, Dataset], tuple[np.ndarray, np.ndarray]]
 ALPHA_DEFAULT: float = 0.25
 GAMMA_DEFAULT: float = 2.0
@@ -37,7 +35,7 @@ def _sigmoid(x: np.ndarray) -> np.ndarray:
 def binary_focal_objective(
     pred: np.ndarray, train_data: Dataset, gamma: float
 ) -> tuple[np.ndarray, np.ndarray]:
-    """Return binary focal objective."""
+    """Return grad, hess for binary focal objective."""
     label = train_data.get_label()
     pred_prob = _sigmoid(pred)

@@ -58,17 +56,13 @@
     return grad, hess


-def binary_focal_eval(
-    pred: np.ndarray, train_data: Dataset, alpha: float, gamma: float
-) -> tuple[str, float, bool]:
-    """Return binary focal eval."""
+def binary_weighted_objective(pred: np.ndarray, train_data: Dataset, alpha: float):
+    """Return grad, hess for binary weighted objective."""
     label = train_data.get_label()
     pred_prob = _sigmoid(pred)
-    p_t = np.where(label == 1, pred_prob, 1 - pred_prob)
-    loss = -alpha * ((1 - p_t) ** gamma) * _log(p_t, True)
-
-    focal_loss = np.mean(loss)
-    return "focal", focal_loss, IS_HIGHER_BETTER
+    grad = -(alpha**label) * (label - pred_prob)
+    hess = (alpha**label) * pred_prob * (1.0 - pred_prob)
+    return grad, hess


 def multiclass_focal_objective(
@@ -78,59 +72,91 @@
     return


-def multiclass_focal_eval(
+def multiclass_weighted_objective(
     pred: np.ndarray, train_data: Dataset, alpha: float, gamma: float
 ) -> tuple[str, float, bool]:
     # TODO
     return


-def _set_fobj_feval(
-    train_set: Dataset, alpha: float, gamma: float
-) -> tuple[ObjLike, EvalLike]:
-    """Return obj and eval with respect to task type."""
-    inferred_task = type_of_target(train_set.get_label())
-    if inferred_task not in {"binary", "multiclass"}:
-        raise ValueError(
-            f"Invalid target type: {inferred_task}. Supported types are 'binary' or 'multiclass'."
-        )
-    objective_mapper: dict[str, ObjLike] = {
-        "binary": partial(binary_focal_objective, gamma=gamma),
-        "multiclass": partial(multiclass_focal_objective, alpha=alpha, gamma=gamma),
+def _get_metric(task_enum: SupportedTask, metric: str | None) -> str:
+    """Retrieve the appropriate metric function based on task."""
+    metric_mapper: dict[SupportedTask, list[Metric]] = {
+        SupportedTask.binary: [Metric.auc, Metric.binary_error, Metric.binary_logloss],
+        SupportedTask.multiclass: [
+            Metric.auc_mu,
+            Metric.multi_logloss,
+            Metric.multi_error,
+        ],
     }
-    eval_mapper: dict[str, EvalLike] = {
-        "binary": "binary_logloss",
-        "multiclass": "multi_logloss",
+    if metric:
+        metric_enum = Metric.get(metric)
+        metric_enums = metric_mapper[task_enum]
+        if metric_enum not in metric_enums:
+            valid_metrics = ", ".join([m.value for m in metric_enums])
+            raise ValueError(f"Invalid metric: Supported metrics are {valid_metrics}")
+        return metric_enum.value
+
+    return metric_mapper[task_enum][0].value
+
+
+def _get_objective(
+    task_enum: SupportedTask, objective: str | None, alpha: float, gamma: float
+) -> ObjLike:
+    """Retrieve the appropriate objective function based on task and objective type."""
+    objective_mapper: dict[SupportedTask, dict[Objective, ObjLike]] = {
+        SupportedTask.binary: {
+            Objective.focal: partial(binary_focal_objective, gamma=gamma),
+            Objective.weighted: partial(binary_weighted_objective, alpha=alpha),
+        },
+        SupportedTask.multiclass: {
+            Objective.focal: partial(
+                multiclass_focal_objective, alpha=alpha, gamma=gamma
+            ),
+            Objective.weighted: partial(
+                multiclass_weighted_objective, alpha=alpha, gamma=gamma
+            ),
+        },
     }
-    fobj = objective_mapper[inferred_task]
-    feval = eval_mapper[inferred_task]
+    if objective:
+        objective_enum = Objective.get(objective)
+        return objective_mapper[task_enum][objective_enum]
+
+    return objective_mapper[task_enum][Objective.focal]
+
+
+def _get_fobj_feval(
+    train_set: Dataset,
+    alpha: float,
+    gamma: float,
+    objective: str | None,
+    metric: str | None,
+) -> tuple[ObjLike, str]:
+    """Return obj and eval with respect to task type."""
+    _task = type_of_target(train_set.get_label())
+    task_enum = SupportedTask.get(_task)
+    feval = _get_metric(task_enum=task_enum, metric=metric)
+    fobj = _get_objective(
+        task_enum=task_enum, objective=objective, alpha=alpha, gamma=gamma
+    )
     return fobj, feval


 def set_params(params: dict[str, Any], train_set: Dataset) -> dict[str, Any]:
     """Set eval function and objective in params."""
     _params = deepcopy(params)
-    if OBJECTIVE_STR in _params:
-        logger.warning(f"'{OBJECTIVE_STR}' exists in params will not used.")
-        del _params[OBJECTIVE_STR]
-
-    _alpha = _params.pop("alpha", ALPHA_DEFAULT)
-    _gamma = _params.pop("gamma", GAMMA_DEFAULT)
-
-    fobj, feval = _set_fobj_feval(train_set=train_set, alpha=_alpha, gamma=_gamma)
+    _objective = _params.pop(OBJECTIVE_STR, None)
+    _metric = _params.pop(METRIC_STR, None)
+
+    if _metric and not isinstance(_metric, str):
+        raise ValueError("metric must be str")
+
+    fobj, feval = _get_fobj_feval(
+        train_set=train_set,
+        alpha=_params.pop("alpha", ALPHA_DEFAULT),
+        gamma=_params.pop("gamma", GAMMA_DEFAULT),
+        objective=_objective,
+        metric=_metric,
+    )
     _params.update({OBJECTIVE_STR: fobj, METRIC_STR: feval})
     return _params
-
-
-def get_params(trial: optuna.Trial) -> dict[str, Any]:
-    """Get default params."""
-    return {
-        "alpha": trial.suggest_float("alpha", 0.25, 0.75),
-        "gamma": trial.suggest_float("gamma", 0.0, 3.0),
-        "num_leaves": trial.suggest_int("num_leaves", 20, 150),
-        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1),
-        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
-        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
-        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
-    }
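One way to review the new `binary_weighted_objective` is to note that it is the gradient/hessian pair of the loss `-(alpha**y) * (y*log(p) + (1-y)*log(1-p))`, i.e. plain cross-entropy with the positive class rescaled by `alpha` (so the default `alpha=0.25` down-weights positives; values above 1 would up-weight them). A standalone finite-difference check of that claim — this script is illustrative and not part of the PR:

```python
import numpy as np


def _sigmoid(x: np.ndarray) -> np.ndarray:
    return 1.0 / (1.0 + np.exp(-x))


def weighted_loss(pred: np.ndarray, label: np.ndarray, alpha: float) -> np.ndarray:
    # Per-sample loss implied by binary_weighted_objective.
    prob = _sigmoid(pred)
    return -(alpha**label) * (label * np.log(prob) + (1 - label) * np.log(1 - prob))


rng = np.random.default_rng(42)
pred = rng.normal(size=1000)                    # raw scores, pre-sigmoid
label = (rng.random(1000) < 0.1).astype(float)  # imbalanced labels
alpha, eps = 0.25, 1e-4

# Analytic grad/hess, as in the PR but on raw arrays instead of a lgb.Dataset
prob = _sigmoid(pred)
grad = -(alpha**label) * (label - prob)
hess = (alpha**label) * prob * (1.0 - prob)

# Central finite differences with respect to the raw score
f_plus = weighted_loss(pred + eps, label, alpha)
f_minus = weighted_loss(pred - eps, label, alpha)
num_grad = (f_plus - f_minus) / (2 * eps)
num_hess = (f_plus - 2 * weighted_loss(pred, label, alpha) + f_minus) / eps**2

assert np.allclose(grad, num_grad, atol=1e-6)
assert np.allclose(hess, num_hess, atol=1e-4)
print("grad/hess match finite differences")
```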
10 changes: 0 additions & 10 deletions imlightgbm/utils.py

This file was deleted.
