From 37b488485f3ec5bd289b89a07cdf3f7b2d4b0e43 Mon Sep 17 00:00:00 2001
From: RektPunk <110188257+RektPunk@users.noreply.github.com>
Date: Wed, 18 Sep 2024 14:47:59 +0900
Subject: [PATCH] [Feature] add docstring (#7)

---
 imlightgbm/docstring.py | 103 ++++++++++++++++++++++++++++++++++++++++
 imlightgbm/engine.py    |  46 ++++--------------
 imlightgbm/objective.py |  14 ++++++
 imlightgbm/utils.py     |  86 ---------------------------------
 4 files changed, 125 insertions(+), 124 deletions(-)
 create mode 100644 imlightgbm/docstring.py

diff --git a/imlightgbm/docstring.py b/imlightgbm/docstring.py
new file mode 100644
index 0000000..aff6554
--- /dev/null
+++ b/imlightgbm/docstring.py
@@ -0,0 +1,103 @@
+from typing import Callable
+
+_space = "\n "
+ALL_PARAMS = {
+    "params": f"dict{_space}Parameters for training. Values passed through ``params`` take precedence over those supplied via arguments.",
+    "train_set": f"Dataset{_space}Data to be trained on.",
+    "num_boost_round": f"int, optional (default=100){_space}Number of boosting iterations.",
+    "valid_sets": f"list of Dataset, or None, optional (default=None){_space}List of data to be evaluated on during training.",
+    "valid_names": f"list of str, or None, optional (default=None){_space}Names of ``valid_sets``.",
+    "folds": f"generator or iterator of (train_idx, test_idx) tuples, scikit-learn splitter object or None, optional (default=None){_space}If generator or iterator, it should yield the train and test indices for each fold.{_space}If object, it should be one of the scikit-learn splitter classes{_space}(https://scikit-learn.org/stable/modules/classes.html#splitter-classes){_space}and have ``split`` method.{_space}This argument has highest priority over other data split arguments.",
+    "nfold": f"int, optional (default=5){_space}Number of folds in CV.",
+    "stratified": f"bool, optional (default=True){_space}Whether to perform stratified sampling.",
+    "shuffle": f"bool, optional (default=True){_space}Whether to shuffle before splitting data.",
+    "metrics": f"str, list of str, or None, optional (default=None){_space}Evaluation metrics to be monitored while CV.",
+    "init_model": f"str, pathlib.Path, Booster or None, optional (default=None){_space}Filename of LightGBM model or Booster instance used for continue training.",
+    "fpreproc": f"callable or None, optional (default=None){_space}Preprocessing function that takes (dtrain, dtest, params) and returns transformed versions of those.",
+    "seed": f"int, optional (default=0){_space}Seed used to generate the folds (passed to numpy.random.seed).",
+    "callbacks": f"list of callable, or None, optional (default=None){_space}List of callback functions that are applied at each iteration.{_space}See Callbacks in Python API for more information.",
+    "eval_train_metric": f"bool, optional (default=False){_space}Whether to display the train metric in progress.",
+    "return_cvbooster": f"bool, optional (default=False){_space}Whether to return Booster models trained on each fold through ``CVBooster``.",
+    "keep_training_booster": f"bool, optional (default=False){_space}Whether the returned Booster will be used to keep training.{_space}If False, the returned value will be converted into _InnerPredictor before returning.{_space}This means you won't be able to use ``eval``, ``eval_train`` or ``eval_valid`` methods of the returned Booster.{_space}When your model is very large and causes memory errors,{_space}you can try to set this param to ``True`` to avoid the model conversion performed during the internal call of ``model_to_string``.{_space}You can still use _InnerPredictor as ``init_model`` for future continue training.",
+    "num_trials": f"int, optional (default=10){_space}Number of hyperparameter tuning trials.",
+    "get_params": 'callable, optional (default=get_params)\n Callable that takes an ``optuna.Trial`` and returns a dict of parameters to search over. The default is:\n def get_params(trial: optuna.Trial):\n return {\n "alpha": trial.suggest_float("alpha", 0.25, 0.75),\n "gamma": trial.suggest_float("gamma", 0.0, 3.0),\n "num_leaves": trial.suggest_int("num_leaves", 20, 150),\n "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1),\n "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),\n "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),\n "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),\n }',
+}
+
+
+PARAMS_MAPPER = {
+    "train": {
+        "description": "Perform the training with given parameters.",
+        "selected_params": [
+            "params",
+            "train_set",
+            "num_boost_round",
+            "valid_sets",
+            "valid_names",
+            "init_model",
+            "keep_training_booster",
+            "callbacks",
+        ],
+        "return_description": f"booster: Booster{_space}The trained Booster model.",
+    },
+    "cv": {
+        "description": "Perform the cross-validation with given parameters.",
+        "selected_params": [
+            "params",
+            "train_set",
+            "num_boost_round",
+            "folds",
+            "nfold",
+            "stratified",
+            "shuffle",
+            "init_model",
+            "fpreproc",
+            "seed",
+            "callbacks",
+            "eval_train_metric",
+            "return_cvbooster",
+        ],
+        "return_description": "eval_results: dict\n History of evaluation results of each metric.\n The dictionary has the following format:\n {'valid metric1-mean': [values], 'valid metric1-stdv': [values],\n 'valid metric2-mean': [values], 'valid metric2-stdv': [values],\n ...}.\n If ``return_cvbooster=True``, also returns trained boosters wrapped in a ``CVBooster`` object via ``cvbooster`` key.\n If ``eval_train_metric=True``, also returns the train metric history.\n In this case, the dictionary has the following format:\n {'train metric1-mean': [values], 'valid metric1-mean': [values],\n 'train metric2-mean': [values], 'valid metric2-mean': [values],\n ...}.",
+    },
+    "optimize": {
+        "description": "Perform hyperparameter tuning with Optuna.",
+        "selected_params": [
+            "train_set",
+            "num_trials",
+            "num_boost_round",
+            "folds",
+            "nfold",
+            "stratified",
+            "shuffle",
+            "get_params",
+            "init_model",
+            "fpreproc",
+            "seed",
+            "callbacks",
+        ],
+        "return_description": f"study: optuna.Study{_space}The completed study; the best parameters and best CV score are available as ``study.best_params`` and ``study.best_value``.",
+    },
+}
+
+
+def generate_docstring(
+    description: str,
+    selected_params: list[str],
+    return_description: str = "",
+) -> str:
+    """Generate a docstring with a provided description, selected parameters, and optional return description."""
+    docstring = f"{description}\n\n Parameters\n ----------\n"
+    for param in selected_params:
+        docstring += f" {param}: {ALL_PARAMS[param]}\n"
+    if return_description:
+        docstring += f"\n Returns\n -------\n {return_description}\n"
+    return docstring
+
+
+def add_docstring(func_name: str) -> Callable:
+    """Decorator to add a docstring to a function based on provided parameters and descriptions."""
+
+    def decorator(func: Callable) -> Callable:
+        func.__doc__ = generate_docstring(**PARAMS_MAPPER[func_name])
+        return func
+
+    return decorator
diff --git a/imlightgbm/engine.py b/imlightgbm/engine.py
index db6fb6f..53180b4 100644
--- a/imlightgbm/engine.py
+++ b/imlightgbm/engine.py
@@ -1,25 +1,23 @@
 from collections.abc import Iterable
-from typing import Any, Callable, Literal
+from typing import Any, Callable
 
 import lightgbm as lgb
 import numpy as np
 import optuna
 from sklearn.model_selection import BaseCrossValidator
 
-from imlightgbm.objective import set_params
-from imlightgbm.utils import docstring, optimize_doc
+from imlightgbm.docstring import add_docstring
+from imlightgbm.objective import get_params, set_params
 
 
-@docstring(lgb.train.__doc__)
+@add_docstring("train")
 def train(
     params: dict[str, Any],
     train_set: lgb.Dataset,
+    num_boost_round: int = 100,
     valid_sets: list[lgb.Dataset] = None,
     valid_names: list[str] = None,
-    num_boost_round: int = 100,
     init_model: str | lgb.Path | lgb.Booster | None = None,
-    feature_name: list[str] | Literal["auto"] = "auto",
-    categorical_feature: list[str] | list[int] | Literal["auto"] = "auto",
     keep_training_booster: bool = False,
     callbacks: list[Callable] | None = None,
 ) -> lgb.Booster:
@@ -27,18 +25,16 @@ def train(
     return lgb.train(
         params=_params,
         train_set=train_set,
+        num_boost_round=num_boost_round,
         valid_sets=valid_sets,
         valid_names=valid_names,
-        num_boost_round=num_boost_round,
         init_model=init_model,
-        feature_name=feature_name,
-        categorical_feature=categorical_feature,
         keep_training_booster=keep_training_booster,
         callbacks=callbacks,
     )
 
 
-@docstring(lgb.cv.__doc__)
+@add_docstring("cv")
 def cv(
     params: dict[str, Any],
     train_set: lgb.Dataset,
@@ -47,10 +43,7 @@
     nfold: int = 5,
     stratified: bool = True,
     shuffle: bool = True,
-    metrics: str | list[str] | None = None,
     init_model: str | lgb.Path | lgb.Booster | None = None,
-    feature_name: list[str] | Literal["auto"] = "auto",
-    categorical_feature: list[str] | list[int] | Literal["auto"] = "auto",
     fpreproc: Callable[
         [lgb.Dataset, lgb.Dataset, dict[str, Any]],
         tuple[lgb.Dataset, lgb.Dataset, dict[str, Any]],
@@ -70,10 +63,7 @@
         nfold=nfold,
         stratified=stratified,
         shuffle=shuffle,
-        metrics=metrics,
         init_model=init_model,
-        feature_name=feature_name,
-        categorical_feature=categorical_feature,
         fpreproc=fpreproc,
         seed=seed,
         callbacks=callbacks,
@@ -82,18 +72,7 @@
     )
 
 
-def get_params(trial: optuna.Trial):
-    return {
-        "alpha": trial.suggest_float("alpha", 0.25, 0.75),
-        "gamma": trial.suggest_float("gamma", 0.0, 3.0),
-        "num_leaves": trial.suggest_int("num_leaves", 20, 150),
-        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1),
-        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
-        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
-        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
-    }
-
-
+@add_docstring("optimize")
 def optimize(
     train_set: lgb.Dataset,
     num_trials: int = 10,
@@ -104,8 +83,6 @@
     shuffle: bool = True,
     get_params: Callable[[optuna.Trial], dict[str, Any]] = get_params,
     init_model: str | lgb.Path | lgb.Booster | None = None,
-    feature_name: list[str] | Literal["auto"] = "auto",
-    categorical_feature: list[str] | list[int] | Literal["auto"] = "auto",
     fpreproc: Callable[
         [lgb.Dataset, lgb.Dataset, dict[str, Any]],
         tuple[lgb.Dataset, lgb.Dataset, dict[str, Any]],
@@ -126,13 +103,9 @@ def _objective(trial: optuna.Trial):
             stratified=stratified,
             shuffle=shuffle,
             init_model=init_model,
-            feature_name=feature_name,
-            categorical_feature=categorical_feature,
             fpreproc=fpreproc,
             seed=seed,
             callbacks=callbacks,
-            eval_train_metric=False,
-            return_cvbooster=False,
         )
         _keys = [_ for _ in cv_results.keys() if _.endswith("mean")]
         assert len(_keys) == 1
@@ -141,6 +114,3 @@ def _objective(trial: optuna.Trial):
     study = optuna.create_study(direction="minimize")
     study.optimize(_objective, n_trials=num_trials)
     return study
-
-
-optimize.__doc__ = optimize_doc
diff --git a/imlightgbm/objective.py b/imlightgbm/objective.py
index e13c9bc..26b84d0 100644
--- a/imlightgbm/objective.py
+++ b/imlightgbm/objective.py
@@ -3,6 +3,7 @@
 from typing import Any, Callable
 
 import numpy as np
+import optuna
 from lightgbm import Dataset
 from sklearn.utils.multiclass import type_of_target
 
@@ -120,3 +121,16 @@ def set_params(params: dict[str, Any], train_set: Dataset) -> dict[str, Any]:
     fobj, feval = _set_fobj_feval(train_set=train_set, alpha=_alpha, gamma=_gamma)
     _params.update({OBJECTIVE_STR: fobj, METRIC_STR: feval})
     return _params
+
+
+def get_params(trial: optuna.Trial) -> dict[str, Any]:
+    """Get default params."""
+    return {
+        "alpha": trial.suggest_float("alpha", 0.25, 0.75),
+        "gamma": trial.suggest_float("gamma", 0.0, 3.0),
+        "num_leaves": trial.suggest_int("num_leaves", 20, 150),
+        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1),
+        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
+        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
+        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
+    }
diff --git a/imlightgbm/utils.py b/imlightgbm/utils.py
index 7955ae6..0bc1aa7 100644
--- a/imlightgbm/utils.py
+++ b/imlightgbm/utils.py
@@ -1,26 +1,4 @@
 import logging
-from typing import Callable
-
-
-def _modify_docstring(docstring: str) -> str:
-    lines = docstring.splitlines()
-
-    feval_start = next(i for i, line in enumerate(lines) if "feval" in line)
-    init_model_start = next(i for i, line in enumerate(lines) if "init_model" in line)
-    del lines[feval_start:init_model_start]
-
-    note_start = next(i for i, line in enumerate(lines) if "Note" in line)
-    returns_start = next(i for i, line in enumerate(lines) if "Returns" in line)
-    del lines[note_start:returns_start]
-    return "\n".join(lines)
-
-
-def docstring(doc: str) -> Callable[[Callable], Callable]:
-    def decorator(func: Callable) -> Callable:
-        func.__doc__ = _modify_docstring(doc)
-        return func
-
-    return decorator
 
 
 def init_logger() -> logging.Logger:
@@ -30,67 +8,3 @@
 
 
 logger = init_logger()
-
-
-optimize_doc = """Perform the cross-validation with given parameters.
-Parameters
-----------
-train_set : Dataset
-    Data to be trained on.
-num_trials : int, optional (default=10)
-    Number of hyperparameter search trials.
-num_boost_round : int, optional (default=100)
-    Number of boosting iterations.
-folds : generator or iterator of (train_idx, test_idx) tuples, scikit-learn splitter object or None, optional (default=None)
-    If generator or iterator, it should yield the train and test indices for each fold.
-    If object, it should be one of the scikit-learn splitter classes
-    (https://scikit-learn.org/stable/modules/classes.html#splitter-classes)
-    and have ``split`` method.
-    This argument has highest priority over other data split arguments.
-nfold : int, optional (default=5)
-    Number of folds in CV.
-stratified : bool, optional (default=True)
-    Whether to perform stratified sampling.
-shuffle : bool, optional (default=True)
-    Whether to shuffle before splitting data.
-get_params : callable, optional (default=get_params)
-    def get_params(trial: optuna.Trial):
-        return {
-            'alpha': trial.suggest_float('alpha', .25, .75),
-            'gamma': trial.suggest_float('gamma', .0, 3.),
-            'num_leaves': trial.suggest_int('num_leaves', 20, 150),
-            'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1),
-            'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
-            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
-            'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
-        }
-init_model : str, pathlib.Path, Booster or None, optional (default=None)
-    Filename of LightGBM model or Booster instance used for continue training.
-feature_name : list of str, or 'auto', optional (default="auto")
-    **Deprecated.** Set ``feature_name`` on ``train_set`` instead.
-    Feature names.
-    If 'auto' and data is pandas DataFrame, data columns names are used.
-categorical_feature : list of str or int, or 'auto', optional (default="auto")
-    **Deprecated.** Set ``categorical_feature`` on ``train_set`` instead.
-    Categorical features.
-    If list of int, interpreted as indices.
-    If list of str, interpreted as feature names (need to specify ``feature_name`` as well).
-    If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
-    All values in categorical features will be cast to int32 and thus should be less than int32 max value (2147483647).
-    Large values could be memory consuming. Consider using consecutive integers starting from zero.
-    All negative values in categorical features will be treated as missing values.
-    The output cannot be monotonically constrained with respect to a categorical feature.
-    Floating point numbers in categorical features will be rounded towards 0.
-fpreproc : callable or None, optional (default=None)
-    Preprocessing function that takes (dtrain, dtest, params)
-    and returns transformed versions of those.
-seed : int, optional (default=0)
-    Seed used to generate the folds (passed to numpy.random.seed).
-callbacks : list of callable, or None, optional (default=None)
-    List of callback functions that are applied at each iteration.
-    See Callbacks in Python API for more information.
-
-Returns
--------
-study: optuna.Study
-"""
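
Reviewer note, not part of the patch: a minimal usage sketch of the API as it stands after this change. The `train`, `cv`, and `optimize` signatures and the `alpha`/`gamma` search keys follow the diff above; the synthetic dataset and the exact parameter keys accepted by `set_params` are illustrative assumptions rather than something the diff defines.

# Usage sketch against the post-patch API; data and parameter choices are illustrative.
import lightgbm as lgb
import numpy as np

from imlightgbm.engine import cv, optimize, train

rng = np.random.default_rng(0)
X = rng.normal(size=(1_000, 10))
y = (rng.random(1_000) < 0.1).astype(int)  # imbalanced binary target
train_set = lgb.Dataset(X, label=y)

# Docstrings are now attached at import time by @add_docstring.
print(train.__doc__)

# "alpha" and "gamma" mirror the defaults in get_params(); any further keys
# required by set_params() are omitted here (assumption).
params = {"alpha": 0.5, "gamma": 1.0, "learning_rate": 0.05, "num_leaves": 31}

booster = train(params=params, train_set=train_set, num_boost_round=50)
cv_results = cv(params=params, train_set=train_set, num_boost_round=50, nfold=5)

# Searches the default space defined by get_params() in objective.py.
study = optimize(train_set=train_set, num_trials=5, num_boost_round=50)
print(study.best_params, study.best_value)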