From 37b488485f3ec5bd289b89a07cdf3f7b2d4b0e43 Mon Sep 17 00:00:00 2001
From: RektPunk <110188257+RektPunk@users.noreply.github.com>
Date: Wed, 18 Sep 2024 14:47:59 +0900
Subject: [PATCH] [Feature] add docstring (#7)

---
 imlightgbm/docstring.py | 103 ++++++++++++++++++++++++++++++++++++++++
 imlightgbm/engine.py    |  46 ++++--------------
 imlightgbm/objective.py |  14 ++++++
 imlightgbm/utils.py     |  86 ---------------------------------
 4 files changed, 125 insertions(+), 124 deletions(-)
 create mode 100644 imlightgbm/docstring.py

diff --git a/imlightgbm/docstring.py b/imlightgbm/docstring.py
new file mode 100644
index 0000000..aff6554
--- /dev/null
+++ b/imlightgbm/docstring.py
@@ -0,0 +1,103 @@
+from typing import Callable
+
+_space = "\n "
+ALL_PARAMS = {
+    "params": f"dict{_space}Parameters for training. Values passed through ``params`` take precedence over those supplied via arguments.",
+    "train_set": f"Dataset{_space}Data to be trained on.",
+    "num_boost_round": f"int, optional (default=100){_space}Number of boosting iterations.",
+    "valid_sets": f"list of Dataset, or None, optional (default=None){_space}List of data to be evaluated on during training.",
+    "valid_names": f"list of str, or None, optional (default=None){_space}Names of ``valid_sets``.",
+    "folds": f"generator or iterator of (train_idx, test_idx) tuples, scikit-learn splitter object or None, optional (default=None){_space}If generator or iterator, it should yield the train and test indices for each fold.{_space}If object, it should be one of the scikit-learn splitter classes{_space}(https://scikit-learn.org/stable/modules/classes.html#splitter-classes){_space}and have ``split`` method.{_space}This argument has highest priority over other data split arguments.",
+    "nfold": f"int, optional (default=5){_space}Number of folds in CV.",
+    "stratified": f"bool, optional (default=True){_space}Whether to perform stratified sampling.",
+    "shuffle": f"bool, optional (default=True){_space}Whether to shuffle before splitting data.",
+    "metrics": f"str, list of str, or None, optional (default=None){_space}Evaluation metrics to be monitored while CV.",
+    "init_model": f"str, pathlib.Path, Booster or None, optional (default=None){_space}Filename of LightGBM model or Booster instance used for continue training.",
+    "fpreproc": f"callable or None, optional (default=None){_space}Preprocessing function that takes (dtrain, dtest, params) and returns transformed versions of those.",
+    "seed": f"int, optional (default=0){_space}Seed used to generate the folds (passed to numpy.random.seed).",
+    "callbacks": f"list of callable, or None, optional (default=None){_space}List of callback functions that are applied at each iteration.{_space}See Callbacks in Python API for more information.",
+    "eval_train_metric": f"bool, optional (default=False){_space}Whether to display the train metric in progress.",
+    "return_cvbooster": f"bool, optional (default=False){_space}Whether to return Booster models trained on each fold through ``CVBooster``.",
+    "keep_training_booster": f"bool, optional (default=False){_space}Whether the returned Booster will be used to keep training.{_space}If False, the returned value will be converted into _InnerPredictor before returning.{_space}This means you won't be able to use ``eval``, ``eval_train`` or ``eval_valid`` methods of the returned Booster.{_space}When your model is very large and causes memory errors,{_space}you can try to set this param to ``True`` to avoid the model conversion performed during the internal call of ``model_to_string``.{_space}You can still use _InnerPredictor as ``init_model`` for future continue training.",
+    "num_trials": f"int, optional (default=10){_space}Number of hyperparameter tuning trials.",
+    "get_params": 'callable, optional (default=get_params)\n Callable that takes an ``optuna.Trial`` and returns a dict of parameters to search over. The default is:\n def get_params(trial: optuna.Trial):\n return {\n "alpha": trial.suggest_float("alpha", 0.25, 0.75),\n "gamma": trial.suggest_float("gamma", 0.0, 3.0),\n "num_leaves": trial.suggest_int("num_leaves", 20, 150),\n "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1),\n "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),\n "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),\n "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),\n }',
+}
+
+
+PARAMS_MAPPER = {
+    "train": {
+        "description": "Perform the training with given parameters.",
+        "selected_params": [
+            "params",
+            "train_set",
+            "num_boost_round",
+            "valid_sets",
+            "valid_names",
+            "init_model",
+            "keep_training_booster",
+            "callbacks",
+        ],
+        "return_description": f"booster: Booster{_space}The trained Booster model.",
+    },
+    "cv": {
+        "description": "Perform the cross-validation with given parameters.",
+        "selected_params": [
+            "params",
+            "train_set",
+            "num_boost_round",
+            "folds",
+            "nfold",
+            "stratified",
+            "shuffle",
+            "init_model",
+            "fpreproc",
+            "seed",
+            "callbacks",
+            "eval_train_metric",
+            "return_cvbooster",
+        ],
+        "return_description": "eval_results: dict\n History of evaluation results of each metric.\n The dictionary has the following format:\n {'valid metric1-mean': [values], 'valid metric1-stdv': [values],\n 'valid metric2-mean': [values], 'valid metric2-stdv': [values],\n ...}.\n If ``return_cvbooster=True``, also returns trained boosters wrapped in a ``CVBooster`` object via ``cvbooster`` key.\n If ``eval_train_metric=True``, also returns the train metric history.\n In this case, the dictionary has the following format:\n {'train metric1-mean': [values], 'valid metric1-mean': [values],\n 'train metric2-mean': [values], 'valid metric2-mean': [values],\n ...}.",
+    },
+    "optimize": {
+        "description": "Perform hyperparameter tuning with Optuna.",
+        "selected_params": [
+            "train_set",
+            "num_trials",
+            "num_boost_round",
+            "folds",
+            "nfold",
+            "stratified",
+            "shuffle",
+            "get_params",
+            "init_model",
+            "fpreproc",
+            "seed",
+            "callbacks",
+        ],
+        "return_description": f"study: optuna.Study{_space}The completed study; the best parameters and best CV score are available as ``study.best_params`` and ``study.best_value``.",
+    },
+}
+
+
+def generate_docstring(
+    description: str,
+    selected_params: list[str],
+    return_description: str = "",
+) -> str:
+    """Generate a docstring with a provided description, selected parameters, and optional return description."""
+    docstring = f"{description}\n\n Parameters\n ----------\n"
+    for param in selected_params:
+        docstring += f" {param}: {ALL_PARAMS[param]}\n"
+    if return_description:
+        docstring += f"\n Returns\n -------\n {return_description}\n"
+    return docstring
+
+
+def add_docstring(func_name: str) -> Callable:
+    """Decorator to add a docstring to a function based on provided parameters and descriptions."""
+
+    def decorator(func: Callable) -> Callable:
+        func.__doc__ = generate_docstring(**PARAMS_MAPPER[func_name])
+        return func
+
+    return decorator
diff --git a/imlightgbm/engine.py b/imlightgbm/engine.py
index db6fb6f..53180b4 100644
--- a/imlightgbm/engine.py
+++ b/imlightgbm/engine.py
@@ -1,25 +1,23 @@
 from collections.abc import Iterable
-from typing import Any, Callable, Literal
+from typing import Any, Callable
 
 import lightgbm as lgb
 import numpy as np
 import optuna
 from sklearn.model_selection import BaseCrossValidator
 
-from imlightgbm.objective import set_params
-from imlightgbm.utils import docstring, optimize_doc
+from imlightgbm.docstring import add_docstring
+from imlightgbm.objective import get_params, set_params
 
 
-@docstring(lgb.train.__doc__)
+@add_docstring("train")
 def train(
     params: dict[str, Any],
     train_set: lgb.Dataset,
+    num_boost_round: int = 100,
     valid_sets: list[lgb.Dataset] = None,
     valid_names: list[str] = None,
-    num_boost_round: int = 100,
     init_model: str | lgb.Path | lgb.Booster | None = None,
-    feature_name: list[str] | Literal["auto"] = "auto",
-    categorical_feature: list[str] | list[int] | Literal["auto"] = "auto",
     keep_training_booster: bool = False,
     callbacks: list[Callable] | None = None,
 ) -> lgb.Booster:
@@ -27,18 +25,16 @@ def train(
     return lgb.train(
         params=_params,
         train_set=train_set,
+        num_boost_round=num_boost_round,
         valid_sets=valid_sets,
         valid_names=valid_names,
-        num_boost_round=num_boost_round,
         init_model=init_model,
-        feature_name=feature_name,
-        categorical_feature=categorical_feature,
         keep_training_booster=keep_training_booster,
         callbacks=callbacks,
     )
 
 
-@docstring(lgb.cv.__doc__)
+@add_docstring("cv")
 def cv(
     params: dict[str, Any],
     train_set: lgb.Dataset,
@@ -47,10 +43,7 @@
     nfold: int = 5,
     stratified: bool = True,
     shuffle: bool = True,
-    metrics: str | list[str] | None = None,
     init_model: str | lgb.Path | lgb.Booster | None = None,
-    feature_name: list[str] | Literal["auto"] = "auto",
-    categorical_feature: list[str] | list[int] | Literal["auto"] = "auto",
     fpreproc: Callable[
         [lgb.Dataset, lgb.Dataset, dict[str, Any]],
         tuple[lgb.Dataset, lgb.Dataset, dict[str, Any]],
@@ -70,10 +63,7 @@
         nfold=nfold,
         stratified=stratified,
         shuffle=shuffle,
-        metrics=metrics,
         init_model=init_model,
-        feature_name=feature_name,
-        categorical_feature=categorical_feature,
         fpreproc=fpreproc,
         seed=seed,
         callbacks=callbacks,
@@ -82,18 +72,7 @@
     )
 
 
-def get_params(trial: optuna.Trial):
-    return {
-        "alpha": trial.suggest_float("alpha", 0.25, 0.75),
-        "gamma": trial.suggest_float("gamma", 0.0, 3.0),
-        "num_leaves": trial.suggest_int("num_leaves", 20, 150),
-        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1),
-        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
-        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
-        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
-    }
-
-
+@add_docstring("optimize")
 def optimize(
     train_set: lgb.Dataset,
     num_trials: int = 10,
@@ -104,8 +83,6 @@
     shuffle: bool = True,
     get_params: Callable[[optuna.Trial], dict[str, Any]] = get_params,
     init_model: str | lgb.Path | lgb.Booster | None = None,
-    feature_name: list[str] | Literal["auto"] = "auto",
-    categorical_feature: list[str] | list[int] | Literal["auto"] = "auto",
     fpreproc: Callable[
         [lgb.Dataset, lgb.Dataset, dict[str, Any]],
         tuple[lgb.Dataset, lgb.Dataset, dict[str, Any]],
@@ -126,13 +103,9 @@ def _objective(trial: optuna.Trial):
             stratified=stratified,
             shuffle=shuffle,
             init_model=init_model,
-            feature_name=feature_name,
-            categorical_feature=categorical_feature,
             fpreproc=fpreproc,
             seed=seed,
             callbacks=callbacks,
-            eval_train_metric=False,
-            return_cvbooster=False,
         )
         _keys = [_ for _ in cv_results.keys() if _.endswith("mean")]
         assert len(_keys) == 1
@@ -141,6 +114,3 @@ def _objective(trial: optuna.Trial):
     study = optuna.create_study(direction="minimize")
     study.optimize(_objective, n_trials=num_trials)
     return study
-
-
-optimize.__doc__ = optimize_doc
diff --git a/imlightgbm/objective.py b/imlightgbm/objective.py
index e13c9bc..26b84d0 100644
--- a/imlightgbm/objective.py
+++ b/imlightgbm/objective.py
@@ -3,6 +3,7 @@
 from typing import Any, Callable
 
 import numpy as np
+import optuna
 from lightgbm import Dataset
 from sklearn.utils.multiclass import type_of_target
 
@@ -120,3 +121,16 @@ def set_params(params: dict[str, Any], train_set: Dataset) -> dict[str, Any]:
     fobj, feval = _set_fobj_feval(train_set=train_set, alpha=_alpha, gamma=_gamma)
     _params.update({OBJECTIVE_STR: fobj, METRIC_STR: feval})
     return _params
+
+
+def get_params(trial: optuna.Trial) -> dict[str, Any]:
+    """Get default params."""
+    return {
+        "alpha": trial.suggest_float("alpha", 0.25, 0.75),
+        "gamma": trial.suggest_float("gamma", 0.0, 3.0),
+        "num_leaves": trial.suggest_int("num_leaves", 20, 150),
+        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1),
+        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
+        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
+        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
+    }
diff --git a/imlightgbm/utils.py b/imlightgbm/utils.py
index 7955ae6..0bc1aa7 100644
--- a/imlightgbm/utils.py
+++ b/imlightgbm/utils.py
@@ -1,26 +1,4 @@
 import logging
-from typing import Callable
-
-
-def _modify_docstring(docstring: str) -> str:
-    lines = docstring.splitlines()
-
-    feval_start = next(i for i, line in enumerate(lines) if "feval" in line)
-    init_model_start = next(i for i, line in enumerate(lines) if "init_model" in line)
-    del lines[feval_start:init_model_start]
-
-    note_start = next(i for i, line in enumerate(lines) if "Note" in line)
-    returns_start = next(i for i, line in enumerate(lines) if "Returns" in line)
-    del lines[note_start:returns_start]
-    return "\n".join(lines)
-
-
-def docstring(doc: str) -> Callable[[Callable], Callable]:
-    def decorator(func: Callable) -> Callable:
-        func.__doc__ = _modify_docstring(doc)
-        return func
-
-    return decorator
 
 
 def init_logger() -> logging.Logger:
@@ -30,67 +8,3 @@
 
 
 logger = init_logger()
-
-
-optimize_doc = """Perform the cross-validation with given parameters.
-Parameters
-----------
-train_set : Dataset
-    Data to be trained on.
-num_trials : int, optional (default=10)
-    Number of hyperparameter search trials.
-num_boost_round : int, optional (default=100)
-    Number of boosting iterations.
-folds : generator or iterator of (train_idx, test_idx) tuples, scikit-learn splitter object or None, optional (default=None)
-    If generator or iterator, it should yield the train and test indices for each fold.
-    If object, it should be one of the scikit-learn splitter classes
-    (https://scikit-learn.org/stable/modules/classes.html#splitter-classes)
-    and have ``split`` method.
-    This argument has highest priority over other data split arguments.
-nfold : int, optional (default=5)
-    Number of folds in CV.
-stratified : bool, optional (default=True)
-    Whether to perform stratified sampling.
-shuffle : bool, optional (default=True)
-    Whether to shuffle before splitting data.
-get_params : callable, optional (default=get_params)
-    def get_params(trial: optuna.Trial):
-        return {
-            'alpha': trial.suggest_float('alpha', .25, .75),
-            'gamma': trial.suggest_float('gamma', .0, 3.),
-            'num_leaves': trial.suggest_int('num_leaves', 20, 150),
-            'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1),
-            'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
-            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
-            'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
-        }
-init_model : str, pathlib.Path, Booster or None, optional (default=None)
-    Filename of LightGBM model or Booster instance used for continue training.
-feature_name : list of str, or 'auto', optional (default="auto")
-    **Deprecated.** Set ``feature_name`` on ``train_set`` instead.
-    Feature names.
-    If 'auto' and data is pandas DataFrame, data columns names are used.
-categorical_feature : list of str or int, or 'auto', optional (default="auto")
-    **Deprecated.** Set ``categorical_feature`` on ``train_set`` instead.
-    Categorical features.
-    If list of int, interpreted as indices.
-    If list of str, interpreted as feature names (need to specify ``feature_name`` as well).
-    If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
-    All values in categorical features will be cast to int32 and thus should be less than int32 max value (2147483647).
-    Large values could be memory consuming. Consider using consecutive integers starting from zero.
-    All negative values in categorical features will be treated as missing values.
-    The output cannot be monotonically constrained with respect to a categorical feature.
-    Floating point numbers in categorical features will be rounded towards 0.
-fpreproc : callable or None, optional (default=None)
-    Preprocessing function that takes (dtrain, dtest, params)
-    and returns transformed versions of those.
-seed : int, optional (default=0)
-    Seed used to generate the folds (passed to numpy.random.seed).
-callbacks : list of callable, or None, optional (default=None)
-    List of callback functions that are applied at each iteration.
-    See Callbacks in Python API for more information.
-
-Returns
--------
-study: optuna.Study
-"""
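
Reviewer note, not part of the patch: a minimal usage sketch of the API as it stands after this change. The `train`, `cv`, and `optimize` signatures and the `alpha`/`gamma` search keys follow the diff above; the synthetic dataset and the exact parameter keys accepted by `set_params` are illustrative assumptions rather than something the diff defines.

# Usage sketch against the post-patch API; data and parameter choices are illustrative.
import lightgbm as lgb
import numpy as np

from imlightgbm.engine import cv, optimize, train

rng = np.random.default_rng(0)
X = rng.normal(size=(1_000, 10))
y = (rng.random(1_000) < 0.1).astype(int)  # imbalanced binary target
train_set = lgb.Dataset(X, label=y)

# Docstrings are now attached at import time by @add_docstring.
print(train.__doc__)

# "alpha" and "gamma" mirror the defaults in get_params(); any further keys
# required by set_params() are omitted here (assumption).
params = {"alpha": 0.5, "gamma": 1.0, "learning_rate": 0.05, "num_leaves": 31}

booster = train(params=params, train_set=train_set, num_boost_round=50)
cv_results = cv(params=params, train_set=train_set, num_boost_round=50, nfold=5)

# Searches the default space defined by get_params() in objective.py.
study = optimize(train_set=train_set, num_trials=5, num_boost_round=50)
print(study.best_params, study.best_value)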