diff --git a/cyclic_boosting/base.py b/cyclic_boosting/base.py
index 7b80883..4475798 100644
--- a/cyclic_boosting/base.py
+++ b/cyclic_boosting/base.py
@@ -923,7 +923,7 @@ def get_subestimators_as_items(self, prototypes=True) -> List[Tuple]:
         return [(feature.feature_id, feature.smoother) for feature in self.features]
 
     @abc.abstractmethod
-    def calc_parameters(self, feature: Feature, y: np.ndarray, prediction_link: np.ndarray, prefit_data):
+    def calc_parameters(self, feature: Feature, y: np.ndarray, pred: CBLinkPredictionsFactors, prefit_data):
         """Calculates factors and uncertainties of the bins of a feature group in
         the original space (not the link space) and transforms them to the link
         space afterwards
@@ -944,8 +944,9 @@ def calc_parameters(self, feature: Feature, y: np.ndarray, prediction_link: np.n
             class containing all features
         y: np.ndarray
             target, truth
-        prediction_link: np.ndarray
-            prediction in link space of all *other* features.
+        pred
+            (in-sample) predictions from all other features (excluding the one
+            at hand)
         prefit_data
             data returned by :meth:`~.precalc_parameters` during fit
 
@@ -958,7 +959,7 @@ class containing all features
         raise NotImplementedError("implement in subclass")
 
     @abc.abstractmethod
-    def precalc_parameters(self, feature: Feature, y: np.ndarray, prediction_link: np.ndarray):
+    def precalc_parameters(self, feature: Feature, y: np.ndarray, pred: CBLinkPredictionsFactors):
         """Calculations that are not dependent on intermediate predictions. If
         these are not needed, return :obj:`None` in the subclass.
@@ -971,8 +972,9 @@ class containing all features
         y: np.ndarray
             target, truth
-        prediction_link: np.ndarray
-            prediction in link space.
+        pred
+            (in-sample) predictions from all other features (excluding the one
+            at hand)
         """
         return None
diff --git a/cyclic_boosting/generic_loss.py b/cyclic_boosting/generic_loss.py
index 9772a2e..16b4512 100644
--- a/cyclic_boosting/generic_loss.py
+++ b/cyclic_boosting/generic_loss.py
@@ -5,14 +5,18 @@
 import warnings
 
 import numpy as np
+import pandas as pd
 import six
 import sklearn.base
 from scipy.optimize import minimize
 from scipy.stats import beta
 
-from cyclic_boosting.base import CyclicBoostingBase, gaussian_matching_by_quantiles
+from cyclic_boosting.base import CyclicBoostingBase, gaussian_matching_by_quantiles, Feature, CBLinkPredictionsFactors
 from cyclic_boosting.link import LogLinkMixin, IdentityLinkMixin, LogitLinkMixin
 from cyclic_boosting.utils import continuous_quantile_from_discrete, get_X_column
+from cyclic_boosting.classification import get_beta_priors
+
+from typing import Tuple, Union
 
 
 _logger = logging.getLogger(__name__)
@@ -28,10 +32,12 @@ class CBGenericLoss(CyclicBoostingBase):
     ``CBNBinomRegressor``, or ``CBLocationRegressor``).
     """
 
-    def precalc_parameters(self, feature, y, pred):
+    def precalc_parameters(self, feature: Feature, y: np.ndarray, pred: CBLinkPredictionsFactors) -> None:
         pass
 
-    def calc_parameters(self, feature, y, pred, prefit_data):
+    def calc_parameters(
+        self, feature: Feature, y: np.ndarray, pred: CBLinkPredictionsFactors, prefit_data
+    ) -> Tuple[np.ndarray, np.ndarray]:
         """
         Calling of the optimization (loss minimization) for the different bins
         of the feature at hand. In contrast to the analytical solution in most
@@ -87,7 +93,7 @@ def calc_parameters(self, feature, y, pred, prefit_data):
             parameters = np.log(parameters)
         return parameters, uncertainties
 
-    def optimization(self, y, yhat_others, weights):
+    def optimization(self, y: np.ndarray, yhat_others: np.ndarray, weights: np.ndarray) -> Tuple[float, float]:
         """
         Minimization of the costs (potentially including sample weights) for
         individual feature bins. The initial value for the parameters is set to
@@ -114,7 +120,7 @@
         res = minimize(self.objective_function, neutral_factor, args=(yhat_others, y, weights))
         return res.x, self.uncertainty(y, weights)
 
-    def objective_function(self, param, yhat_others, y, weights):
+    def objective_function(self, param: float, yhat_others: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
         """
         Calculation of the in-sample costs (potentially including sample
         weights) for individual feature bins according to a given loss
@@ -141,15 +147,15 @@
         return self.costs(model, y, weights)
 
     @abc.abstractmethod
-    def costs(self, prediction, y, weights):
+    def costs(self, prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
         raise NotImplementedError("implement in subclass")
 
     @abc.abstractmethod
-    def model(self, param, yhat_others):
+    def model(self, param: float, yhat_others: np.ndarray) -> np.ndarray:
         raise NotImplementedError("implement in subclass")
 
     @abc.abstractmethod
-    def uncertainty(self, y, weights):
+    def uncertainty(self, y: np.ndarray, weights: np.ndarray) -> float:
         """
         Estimation of parameter uncertainty for a given feature bin.
@@ -222,7 +228,7 @@ def __init__(
     def _check_y(self, y: np.ndarray) -> None:
         check_y_multiplicative(y)
 
-    def loss(self, prediction, y, weights):
+    def loss(self, prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
         """
         Calculation of the in-sample quantile loss, or to be exact costs,
         (potentially including sample weights) after full feature cycles, i.e.,
@@ -244,18 +250,18 @@ def loss(self, prediction, y, weights):
         """
         return quantile_costs(prediction, y, weights, self.quantile)
 
-    def _init_global_scale(self, X, y):
+    def _init_global_scale(self, X: Union[pd.DataFrame, np.ndarray], y: np.ndarray) -> None:
         self.global_scale_link_, self.prior_pred_link_offset_ = quantile_global_scale(
             X, y, self.quantile, self.weights, self.prior_prediction_column, self.link_func
         )
 
-    def costs(self, prediction, y, weights):
+    def costs(self, prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
         return quantile_costs(prediction, y, weights, self.quantile)
 
-    def model(self, param, yhat_others):
+    def model(self, param: float, yhat_others: np.ndarray) -> np.ndarray:
         return model_multiplicative(param, yhat_others)
 
-    def uncertainty(self, y, weights):
+    def uncertainty(self, y: np.ndarray, weights: np.ndarray) -> float:
         return uncertainty_gamma(y, weights)
@@ -312,7 +318,7 @@ def __init__(
     def _check_y(self, y: np.ndarray) -> None:
         check_y_additive(y)
 
-    def loss(self, prediction, y, weights):
+    def loss(self, prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
         """
         Calculation of the in-sample quantile loss, or to be exact costs,
         (potentially including sample weights) after full feature cycles, i.e.,
@@ -334,22 +340,22 @@ def loss(self, prediction, y, weights):
         """
         return quantile_costs(prediction, y, weights, self.quantile)
 
-    def _init_global_scale(self, X, y):
+    def _init_global_scale(self, X: Union[pd.DataFrame, np.ndarray], y: np.ndarray) -> None:
         self.global_scale_link_, self.prior_pred_link_offset_ = quantile_global_scale(
             X, y, self.quantile, self.weights, self.prior_prediction_column, self.link_func
         )
 
-    def costs(self, prediction, y, weights):
+    def costs(self, prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
         return quantile_costs(prediction, y, weights, self.quantile)
 
-    def model(self, param, yhat_others):
+    def model(self, param: float, yhat_others: np.ndarray) -> np.ndarray:
         return model_additive(param, yhat_others)
 
-    def uncertainty(self, y, weights):
+    def uncertainty(self, y: np.ndarray, weights: np.ndarray) -> float:
         return uncertainty_gaussian(y, weights)
 
 
-def quantile_costs(prediction, y, weights, quantile):
+def quantile_costs(prediction: np.ndarray, y: np.ndarray, weights: np.ndarray, quantile: float) -> float:
     """
     Calculation of the in-sample quantile costs (potentially including sample
     weights).
@@ -380,7 +386,14 @@ def calc_parameters(self, feature, y, pred, prefit_data):
     return sum_weighted_error / np.nansum(weights)
 
 
-def quantile_global_scale(X, y, quantile, weights, prior_prediction_column, link_func):
+def quantile_global_scale(
+    X: Union[pd.DataFrame, np.ndarray],
+    y: np.ndarray,
+    quantile: float,
+    weights: np.ndarray,
+    prior_prediction_column: Union[str, int, None],
+    link_func,
+) -> Tuple:
     """
     Calculation of the global scale for quantile regression, corresponding
     to the (continuous approximation of the) respective quantile of the
@@ -423,15 +436,15 @@ def quantile_global_scale(X, y, quantile, weights, prior_prediction_column, link
     return global_scale_link_, prior_pred_link_offset_
 
 
-def model_multiplicative(param, yhat_others):
+def model_multiplicative(param: float, yhat_others: np.ndarray) -> np.ndarray:
     return param * yhat_others
 
 
-def model_additive(param, yhat_others):
+def model_additive(param: float, yhat_others: np.ndarray) -> np.ndarray:
     return param + yhat_others
 
 
-def uncertainty_gamma(y, weights):
+def uncertainty_gamma(y: np.ndarray, weights: np.ndarray) -> float:
     # use moment-matching of a Gamma posterior with a log-normal
     # distribution as approximation
     alpha_prior = 2
@@ -440,15 +453,14 @@
     return sigma
 
 
-def uncertainty_gaussian(y, weights):
+def uncertainty_gaussian(y: np.ndarray, weights: np.ndarray) -> float:
     return np.sqrt(np.mean(y) / len(y))
 
 
-def uncertainty_beta(y, weights, link_func):
+def uncertainty_beta(y: np.ndarray, weights: np.ndarray, link_func) -> float:
     # use moment-matching of a Beta posterior with a log-normal
     # distribution as approximation
-    alpha_prior = 1.001
-    beta_prior = 1.001
+    alpha_prior, beta_prior = get_beta_priors()
     alpha_posterior = np.sum(y) + alpha_prior
     beta_posterior = np.sum(1 - y) + beta_prior
     shift = 0.4 * (alpha_posterior / (alpha_posterior + beta_posterior) - 0.5)
@@ -528,19 +540,19 @@ def __init__(
         self.costs = costs
 
-    def loss(self, prediction, y, weights):
+    def loss(self, prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
         return self.costs(prediction, y, weights)
 
     def _check_y(self, y: np.ndarray) -> None:
         check_y_multiplicative(y)
 
-    def costs(self, prediction, y, weights):
+    def costs(self, prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
         return self.costs(prediction, y, weights)
 
-    def model(self, param, yhat_others):
+    def model(self, param: float, yhat_others: np.ndarray) -> np.ndarray:
         return model_multiplicative(param, yhat_others)
 
-    def uncertainty(self, y, weights):
+    def uncertainty(self, y: np.ndarray, weights: np.ndarray) -> float:
         return uncertainty_gamma(y, weights)
@@ -591,19 +603,19 @@ def __init__(
         self.costs = costs
 
-    def loss(self, prediction, y, weights):
+    def loss(self, prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
         return self.costs(prediction, y, weights)
 
     def _check_y(self, y: np.ndarray) -> None:
         check_y_additive(y)
 
-    def costs(self, prediction, y, weights):
+    def costs(self, prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
         return self.costs(prediction, y, weights)
 
-    def model(self, param, yhat_others):
+    def model(self, param: float, yhat_others: np.ndarray) -> np.ndarray:
         return model_additive(param, yhat_others)
 
-    def uncertainty(self, y, weights):
+    def uncertainty(self, y: np.ndarray, weights: np.ndarray) -> float:
         return uncertainty_gaussian(y, weights)
@@ -653,19 +665,19 @@ def __init__(
         self.costs = costs
 
-    def loss(self, prediction, y, weights):
+    def loss(self, prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
         return self.costs(prediction, y, weights)
 
     def _check_y(self, y: np.ndarray) -> None:
         check_y_classification(y)
 
-    def costs(self, prediction, y, weights):
+    def costs(self, prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
         return self.costs(prediction, y, weights)
 
-    def model(self, param, yhat_others):
+    def model(self, param: float, yhat_others: np.ndarray) -> np.ndarray:
         return model_multiplicative(param, yhat_others)
 
-    def uncertainty(self, y, weights):
+    def uncertainty(self, y: np.ndarray, weights: np.ndarray) -> float:
         return uncertainty_beta(y, weights, self.link_func)
diff --git a/cyclic_boosting/regression.py b/cyclic_boosting/regression.py
index 4e081e5..7d0257f 100644
--- a/cyclic_boosting/regression.py
+++ b/cyclic_boosting/regression.py
@@ -13,7 +13,7 @@
 from cyclic_boosting.features import Feature
 from cyclic_boosting.link import LogLinkMixin
 
-from typing import Tuple, Union
+from typing import Tuple
 
 _logger = logging.getLogger(__name__)
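Note on the retyped CBGenericLoss hooks: a concrete estimator mainly supplies `costs`, `model`, and `uncertainty` under the signatures this patch annotates. A minimal sketch follows; the class name and the squared-error cost are illustrative assumptions, not part of the patch, and a real estimator would additionally mix in a link function and set up `__init__` as the quantile regressors above do.

# Hypothetical subclass illustrating the annotated CBGenericLoss hooks;
# everything except the imported names is an assumption for illustration.
import numpy as np

from cyclic_boosting.generic_loss import (
    CBGenericLoss,
    model_multiplicative,
    uncertainty_gamma,
)


class CBSquaredErrorDemo(CBGenericLoss):
    def loss(self, prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
        return self.costs(prediction, y, weights)

    def costs(self, prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
        # weighted mean squared error of the in-sample prediction
        return float(np.sum(weights * (y - prediction) ** 2) / np.sum(weights))

    def model(self, param: float, yhat_others: np.ndarray) -> np.ndarray:
        # multiplicative combination of the bin parameter with the
        # predictions of all other features, via the module helper
        return model_multiplicative(param, yhat_others)

    def uncertainty(self, y: np.ndarray, weights: np.ndarray) -> float:
        # Gamma moment-matching helper annotated in the patch above
        return uncertainty_gamma(y, weights)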
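The per-bin fit wrapped by `optimization` and `objective_function` is plain scalar minimization of a cost over the fixed predictions of all other features. A standalone sketch of that pattern for the 0.5 quantile: the pinball-loss body is an assumption consistent with the `sum_weighted_error / np.nansum(weights)` context line above (the hunk does not show the full function), and Nelder-Mead is chosen here only because the pinball loss is non-smooth.

import numpy as np
from scipy.optimize import minimize


def quantile_costs_sketch(prediction, y, weights, quantile):
    # weighted pinball loss, normalized by the weight sum
    residual = y - prediction
    error = np.where(residual >= 0, quantile * residual, (quantile - 1) * residual)
    return np.sum(weights * error) / np.nansum(weights)


def objective(param, yhat_others, y, weights, quantile):
    # multiplicative model: bin factor times the other features' predictions
    return quantile_costs_sketch(param * yhat_others, y, weights, quantile)


y = np.array([1.0, 2.0, 4.0, 8.0])
yhat_others = np.array([2.0, 2.0, 4.0, 4.0])
weights = np.ones_like(y)

# start from the neutral multiplicative factor 1.0, as the docstring above
# describes for the initial parameter value
res = minimize(objective, x0=1.0, args=(yhat_others, y, weights, 0.5), method="Nelder-Mead")
print(res.x[0])  # converges near the weighted-median factor 1.0 for this toy data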
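The only behavioral refactor in generic_loss.py is in `uncertainty_beta`, where the hard-coded Beta priors are replaced by `get_beta_priors()` from `cyclic_boosting.classification`. Assuming that helper returns the same near-flat priors as the removed constants (1.001 each; the patch does not show its body), the posterior update is unchanged. A quick check:

import numpy as np

from cyclic_boosting.classification import get_beta_priors

alpha_prior, beta_prior = get_beta_priors()
# assumed to equal the previously hard-coded near-flat priors
assert (alpha_prior, beta_prior) == (1.001, 1.001)

y = np.array([0.0, 1.0, 1.0, 0.0, 1.0])  # binary targets
alpha_posterior = np.sum(y) + alpha_prior    # successes plus prior -> 4.001
beta_posterior = np.sum(1 - y) + beta_prior  # failures plus prior -> 3.001
print(alpha_posterior, beta_posterior)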