Commit: types on functions

fwick-panasonic committed Aug 20, 2023
1 parent 6bcafad commit 14500ff
Showing 3 changed files with 60 additions and 46 deletions.
14 changes: 8 additions & 6 deletions cyclic_boosting/base.py
@@ -923,7 +923,7 @@ def get_subestimators_as_items(self, prototypes=True) -> List[Tuple]:
return [(feature.feature_id, feature.smoother) for feature in self.features]

@abc.abstractmethod
- def calc_parameters(self, feature: Feature, y: np.ndarray, prediction_link: np.ndarray, prefit_data):
+ def calc_parameters(self, feature: Feature, y: np.ndarray, pred: CBLinkPredictionsFactors, prefit_data):
"""Calculates factors and uncertainties of the bins of a feature group
in the original space (not the link space) and transforms them to the
link space afterwards
@@ -944,8 +944,9 @@ def calc_parameters(self, feature: Feature, y: np.ndarray, prediction_link: np.n
class containing all features
y: np.ndarray
target, truth
- prediction_link: np.ndarray
-     prediction in link space of all *other* features.
+ pred
+     (in-sample) predictions from all other features (excluding the one
+     at hand)
prefit_data
data returned by :meth:`~.precalc_parameters` during fit
@@ -958,7 +959,7 @@ class containing all features
raise NotImplementedError("implement in subclass")

@abc.abstractmethod
- def precalc_parameters(self, feature: Feature, y: np.ndarray, prediction_link: np.ndarray):
+ def precalc_parameters(self, feature: Feature, y: np.ndarray, pred: CBLinkPredictionsFactors):
"""Calculations that are not dependent on intermediate predictions. If
these are not needed, return :obj:`None` in the subclass.
@@ -971,8 +972,9 @@ def precalc_parameters(self, feature: Feature, y: np.ndarray, prediction_link: n
class containing all features
y: np.ndarray
target, truth
- prediction_link: np.ndarray
-     prediction in link space.
+ pred
+     (in-sample) predictions from all other features (excluding the one
+     at hand)
"""
return None

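For orientation, a minimal sketch of a subclass conforming to the new signatures. None of this is code from the commit: the class is hypothetical, and pred.predict_link() is an assumption based on how the other estimators in this repository consume CBLinkPredictionsFactors.

import numpy as np

from cyclic_boosting.base import CyclicBoostingBase, CBLinkPredictionsFactors
from cyclic_boosting.features import Feature


class ToyRegressor(CyclicBoostingBase):  # other abstract members omitted
    def precalc_parameters(self, feature: Feature, y: np.ndarray, pred: CBLinkPredictionsFactors) -> None:
        # nothing to precompute for this toy estimator
        return None

    def calc_parameters(self, feature: Feature, y: np.ndarray, pred: CBLinkPredictionsFactors, prefit_data):
        # predictions of all *other* features now arrive as an aggregate
        # object instead of a raw link-space ndarray
        prediction_link = pred.predict_link()
        ...  # compute per-bin parameters and uncertainties in link space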
90 changes: 51 additions & 39 deletions cyclic_boosting/generic_loss.py
@@ -5,14 +5,18 @@
import warnings

import numpy as np
+ import pandas as pd
import six
import sklearn.base
from scipy.optimize import minimize
from scipy.stats import beta

- from cyclic_boosting.base import CyclicBoostingBase, gaussian_matching_by_quantiles
+ from cyclic_boosting.base import CyclicBoostingBase, gaussian_matching_by_quantiles, Feature, CBLinkPredictionsFactors
from cyclic_boosting.link import LogLinkMixin, IdentityLinkMixin, LogitLinkMixin
from cyclic_boosting.utils import continuous_quantile_from_discrete, get_X_column
+ from cyclic_boosting.classification import get_beta_priors

+ from typing import Tuple, Union

_logger = logging.getLogger(__name__)

@@ -28,10 +32,12 @@ class CBGenericLoss(CyclicBoostingBase):
``CBNBinomRegressor``, or ``CBLocationRegressor``).
"""

- def precalc_parameters(self, feature, y, pred):
+ def precalc_parameters(self, feature: Feature, y: np.ndarray, pred: CBLinkPredictionsFactors) -> None:
pass

- def calc_parameters(self, feature, y, pred, prefit_data):
+ def calc_parameters(
+     self, feature: Feature, y: np.ndarray, pred: CBLinkPredictionsFactors, prefit_data
+ ) -> Tuple[np.ndarray, np.ndarray]:
"""
Calling of the optimization (loss minimization) for the different bins
of the feature at hand. In contrast to the analytical solution in most
@@ -87,7 +93,7 @@ def calc_parameters(self, feature, y, pred, prefit_data):
parameters = np.log(parameters)
return parameters, uncertainties

- def optimization(self, y, yhat_others, weights):
+ def optimization(self, y: np.ndarray, yhat_others: np.ndarray, weights: np.ndarray) -> Tuple[float, float]:
"""
Minimization of the costs (potentially including sample weights) for
individual feature bins. The initial value for the parameters is set to
@@ -114,7 +120,7 @@ def optimization(self, y, yhat_others, weights):
res = minimize(self.objective_function, neutral_factor, args=(yhat_others, y, weights))
return res.x, self.uncertainty(y, weights)

- def objective_function(self, param, yhat_others, y, weights):
+ def objective_function(self, param: float, yhat_others: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
"""
Calculation of the in-sample costs (potentially including sample
weights) for individual feature bins according to a given loss
@@ -141,15 +147,15 @@ def objective_function(self, param, yhat_others, y, weights):
return self.costs(model, y, weights)

@abc.abstractmethod
- def costs(self, prediction, y, weights):
+ def costs(self, prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
raise NotImplementedError("implement in subclass")

@abc.abstractmethod
- def model(self, param, yhat_others):
+ def model(self, param: float, yhat_others: np.ndarray) -> np.ndarray:
raise NotImplementedError("implement in subclass")

@abc.abstractmethod
- def uncertainty(self, y, weights):
+ def uncertainty(self, y: np.ndarray, weights: np.ndarray) -> float:
"""
Estimation of parameter uncertainty for a given feature bin.
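Taken together, these hooks define the per-bin fit: optimization minimizes objective_function, which composes model (combine a candidate parameter with the other features' predictions) with costs (score that combination). A self-contained sketch of the same pattern, with a toy multiplicative model and L2 costs that are illustrative only:

import numpy as np
from scipy.optimize import minimize

def model(param, yhat_others):
    # multiplicative combination; the neutral parameter is 1
    return param * yhat_others

def costs(prediction, y, weights):
    # toy weighted L2 costs, not one of the losses in this module
    return np.sum(weights * (y - prediction) ** 2) / np.sum(weights)

def objective_function(param, yhat_others, y, weights):
    return costs(model(param, yhat_others), y, weights)

y = np.array([2.0, 3.0, 5.0])
yhat_others = np.array([1.8, 2.9, 4.2])
weights = np.ones_like(y)
res = minimize(objective_function, 1.0, args=(yhat_others, y, weights))
print(res.x)  # best per-bin factor for this toy data, roughly 1.14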
@@ -222,7 +228,7 @@ def __init__(
def _check_y(self, y: np.ndarray) -> None:
check_y_multiplicative(y)

- def loss(self, prediction, y, weights):
+ def loss(self, prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
"""
Calculation of the in-sample quantile loss, or to be exact costs,
(potentially including sample weights) after full feature cycles, i.e.,
@@ -244,18 +250,18 @@ def loss(self, prediction, y, weights):
"""
return quantile_costs(prediction, y, weights, self.quantile)

- def _init_global_scale(self, X, y):
+ def _init_global_scale(self, X: Union[pd.DataFrame, np.ndarray], y: np.ndarray) -> None:
self.global_scale_link_, self.prior_pred_link_offset_ = quantile_global_scale(
X, y, self.quantile, self.weights, self.prior_prediction_column, self.link_func
)

- def costs(self, prediction, y, weights):
+ def costs(self, prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
return quantile_costs(prediction, y, weights, self.quantile)

- def model(self, param, yhat_others):
+ def model(self, param: float, yhat_others: np.ndarray) -> np.ndarray:
return model_multiplicative(param, yhat_others)

- def uncertainty(self, y, weights):
+ def uncertainty(self, y: np.ndarray, weights: np.ndarray) -> float:
return uncertainty_gamma(y, weights)


@@ -312,7 +318,7 @@ def __init__(
def _check_y(self, y: np.ndarray) -> None:
check_y_additive(y)

- def loss(self, prediction, y, weights):
+ def loss(self, prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
"""
Calculation of the in-sample quantile loss, or to be exact costs,
(potentially including sample weights) after full feature cycles, i.e.,
@@ -334,22 +340,22 @@ def loss(self, prediction, y, weights):
"""
return quantile_costs(prediction, y, weights, self.quantile)

- def _init_global_scale(self, X, y):
+ def _init_global_scale(self, X: Union[pd.DataFrame, np.ndarray], y: np.ndarray) -> None:
self.global_scale_link_, self.prior_pred_link_offset_ = quantile_global_scale(
X, y, self.quantile, self.weights, self.prior_prediction_column, self.link_func
)

- def costs(self, prediction, y, weights):
+ def costs(self, prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
return quantile_costs(prediction, y, weights, self.quantile)

- def model(self, param, yhat_others):
+ def model(self, param: float, yhat_others: np.ndarray) -> np.ndarray:
return model_additive(param, yhat_others)

- def uncertainty(self, y, weights):
+ def uncertainty(self, y: np.ndarray, weights: np.ndarray) -> float:
return uncertainty_gaussian(y, weights)


- def quantile_costs(prediction, y, weights, quantile):
+ def quantile_costs(prediction: np.ndarray, y: np.ndarray, weights: np.ndarray, quantile: float) -> float:
"""
Calculation of the in-sample quantile costs (potentially including sample
weights).
@@ -380,7 +386,14 @@ def quantile_costs(prediction, y, weights, quantile):
return sum_weighted_error / np.nansum(weights)
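The costs computed here are the weighted pinball (quantile) loss. A minimal NumPy sketch of that formula with invented numbers; the function above additionally guards against NaN weights via np.nansum:

import numpy as np

def pinball_costs(prediction, y, weights, quantile):
    diff = y - prediction
    # under-predictions weighted by quantile, over-predictions by (1 - quantile)
    loss = np.where(diff >= 0, quantile * diff, (quantile - 1.0) * diff)
    return np.sum(weights * loss) / np.sum(weights)

y = np.array([1.0, 2.0, 4.0])
prediction = np.array([1.5, 2.0, 3.0])
print(pinball_costs(prediction, y, np.ones(3), quantile=0.9))  # ~0.317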


- def quantile_global_scale(X, y, quantile, weights, prior_prediction_column, link_func):
+ def quantile_global_scale(
+     X: Union[pd.DataFrame, np.ndarray],
+     y: np.ndarray,
+     quantile: float,
+     weights: np.ndarray,
+     prior_prediction_column: Union[str, int, None],
+     link_func,
+ ) -> Tuple[float, float]:
"""
Calculation of the global scale for quantile regression, corresponding
to the (continuous approximation of the) respective quantile of the
@@ -423,15 +436,15 @@ def quantile_global_scale(X, y, quantile, weights, prior_prediction_column, link
return global_scale_link_, prior_pred_link_offset_
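In other words, the global scale is (a continuous approximation of) the requested quantile of the raw target, mapped into link space. A rough stand-in with plain NumPy; the implementation above uses continuous_quantile_from_discrete, imported at the top of the file, to refine this for discrete targets:

import numpy as np

y = np.array([0.0, 1.0, 1.0, 2.0, 3.0, 5.0, 8.0])
quantile = 0.8
global_scale = np.quantile(y, quantile)   # continuous approximation of the target quantile
global_scale_link = np.log(global_scale)  # e.g. for a log link in the multiplicative mode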


- def model_multiplicative(param, yhat_others):
+ def model_multiplicative(param: float, yhat_others: np.ndarray) -> np.ndarray:
return param * yhat_others


- def model_additive(param, yhat_others):
+ def model_additive(param: float, yhat_others: np.ndarray) -> np.ndarray:
return param + yhat_others


- def uncertainty_gamma(y, weights):
+ def uncertainty_gamma(y: np.ndarray, weights: np.ndarray) -> float:
# use moment-matching of a Gamma posterior with a log-normal
# distribution as approximation
alpha_prior = 2
@@ -440,15 +453,14 @@ def uncertainty_gamma(y, weights):
return sigma


- def uncertainty_gaussian(y, weights):
+ def uncertainty_gaussian(y: np.ndarray, weights: np.ndarray) -> float:
return np.sqrt(np.mean(y) / len(y))
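Each of these helpers condenses a feature bin into a single standard-deviation-like scalar that shrinks as the bin accumulates samples. For the Gaussian case above, for example:

import numpy as np

y_bin = np.array([2.0, 3.0, 4.0, 3.0])        # targets falling into one feature bin
sigma = np.sqrt(np.mean(y_bin) / len(y_bin))  # as in uncertainty_gaussian
print(sigma)  # ~0.866 for four samples with mean 3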


- def uncertainty_beta(y, weights, link_func):
+ def uncertainty_beta(y: np.ndarray, weights: np.ndarray, link_func) -> float:
# use moment-matching of a Beta posterior with a log-normal
# distribution as approximation
- alpha_prior = 1.001
- beta_prior = 1.001
+ alpha_prior, beta_prior = get_beta_priors()
alpha_posterior = np.sum(y) + alpha_prior
beta_posterior = np.sum(1 - y) + beta_prior
shift = 0.4 * (alpha_posterior / (alpha_posterior + beta_posterior) - 0.5)
@@ -528,19 +540,19 @@ def __init__(

self.costs = costs

- def loss(self, prediction, y, weights):
+ def loss(self, prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
return self.costs(prediction, y, weights)

def _check_y(self, y: np.ndarray) -> None:
check_y_multiplicative(y)

- def costs(self, prediction, y, weights):
+ def costs(self, prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
return self.costs(prediction, y, weights)

- def model(self, param, yhat_others):
+ def model(self, param: float, yhat_others: np.ndarray) -> np.ndarray:
return model_multiplicative(param, yhat_others)

- def uncertainty(self, y, weights):
+ def uncertainty(self, y: np.ndarray, weights: np.ndarray) -> float:
return uncertainty_gamma(y, weights)


@@ -591,19 +603,19 @@ def __init__(

self.costs = costs

- def loss(self, prediction, y, weights):
+ def loss(self, prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
return self.costs(prediction, y, weights)

def _check_y(self, y: np.ndarray) -> None:
check_y_additive(y)

- def costs(self, prediction, y, weights):
+ def costs(self, prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
return self.costs(prediction, y, weights)

- def model(self, param, yhat_others):
+ def model(self, param: float, yhat_others: np.ndarray) -> np.ndarray:
return model_additive(param, yhat_others)

- def uncertainty(self, y, weights):
+ def uncertainty(self, y: np.ndarray, weights: np.ndarray) -> float:
return uncertainty_gaussian(y, weights)


@@ -653,19 +665,19 @@ def __init__(

self.costs = costs

- def loss(self, prediction, y, weights):
+ def loss(self, prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
return self.costs(prediction, y, weights)

def _check_y(self, y: np.ndarray) -> None:
check_y_classification(y)

- def costs(self, prediction, y, weights):
+ def costs(self, prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
return self.costs(prediction, y, weights)

- def model(self, param, yhat_others):
+ def model(self, param: float, yhat_others: np.ndarray) -> np.ndarray:
return model_multiplicative(param, yhat_others)

- def uncertainty(self, y, weights):
+ def uncertainty(self, y: np.ndarray, weights: np.ndarray) -> float:
return uncertainty_beta(y, weights, self.link_func)


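The estimators at the end of this file (their class definitions are collapsed in this view) each store the user-supplied costs callable in __init__, so usage looks roughly like the sketch below. The class name CBMultiplicativeGenericCRegressor and the minimal constructor call are assumptions, not visible in this diff:

import numpy as np
from cyclic_boosting.generic_loss import CBMultiplicativeGenericCRegressor

def l2_costs(prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
    return float(np.sum(weights * (y - prediction) ** 2) / np.sum(weights))

# further constructor arguments (feature groups, feature properties, ...) omitted
est = CBMultiplicativeGenericCRegressor(costs=l2_costs)
# est.fit(X, y) then minimizes l2_costs bin by bin via the optimization hook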
2 changes: 1 addition & 1 deletion cyclic_boosting/regression.py
@@ -13,7 +13,7 @@
from cyclic_boosting.features import Feature
from cyclic_boosting.link import LogLinkMixin

- from typing import Tuple, Union
+ from typing import Tuple

_logger = logging.getLogger(__name__)
