
Merge pull request #35 from Blue-Yonder-OSS/quantile_regression
multiplicative quantile regression mode
FelixWick authored Aug 16, 2023
2 parents 4f2e0eb + d193a8c commit 2a8f4c6
Showing 6 changed files with 393 additions and 2 deletions.
6 changes: 5 additions & 1 deletion cyclic_boosting/__init__.py
@@ -12,6 +12,7 @@
- :class:`~.CBPoissonRegressor`
- :class:`~.CBNBinomRegressor`
- :class:`~.CBExponential`
- :class:`~.CBMultiplicativeQuantileRegressor`
Additive Regression
@@ -35,7 +36,7 @@


from cyclic_boosting.base import CyclicBoostingBase
from cyclic_boosting.regression import CBNBinomRegressor, CBPoissonRegressor
from cyclic_boosting.regression import CBNBinomRegressor, CBPoissonRegressor, CBMultiplicativeQuantileRegressor
from cyclic_boosting.price import CBExponential
from cyclic_boosting.location import CBLocationRegressor, CBLocPoissonRegressor
from cyclic_boosting.nbinom import CBNBinomC
@@ -50,6 +51,7 @@
pipeline_CBLocPoissonRegressor,
pipeline_CBNBinomC,
pipeline_CBGBSRegressor,
pipeline_CBMultiplicativeQuantileRegressor,
)

__all__ = [
@@ -62,6 +64,7 @@
"CBNBinomC",
"CBClassifier",
"CBGBSRegressor",
"CBMultiplicativeQuantileRegressor",
"pipeline_CBPoissonRegressor",
"pipeline_CBNBinomRegressor",
"pipeline_CBClassifier",
@@ -70,6 +73,7 @@
"pipeline_CBLocPoissonRegressor",
"pipeline_CBNBinomC",
"pipeline_CBGBSRegressor",
"pipeline_CBMultiplicativeQuantileRegressor",
]

__version__ = "1.0"
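
With these export changes, both new names are importable from the package root; a minimal sketch:

```python
# The two names added to __all__ above, imported from the package root.
from cyclic_boosting import (
    CBMultiplicativeQuantileRegressor,
    pipeline_CBMultiplicativeQuantileRegressor,
)
```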
25 changes: 25 additions & 0 deletions cyclic_boosting/pipelines.py
@@ -7,6 +7,7 @@
CBNBinomC,
CBClassifier,
CBGBSRegressor,
CBMultiplicativeQuantileRegressor,
binning,
)

@@ -40,6 +41,7 @@ def pipeline_CB(
bayes=False,
n_steps=15,
regalpha=0.0,
quantile=0.5,
):
if estimator in [CBPoissonRegressor, CBLocPoissonRegressor, CBLocationRegressor, CBClassifier]:
estimatorCB = estimator(
@@ -124,6 +126,22 @@ def pipeline_CB(
aggregate=aggregate,
regalpha=regalpha,
)
elif estimator == CBMultiplicativeQuantileRegressor:
estimatorCB = estimator(
feature_groups=feature_groups,
feature_properties=feature_properties,
weight_column=weight_column,
prior_prediction_column=prior_prediction_column,
minimal_loss_change=minimal_loss_change,
minimal_factor_change=minimal_factor_change,
maximal_iterations=maximal_iterations,
observers=observers,
smoother_choice=smoother_choice,
output_column=output_column,
learn_rate=learn_rate,
aggregate=aggregate,
quantile=quantile,
)
else:
raise Exception("No valid CB estimator.")
binner = binning.BinNumberTransformer(n_bins=number_of_bins, feature_properties=feature_properties)
@@ -185,3 +203,10 @@ def pipeline_CBGBSRegressor(**kwargs):
Convenience function containing CBGBSRegressor (estimator) + binning.
"""
return pipeline_CB(CBGBSRegressor, **kwargs)


def pipeline_CBMultiplicativeQuantileRegressor(**kwargs):
"""
Convenience function containing CBMultiplicativeQuantileRegressor (estimator) + binning.
"""
return pipeline_CB(CBMultiplicativeQuantileRegressor, **kwargs)
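
A usage sketch of the new convenience function. Only the factory itself and its `quantile` argument come from this diff; the data, column names, and feature flags below are illustrative assumptions (using the package's `flags` module for feature properties):

```python
# Illustrative only: fit a multiplicative model of the 0.9 quantile.
import numpy as np
import pandas as pd
from cyclic_boosting import flags, pipeline_CBMultiplicativeQuantileRegressor

rng = np.random.default_rng(0)
X = pd.DataFrame({
    "store": rng.integers(0, 5, 1000),      # assumed categorical feature
    "price": rng.uniform(1.0, 10.0, 1000),  # assumed continuous feature
})
y = rng.poisson(3.0, 1000).astype(float)    # non-negative target, as required

feature_properties = {
    "store": flags.IS_UNORDERED,
    "price": flags.IS_CONTINUOUS,
}

est = pipeline_CBMultiplicativeQuantileRegressor(
    feature_properties=feature_properties,
    quantile=0.9,  # the new argument threaded through pipeline_CB above
)
est.fit(X, y)
q90 = est.predict(X)  # in-sample predictions of the 0.9 quantile
```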
247 changes: 246 additions & 1 deletion cyclic_boosting/regression.py
@@ -2,15 +2,18 @@

import abc
import logging
import warnings

import numexpr
import numpy as np
import six
import sklearn.base
import scipy.special
from scipy.optimize import minimize

from cyclic_boosting.base import CyclicBoostingBase
from cyclic_boosting.link import LogLinkMixin
from cyclic_boosting.utils import continuous_quantile_from_discrete, get_X_column

_logger = logging.getLogger(__name__)

@@ -180,4 +183,246 @@ def calc_parameters(self, feature, y, pred, prefit_data):
return _calc_factors_and_uncertainties(alpha=prefit_data, beta=prediction_sum_of_bins, link_func=self.link_func)


__all__ = ["get_gamma_priors", "CBPoissonRegressor", "CBNBinomRegressor"]
class CBMultiplicativeQuantileRegressor(CBBaseRegressor):
"""
Cyclic Boosting multiplicative quantile-regression mode. While its general
structure allows arbitrary/empirical target ranges/distributions, the
multiplicative model of this mode requires non-negative target values.
A quantile loss, according to the desired quantile to be predicted, is
minimized in each bin of each feature. While binning, feature cycles,
smoothing, and iterations work in the same way as usual in Cyclic Boosting,
the minimization itself is performed via ``scipy.optimize.minimize``
(instead of an analytical solution as in, e.g., ``CBPoissonRegressor``,
``CBNBinomRegressor``, or ``CBLocationRegressor``).

Parameters
----------
quantile : float
quantile to be estimated

See :class:`cyclic_boosting.base` for all other parameters.
"""

def __init__(
self,
feature_groups=None,
feature_properties=None,
weight_column=None,
prior_prediction_column=None,
minimal_loss_change=1e-10,
minimal_factor_change=1e-10,
maximal_iterations=10,
observers=None,
smoother_choice=None,
output_column=None,
learn_rate=None,
quantile=0.5,
aggregate=True,
):
CyclicBoostingBase.__init__(
self,
feature_groups=feature_groups,
feature_properties=feature_properties,
weight_column=weight_column,
prior_prediction_column=prior_prediction_column,
minimal_loss_change=minimal_loss_change,
minimal_factor_change=minimal_factor_change,
maximal_iterations=maximal_iterations,
observers=observers,
smoother_choice=smoother_choice,
output_column=output_column,
learn_rate=learn_rate,
aggregate=aggregate,
)

self.quantile = quantile

def precalc_parameters(self, feature, y, pred):
pass

def loss(self, prediction, y, weights):
"""
Calculation of the in-sample quantile loss (to be exact, costs), potentially
including sample weights, after each full feature cycle (iteration), to be
used as stopping criterion.

Parameters
----------
prediction : np.ndarray
(in-sample) predictions for desired quantile, containing data with `float` type
y : np.ndarray
target variable, containing data with `float` type (potentially discrete)
weights : np.ndarray
optional (otherwise set to 1) sample weights, containing data with `float` type

Returns
-------
float
calculated quantile costs
"""
if not len(y) > 0:
raise ValueError("Loss cannot be computed on empty data")
else:
sum_weighted_error = np.nansum(
(
(y < prediction) * (1 - self.quantile) * (prediction - y)
+ (y >= prediction) * self.quantile * (y - prediction)
)
* weights
)
return sum_weighted_error / np.nansum(weights)
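
The expression in `loss` is the standard pinball (quantile) loss. A small stand-alone check of the formula on toy values (independent of the class):

```python
# Stand-alone check of the pinball-loss expression used in `loss` above.
import numpy as np

def pinball_loss(prediction, y, weights, quantile):
    # Under-predictions are charged quantile, over-predictions (1 - quantile).
    err = ((y < prediction) * (1 - quantile) * (prediction - y)
           + (y >= prediction) * quantile * (y - prediction))
    return np.nansum(err * weights) / np.nansum(weights)

y = np.array([1.0, 2.0, 3.0, 4.0])
prediction = np.full(4, 2.5)
weights = np.ones(4)
# For quantile=0.5 the pinball loss is half the mean absolute error:
print(pinball_loss(prediction, y, weights, 0.5))  # 0.5 (the MAE is 1.0)
```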

def _init_global_scale(self, X, y):
"""
Calculation of the global scale for quantile regression, corresponding
to the (continuous approximation of the) respective quantile of the
target values used in the training.

The exact value of the global scale is not critical for the model
accuracy (as the model has enough parameters to compensate), but a
value not representing a good overall average leads to factors with
averages unequal to 1 for each feature (making interpretation more
difficult).
"""
if self.weights is None:
raise RuntimeError("The weights have to be initialized.")

self.global_scale_link_ = self.link_func(continuous_quantile_from_discrete(y, self.quantile))

if self.prior_prediction_column is not None:
prior_pred = get_X_column(X, self.prior_prediction_column)
finite = np.isfinite(prior_pred)
if not np.all(finite):
_logger.warning(
"Found a total number of {} non-finite values in the prior prediction column".format(
np.sum(~finite)
)
)

prior_pred_mean = np.sum(prior_pred[finite] * self.weights[finite]) / np.sum(self.weights[finite])

prior_pred_link_mean = self.link_func(prior_pred_mean)

if np.isfinite(prior_pred_link_mean):
self.prior_pred_link_offset_ = self.global_scale_link_ - prior_pred_link_mean
else:
warnings.warn(
"The mean prior prediction in link-space is not finite. "
"Therefore no individualization is done "
"and no prior mean subtraction is necessary."
)
self.prior_pred_link_offset_ = float(self.global_scale_link_)

def quantile_costs(self, param, yhat_others, y, weights):
"""
Calculation of the in-sample quantile costs (potentially including
sample weights) for individual feature bins according to a quantile
loss function, to be minimized subsequently.

Parameters
----------
param : float
Factor to be estimated for the feature bin at hand.
yhat_others : np.ndarray
(in-sample) predictions of all other features (excluding the one at
hand) for the bin at hand, containing data with `float` type
y : np.ndarray
target variable, containing data with `float` type (potentially discrete)
weights : np.ndarray
optional (otherwise set to 1) sample weights, containing data with `float` type

Returns
-------
float
calculated quantile costs
"""
quantile_loss = (y < (param * yhat_others)) * (1 - self.quantile) * (param * yhat_others - y) + (
y >= (param * yhat_others)
) * self.quantile * (y - param * yhat_others)
sum_weighted_error = np.nansum(quantile_loss * weights)
return sum_weighted_error / np.nansum(weights)

def optimization(self, y, yhat_others, weights):
"""
Minimization of the quantile costs (potentially including sample
weights) for individual feature bins. The initial value for the factors
is set to 1 (neutral value for multiplicative model).

Parameters
----------
y : np.ndarray
target variable, containing data with `float` type (potentially discrete)
yhat_others : np.ndarray
(in-sample) predictions from all other features (excluding the one
at hand) for the bin at hand, containing data with `float` type
weights : np.ndarray
optional (otherwise set to 1) sample weights, containing data with `float` type

Returns
-------
float, float
estimated parameter (factor) and its uncertainty
"""
res = minimize(self.quantile_costs, 1, args=(yhat_others, y, weights))
# use moment-matching of a Gamma posterior with a log-normal
# distribution as approximation
uncertainty = np.sqrt(np.log(1 + 2 + np.sum(y)) - np.log(2 + np.sum(y)))
return res.x, uncertainty
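
A toy illustration of the per-bin fit done by `quantile_costs` and `optimization`: `scipy.optimize.minimize` searches for the multiplicative factor applied to the predictions of all other features (all values below are assumed):

```python
# Illustrative per-bin factor fit, mirroring quantile_costs/optimization.
import numpy as np
from scipy.optimize import minimize

quantile = 0.5
y = np.array([2.0, 4.0, 9.0])    # targets falling into one feature bin
yhat_others = np.full(3, 2.0)    # predictions of all other features
weights = np.ones(3)

def quantile_costs(param, yhat_others, y, weights):
    pred = param * yhat_others
    loss = ((y < pred) * (1 - quantile) * (pred - y)
            + (y >= pred) * quantile * (y - pred))
    return np.nansum(loss * weights) / np.nansum(weights)

res = minimize(quantile_costs, 1, args=(yhat_others, y, weights))
# For the median, the optimal factor scales 2.0 to the sample median 4.0:
print(res.x)  # approximately [2.0]
```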

def calc_parameters(self, feature, y, pred, prefit_data):
"""
Calling of the optimization (quantile loss minimization) for the
different bins of the feature at hand. In contrast to the analytical
solution in most other Cyclic Boosting modes (e.g.,
``CBPoissonRegressor``), which work simply via bin statistics
(`bincount`), the optimization here requires a dedicated loss function
to be evaluated on the individual observations of each bin.

Parameters
----------
feature : :class:`Feature`
feature for which the parameters of each bin are estimated
y : np.ndarray
target variable, containing data with `float` type (potentially
discrete)
pred : np.ndarray
(in-sample) predictions from all other features (excluding the one
at hand), containing data with `float` type
prefit_data
data returned by :meth:`~.precalc_parameters` during fit, not used
here

Returns
-------
np.ndarray, np.ndarray
estimated parameters (factors, in link space) and their uncertainties for all bins
"""
sorting = feature.lex_binned_data.argsort()
sorted_bins = feature.lex_binned_data[sorting]
splits_indices = np.unique(sorted_bins, return_index=True)[1][1:]

y_pred = np.hstack((y[..., np.newaxis], self.unlink_func(pred.predict_link())[..., np.newaxis]))
y_pred = np.hstack((y_pred, self.weights[..., np.newaxis]))
y_pred_bins = np.split(y_pred[sorting], splits_indices)

n_bins = len(y_pred_bins)
factors = np.zeros(n_bins)
uncertainties = np.zeros(n_bins)

for bin in range(n_bins):
factors[bin], uncertainties[bin] = self.optimization(
y_pred_bins[bin][:, 0], y_pred_bins[bin][:, 1], y_pred_bins[bin][:, 2]
)

if n_bins + 1 == feature.n_bins:
factors = np.append(factors, 1)
uncertainties = np.append(uncertainties, 0)

epsilon = 1e-5
factors = np.where(factors < epsilon, epsilon, factors)
return np.log(factors), uncertainties


__all__ = ["get_gamma_priors", "CBPoissonRegressor", "CBNBinomRegressor", "CBMultiplicativeQuantileRegressor"]
27 changes: 27 additions & 0 deletions cyclic_boosting/utils.py
@@ -7,6 +7,7 @@
import numpy as np
import pandas as pd
import six
import bisect

_logger = logging.getLogger(__name__)

@@ -974,3 +975,29 @@ def get_feature_column_names(X, exclude_columns=[]):
if col in features:
features.remove(col)
return features


def continuous_quantile_from_discrete(y, quantile):
"""
Calculates a continuous approximation of a given quantile from an array of potentially discrete values.

Parameters
----------
y : np.ndarray
containing data with `float` type (potentially discrete)
quantile : float
desired quantile

Returns
-------
float
calculated quantile value
"""
sorted_y = np.sort(y)
quantile_index = int(quantile * (len(y) - 1))
quantile_y = sorted_y[quantile_index]
index_low = bisect.bisect_left(sorted_y, quantile_y)
index_high = bisect.bisect_right(sorted_y, quantile_y)
if index_high > index_low:
quantile_y += (quantile_index - index_low) / (index_high - index_low)
return quantile_y
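
A worked example of the tie handling above: with duplicated values, the fractional offset interpolates within the run of equal entries (input values assumed):

```python
# Worked example for continuous_quantile_from_discrete with tied values.
import numpy as np
from cyclic_boosting.utils import continuous_quantile_from_discrete

y = np.array([1.0, 2.0, 2.0, 2.0, 3.0])
# quantile_index = int(0.5 * 4) = 2 -> sorted_y[2] = 2.0; the run of 2.0s
# spans sorted indices [1, 4), so the offset is (2 - 1) / (4 - 1) = 1/3.
print(continuous_quantile_from_discrete(y, 0.5))  # 2.0 + 1/3 ≈ 2.333
```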
