
Merge pull request #35 from Blue-Yonder-OSS/quantile_regression
multiplicative quantile regression mode
FelixWick authored Aug 16, 2023
2 parents 4f2e0eb + d193a8c commit 2a8f4c6
Showing 6 changed files with 393 additions and 2 deletions.
6 changes: 5 additions & 1 deletion cyclic_boosting/__init__.py
@@ -12,6 +12,7 @@
- :class:`~.CBPoissonRegressor`
- :class:`~.CBNBinomRegressor`
- :class:`~.CBExponential`
- :class:`~.CBMultiplicativeQuantileRegressor`
Additive Regression
@@ -35,7 +36,7 @@


from cyclic_boosting.base import CyclicBoostingBase
from cyclic_boosting.regression import CBNBinomRegressor, CBPoissonRegressor
from cyclic_boosting.regression import CBNBinomRegressor, CBPoissonRegressor, CBMultiplicativeQuantileRegressor
from cyclic_boosting.price import CBExponential
from cyclic_boosting.location import CBLocationRegressor, CBLocPoissonRegressor
from cyclic_boosting.nbinom import CBNBinomC
@@ -50,6 +51,7 @@
pipeline_CBLocPoissonRegressor,
pipeline_CBNBinomC,
pipeline_CBGBSRegressor,
pipeline_CBMultiplicativeQuantileRegressor,
)

__all__ = [
@@ -62,6 +64,7 @@
"CBNBinomC",
"CBClassifier",
"CBGBSRegressor",
"CBMultiplicativeQuantileRegressor",
"pipeline_CBPoissonRegressor",
"pipeline_CBNBinomRegressor",
"pipeline_CBClassifier",
@@ -70,6 +73,7 @@
"pipeline_CBLocPoissonRegressor",
"pipeline_CBNBinomC",
"pipeline_CBGBSRegressor",
"pipeline_CBMultiplicativeQuantileRegressor",
]

__version__ = "1.0"
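
With these export changes, both new names are importable from the package root; a minimal sketch:

```python
# The two names added to __all__ above, imported from the package root.
from cyclic_boosting import (
    CBMultiplicativeQuantileRegressor,
    pipeline_CBMultiplicativeQuantileRegressor,
)
```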
25 changes: 25 additions & 0 deletions cyclic_boosting/pipelines.py
@@ -7,6 +7,7 @@
CBNBinomC,
CBClassifier,
CBGBSRegressor,
CBMultiplicativeQuantileRegressor,
binning,
)

@@ -40,6 +41,7 @@ def pipeline_CB(
bayes=False,
n_steps=15,
regalpha=0.0,
quantile=0.5,
):
if estimator in [CBPoissonRegressor, CBLocPoissonRegressor, CBLocationRegressor, CBClassifier]:
estimatorCB = estimator(
@@ -124,6 +126,22 @@ def pipeline_CB(
aggregate=aggregate,
regalpha=regalpha,
)
elif estimator == CBMultiplicativeQuantileRegressor:
estimatorCB = estimator(
feature_groups=feature_groups,
feature_properties=feature_properties,
weight_column=weight_column,
prior_prediction_column=prior_prediction_column,
minimal_loss_change=minimal_loss_change,
minimal_factor_change=minimal_factor_change,
maximal_iterations=maximal_iterations,
observers=observers,
smoother_choice=smoother_choice,
output_column=output_column,
learn_rate=learn_rate,
aggregate=aggregate,
quantile=quantile,
)
else:
raise Exception("No valid CB estimator.")
binner = binning.BinNumberTransformer(n_bins=number_of_bins, feature_properties=feature_properties)
@@ -185,3 +203,10 @@ def pipeline_CBGBSRegressor(**kwargs):
Convenience function containing CBGBSRegressor (estimator) + binning.
"""
return pipeline_CB(CBGBSRegressor, **kwargs)


def pipeline_CBMultiplicativeQuantileRegressor(**kwargs):
"""
Convenience function containing CBMultiplicativeQuantileRegressor (estimator) + binning.
"""
return pipeline_CB(CBMultiplicativeQuantileRegressor, **kwargs)
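
A usage sketch of the new convenience function. Only the factory itself and its `quantile` argument come from this diff; the data, column names, and feature flags below are illustrative assumptions (using the package's `flags` module for feature properties):

```python
# Illustrative only: fit a multiplicative model of the 0.9 quantile.
import numpy as np
import pandas as pd
from cyclic_boosting import flags, pipeline_CBMultiplicativeQuantileRegressor

rng = np.random.default_rng(0)
X = pd.DataFrame({
    "store": rng.integers(0, 5, 1000),      # assumed categorical feature
    "price": rng.uniform(1.0, 10.0, 1000),  # assumed continuous feature
})
y = rng.poisson(3.0, 1000).astype(float)    # non-negative target, as required

feature_properties = {
    "store": flags.IS_UNORDERED,
    "price": flags.IS_CONTINUOUS,
}

est = pipeline_CBMultiplicativeQuantileRegressor(
    feature_properties=feature_properties,
    quantile=0.9,  # the new argument threaded through pipeline_CB above
)
est.fit(X, y)
q90 = est.predict(X)  # in-sample predictions of the 0.9 quantile
```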
247 changes: 246 additions & 1 deletion cyclic_boosting/regression.py
@@ -2,15 +2,18 @@

import abc
import logging
import warnings

import numexpr
import numpy as np
import six
import sklearn.base
import scipy.special
from scipy.optimize import minimize

from cyclic_boosting.base import CyclicBoostingBase
from cyclic_boosting.link import LogLinkMixin
from cyclic_boosting.utils import continuous_quantile_from_discrete, get_X_column

_logger = logging.getLogger(__name__)

@@ -180,4 +183,246 @@ def calc_parameters(self, feature, y, pred, prefit_data):
return _calc_factors_and_uncertainties(alpha=prefit_data, beta=prediction_sum_of_bins, link_func=self.link_func)


__all__ = ["get_gamma_priors", "CBPoissonRegressor", "CBNBinomRegressor"]
class CBMultiplicativeQuantileRegressor(CBBaseRegressor):
"""
Cyclic Boosting multiplicative quantile-regression mode. While its general
structure allows arbitrary/empirical target ranges/distributions, the
multiplicative model of this mode requires non-negative target values.
A quantile loss, according to the desired quantile to be predicted, is
minimized in each bin of each feature. While binning, feature cycles,
smoothing, and iterations work in the same way as usual in Cyclic Boosting,
the minimization itself is performed via ``scipy.optimize.minimize``
(instead of an analytical solution as in, e.g., ``CBPoissonRegressor``,
``CBNBinomRegressor``, or ``CBLocationRegressor``).

Parameters
----------
quantile : float
quantile to be estimated

See :class:`cyclic_boosting.base` for all other parameters.
"""

def __init__(
self,
feature_groups=None,
feature_properties=None,
weight_column=None,
prior_prediction_column=None,
minimal_loss_change=1e-10,
minimal_factor_change=1e-10,
maximal_iterations=10,
observers=None,
smoother_choice=None,
output_column=None,
learn_rate=None,
quantile=0.5,
aggregate=True,
):
CyclicBoostingBase.__init__(
self,
feature_groups=feature_groups,
feature_properties=feature_properties,
weight_column=weight_column,
prior_prediction_column=prior_prediction_column,
minimal_loss_change=minimal_loss_change,
minimal_factor_change=minimal_factor_change,
maximal_iterations=maximal_iterations,
observers=observers,
smoother_choice=smoother_choice,
output_column=output_column,
learn_rate=learn_rate,
aggregate=aggregate,
)

self.quantile = quantile

def precalc_parameters(self, feature, y, pred):
pass

def loss(self, prediction, y, weights):
"""
Calculation of the in-sample quantile loss (to be exact, costs), potentially
including sample weights, after each full feature cycle (iteration), to be
used as stopping criterion.

Parameters
----------
prediction : np.ndarray
(in-sample) predictions for desired quantile, containing data with `float` type
y : np.ndarray
target variable, containing data with `float` type (potentially discrete)
weights : np.ndarray
optional (otherwise set to 1) sample weights, containing data with `float` type

Returns
-------
float
calculated quantile costs
"""
if not len(y) > 0:
raise ValueError("Loss cannot be computed on empty data")
else:
sum_weighted_error = np.nansum(
(
(y < prediction) * (1 - self.quantile) * (prediction - y)
+ (y >= prediction) * self.quantile * (y - prediction)
)
* weights
)
return sum_weighted_error / np.nansum(weights)
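
The expression in `loss` is the standard pinball (quantile) loss. A small stand-alone check of the formula on toy values (independent of the class):

```python
# Stand-alone check of the pinball-loss expression used in `loss` above.
import numpy as np

def pinball_loss(prediction, y, weights, quantile):
    # Under-predictions are charged quantile, over-predictions (1 - quantile).
    err = ((y < prediction) * (1 - quantile) * (prediction - y)
           + (y >= prediction) * quantile * (y - prediction))
    return np.nansum(err * weights) / np.nansum(weights)

y = np.array([1.0, 2.0, 3.0, 4.0])
prediction = np.full(4, 2.5)
weights = np.ones(4)
# For quantile=0.5 the pinball loss is half the mean absolute error:
print(pinball_loss(prediction, y, weights, 0.5))  # 0.5 (the MAE is 1.0)
```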

def _init_global_scale(self, X, y):
"""
Calculation of the global scale for quantile regression, corresponding
to the (continuous approximation of the) respective quantile of the
target values used in the training.

The exact value of the global scale is not critical for the model
accuracy (as the model has enough parameters to compensate), but a
value not representing a good overall average leads to factors with
averages unequal to 1 for each feature (making interpretation more
difficult).
"""
if self.weights is None:
raise RuntimeError("The weights have to be initialized.")

self.global_scale_link_ = self.link_func(continuous_quantile_from_discrete(y, self.quantile))

if self.prior_prediction_column is not None:
prior_pred = get_X_column(X, self.prior_prediction_column)
finite = np.isfinite(prior_pred)
if not np.all(finite):
_logger.warning(
"Found a total number of {} non-finite values in the prior prediction column".format(
np.sum(~finite)
)
)

prior_pred_mean = np.sum(prior_pred[finite] * self.weights[finite]) / np.sum(self.weights[finite])

prior_pred_link_mean = self.link_func(prior_pred_mean)

if np.isfinite(prior_pred_link_mean):
self.prior_pred_link_offset_ = self.global_scale_link_ - prior_pred_link_mean
else:
warnings.warn(
"The mean prior prediction in link-space is not finite. "
"Therefore no individualization is done "
"and no prior mean subtraction is necessary."
)
self.prior_pred_link_offset_ = float(self.global_scale_link_)

def quantile_costs(self, param, yhat_others, y, weights):
"""
Calculation of the in-sample quantile costs (potentially including
sample weights) for individual feature bins according to a quantile
loss function, to be minimized subsequently.

Parameters
----------
param : float
Factor to be estimated for the feature bin at hand.
yhat_others : np.ndarray
(in-sample) predictions of all other features (excluding the one at
hand) for the bin at hand, containing data with `float` type
y : np.ndarray
target variable, containing data with `float` type (potentially discrete)
weights : np.ndarray
optional (otherwise set to 1) sample weights, containing data with `float` type

Returns
-------
float
calculated quantile costs
"""
quantile_loss = (y < (param * yhat_others)) * (1 - self.quantile) * (param * yhat_others - y) + (
y >= (param * yhat_others)
) * self.quantile * (y - param * yhat_others)
sum_weighted_error = np.nansum(quantile_loss * weights)
return sum_weighted_error / np.nansum(weights)

def optimization(self, y, yhat_others, weights):
"""
Minimization of the quantile costs (potentially including sample
weights) for individual feature bins. The initial value for the factors
is set to 1 (neutral value for multiplicative model).

Parameters
----------
y : np.ndarray
target variable, containing data with `float` type (potentially discrete)
yhat_others : np.ndarray
(in-sample) predictions from all other features (excluding the one
at hand) for the bin at hand, containing data with `float` type
weights : np.ndarray
optional (otherwise set to 1) sample weights, containing data with `float` type

Returns
-------
float, float
estimated parameter (factor) and its uncertainty
"""
res = minimize(self.quantile_costs, 1, args=(yhat_others, y, weights))
# use moment-matching of a Gamma posterior with a log-normal
# distribution as approximation
uncertainty = np.sqrt(np.log(1 + 2 + np.sum(y)) - np.log(2 + np.sum(y)))
return res.x, uncertainty
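
A toy illustration of the per-bin fit done by `quantile_costs` and `optimization`: `scipy.optimize.minimize` searches for the multiplicative factor applied to the predictions of all other features (all values below are assumed):

```python
# Illustrative per-bin factor fit, mirroring quantile_costs/optimization.
import numpy as np
from scipy.optimize import minimize

quantile = 0.5
y = np.array([2.0, 4.0, 9.0])    # targets falling into one feature bin
yhat_others = np.full(3, 2.0)    # predictions of all other features
weights = np.ones(3)

def quantile_costs(param, yhat_others, y, weights):
    pred = param * yhat_others
    loss = ((y < pred) * (1 - quantile) * (pred - y)
            + (y >= pred) * quantile * (y - pred))
    return np.nansum(loss * weights) / np.nansum(weights)

res = minimize(quantile_costs, 1, args=(yhat_others, y, weights))
# For the median, the optimal factor scales 2.0 to the sample median 4.0:
print(res.x)  # approximately [2.0]
```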

def calc_parameters(self, feature, y, pred, prefit_data):
"""
Calling of the optimization (quantile loss minimization) for the
different bins of the feature at hand. In contrast to the analytical
solution in most other Cyclic Boosting modes (e.g.,
``CBPoissonRegressor``), which work simply via bin statistics
(`bincount`), the optimization here requires a dedicated loss function
to be evaluated on the individual observations of each bin.

Parameters
----------
feature : :class:`Feature`
feature for which the parameters of each bin are estimated
y : np.ndarray
target variable, containing data with `float` type (potentially
discrete)
pred : np.ndarray
(in-sample) predictions from all other features (excluding the one
at hand), containing data with `float` type
prefit_data
data returned by :meth:`~.precalc_parameters` during fit, not used
here

Returns
-------
np.ndarray, np.ndarray
estimated parameters (factors, in link space) and their uncertainties for all bins
"""
sorting = feature.lex_binned_data.argsort()
sorted_bins = feature.lex_binned_data[sorting]
splits_indices = np.unique(sorted_bins, return_index=True)[1][1:]

y_pred = np.hstack((y[..., np.newaxis], self.unlink_func(pred.predict_link())[..., np.newaxis]))
y_pred = np.hstack((y_pred, self.weights[..., np.newaxis]))
y_pred_bins = np.split(y_pred[sorting], splits_indices)

n_bins = len(y_pred_bins)
factors = np.zeros(n_bins)
uncertainties = np.zeros(n_bins)

for bin in range(n_bins):
factors[bin], uncertainties[bin] = self.optimization(
y_pred_bins[bin][:, 0], y_pred_bins[bin][:, 1], y_pred_bins[bin][:, 2]
)

if n_bins + 1 == feature.n_bins:
factors = np.append(factors, 1)
uncertainties = np.append(uncertainties, 0)

epsilon = 1e-5
factors = np.where(factors < epsilon, epsilon, factors)
return np.log(factors), uncertainties


__all__ = ["get_gamma_priors", "CBPoissonRegressor", "CBNBinomRegressor", "CBMultiplicativeQuantileRegressor"]
27 changes: 27 additions & 0 deletions cyclic_boosting/utils.py
@@ -7,6 +7,7 @@
import numpy as np
import pandas as pd
import six
import bisect

_logger = logging.getLogger(__name__)

@@ -974,3 +975,29 @@ def get_feature_column_names(X, exclude_columns=[]):
if col in features:
features.remove(col)
return features


def continuous_quantile_from_discrete(y, quantile):
"""
Calculates a continuous approximation of a given quantile from an array of potentially discrete values.

Parameters
----------
y : np.ndarray
containing data with `float` type (potentially discrete)
quantile : float
desired quantile

Returns
-------
float
calculated quantile value
"""
sorted_y = np.sort(y)
quantile_index = int(quantile * (len(y) - 1))
quantile_y = sorted_y[quantile_index]
index_low = bisect.bisect_left(sorted_y, quantile_y)
index_high = bisect.bisect_right(sorted_y, quantile_y)
if index_high > index_low:
quantile_y += (quantile_index - index_low) / (index_high - index_low)
return quantile_y
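
A worked example of the tie handling above: with duplicated values, the fractional offset interpolates within the run of equal entries (input values assumed):

```python
# Worked example for continuous_quantile_from_discrete with tied values.
import numpy as np
from cyclic_boosting.utils import continuous_quantile_from_discrete

y = np.array([1.0, 2.0, 2.0, 2.0, 3.0])
# quantile_index = int(0.5 * 4) = 2 -> sorted_y[2] = 2.0; the run of 2.0s
# spans sorted indices [1, 4), so the offset is (2 - 1) / (4 - 1) = 1/3.
print(continuous_quantile_from_discrete(y, 0.5))  # 2.0 + 1/3 ≈ 2.333
```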
