Skip to content

Commit

Permalink
several small bug fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
FelixWick committed Mar 27, 2023
1 parent fd834e3 commit 3e09a6c
Show file tree
Hide file tree
Showing 4 changed files with 294 additions and 25 deletions.
2 changes: 1 addition & 1 deletion cyclic_boosting/GBSregression.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class CBGBSRegressor(RegressorMixin, CyclicBoostingBase, IdentityLinkMixin):
the regression of the outcome of a previous statistical subtraction of two
classes of observations from each other (e.g. groups A and B: A - B).
For this, the target y has to be set to positve values for group A and
For this, the target y has to be set to positive values for group A and
negative values for group B.
Additional Parameter
Expand Down
5 changes: 3 additions & 2 deletions cyclic_boosting/nbinom.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def calc_parameters(self, feature, y, pred, prefit_data):

def loss(self, c, y, weights):
# TODO: use weights
return loss_nbinom_c(y, self.mu, c, self.gamma)
return loss_nbinom_c(y.astype(np.float64), self.mu, c, self.gamma)

def fit(self, X, y=None, fit_mode=0):
self.mu = X[self.mean_prediction_column].values
Expand Down Expand Up @@ -249,7 +249,8 @@ def compute_2d_loss(

for i in nb.prange(n_new_c):
loss[i] = binned_loss_nbinom_c(
y, mu, c_link, binnumbers, minlength, gamma, new_c_link[i]
y.astype(np.float64), mu, c_link, binnumbers, minlength, gamma,
new_c_link[i]
)

return loss
Expand Down
124 changes: 104 additions & 20 deletions cyclic_boosting/pipelines.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from cyclic_boosting import CBClassifier, CBLocationRegressor, \
CBExponential, CBNBinomRegressor, CBPoissonRegressor, CBLocPoissonRegressor, CBNBinomC, CBClassifier, CBGBSRegressor, binning
from cyclic_boosting import CBLocationRegressor, CBExponential, \
CBNBinomRegressor, CBPoissonRegressor, CBLocPoissonRegressor, CBNBinomC, \
CBClassifier, CBGBSRegressor, binning

from sklearn.pipeline import Pipeline


def pipeline_CB(
regressor=None,
estimator=None,
feature_groups=None,
feature_properties=None,
weight_column=None,
Expand All @@ -19,26 +20,109 @@ def pipeline_CB(
learn_rate=None,
number_of_bins=100,
aggregate=True,
a=1.0,
c=0.0,
external_colname=None,
standard_feature_groups=None,
external_feature_groups=None,
var_prior_exponent=0.1,
prior_exponent_colname=None,
mean_prediction_column=None,
gamma=0.0,
bayes=False,
n_steps=15,
regalpha=0.0,
):

estimator=regressor(
feature_groups=feature_groups,
feature_properties=feature_properties,
weight_column=weight_column,
prior_prediction_column=prior_prediction_column,
minimal_loss_change=minimal_loss_change,
minimal_factor_change=minimal_factor_change,
maximal_iterations=maximal_iterations,
observers=observers,
smoother_choice=smoother_choice,
output_column=output_column,
learn_rate=learn_rate,
aggregate=aggregate,
)

if estimator in [CBPoissonRegressor, CBLocPoissonRegressor,
CBLocationRegressor, CBClassifier]:
estimatorCB = estimator(
feature_groups=feature_groups,
feature_properties=feature_properties,
weight_column=weight_column,
prior_prediction_column=prior_prediction_column,
minimal_loss_change=minimal_loss_change,
minimal_factor_change=minimal_factor_change,
maximal_iterations=maximal_iterations,
observers=observers,
smoother_choice=smoother_choice,
output_column=output_column,
learn_rate=learn_rate,
aggregate=aggregate,
)
elif estimator == CBNBinomRegressor:
estimatorCB = estimator(
feature_groups=feature_groups,
feature_properties=feature_properties,
weight_column=weight_column,
prior_prediction_column=prior_prediction_column,
minimal_loss_change=minimal_loss_change,
minimal_factor_change=minimal_factor_change,
maximal_iterations=maximal_iterations,
observers=observers,
smoother_choice=smoother_choice,
output_column=output_column,
learn_rate=learn_rate,
aggregate=aggregate,
a=a,
c=c,
)
elif estimator == CBExponential:
estimatorCB = estimator(
external_colname=external_colname,
standard_feature_groups=standard_feature_groups,
external_feature_groups=external_feature_groups,
feature_properties=feature_properties,
weight_column=weight_column,
prior_prediction_column=prior_prediction_column,
minimal_loss_change=minimal_loss_change,
minimal_factor_change=minimal_factor_change,
maximal_iterations=maximal_iterations,
observers=observers,
smoother_choice=smoother_choice,
output_column=output_column,
learn_rate=learn_rate,
var_prior_exponent=var_prior_exponent,
prior_exponent_colname=prior_exponent_colname,
)
elif estimator == CBNBinomC:
estimatorCB = estimator(
mean_prediction_column=mean_prediction_column,
feature_groups=feature_groups,
feature_properties=feature_properties,
weight_column=weight_column,
prior_prediction_column=prior_prediction_column,
minimal_loss_change=minimal_loss_change,
minimal_factor_change=minimal_factor_change,
maximal_iterations=maximal_iterations,
observers=observers,
smoother_choice=smoother_choice,
output_column=output_column,
learn_rate=learn_rate,
gamma=gamma,
bayes=bayes,
n_steps=n_steps,
)
elif estimator == CBGBSRegressor:
estimatorCB = estimator(
feature_groups=feature_groups,
feature_properties=feature_properties,
weight_column=weight_column,
minimal_loss_change=minimal_loss_change,
minimal_factor_change=minimal_factor_change,
maximal_iterations=maximal_iterations,
observers=observers,
smoother_choice=smoother_choice,
output_column=output_column,
learn_rate=learn_rate,
aggregate=aggregate,
regalpha=regalpha,
)
else:
raise Exception("No valid CB estimator.")
binner = binning.BinNumberTransformer(n_bins=number_of_bins, feature_properties=feature_properties)

return Pipeline([("binning", binner), ("CB", estimator)])
return Pipeline([("binning", binner), ("CB", estimatorCB)])


def pipeline_CBPoissonRegressor(**kwargs):
Expand Down Expand Up @@ -87,4 +171,4 @@ def pipeline_CBGBSRegressor(**kwargs):
"""
Convenience function containing CBGBSRegressor (estimator) + binning.
"""
return pipeline_CB(CBGBSRegressor,**kwargs)
return pipeline_CB(CBGBSRegressor,**kwargs)
188 changes: 186 additions & 2 deletions tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
IsotonicRegressor
from cyclic_boosting.plots import plot_analysis
from cyclic_boosting.pipelines import pipeline_CBPoissonRegressor, \
pipeline_CBClassifier, pipeline_CBLocationRegressor
pipeline_CBClassifier, pipeline_CBLocationRegressor, pipeline_CBExponential, \
pipeline_CBNBinomRegressor, pipeline_CBNBinomC, pipeline_CBGBSRegressor


def plot_CB(filename, plobs, binner):
Expand Down Expand Up @@ -77,7 +78,8 @@ def cb_poisson_regressor_model():
}

plobs = [
observers.PlottingObserver(iteration=-1)
observers.PlottingObserver(iteration=1),
observers.PlottingObserver(iteration=-1),
]

CB_pipeline = pipeline_CBPoissonRegressor(
Expand All @@ -103,6 +105,8 @@ def test_poisson_regression():

CB_est = cb_poisson_regressor_model()
CB_est.fit(X.copy(), y)
# plot_CB('analysis_CB_iterfirst',
# [CB_est[-1].observers[0]], CB_est[-2])
# plot_CB('analysis_CB_iterlast',
# [CB_est[-1].observers[-1]], CB_est[-2])

Expand Down Expand Up @@ -140,6 +144,95 @@ def test_poisson_regression_default_features():
np.testing.assert_almost_equal(mad, 1.7185, 3)


def test_nbinom_regression_default_features():
    """Fit the negative-binomial pipeline without explicit feature groups.

    The pipeline is built with ``a=1.2`` and ``c=0.1``, fitted on a fixed
    column subset of the integration data, and the in-sample mean absolute
    deviation is compared with a pinned reference value.
    """
    np.random.seed(42)

    raw = pd.read_csv("./tests/integration_test_data.csv")
    X, y = prepare_data(raw)

    selected_columns = [
        'dayofweek',
        'L_ID',
        'PG_ID_3',
        'P_ID',
        'PROMOTION_TYPE',
        'price_ratio',
        'dayofyear',
    ]
    X = X[selected_columns]

    pipeline = pipeline_CBNBinomRegressor(
        feature_properties=feature_properties(),
        a=1.2,
        c=0.1,
    )
    # Copies are handed over so the local frame is not modified by the
    # pipeline steps.
    pipeline.fit(X.copy(), y)
    prediction = pipeline.predict(X.copy())

    mean_abs_dev = np.nanmean(np.abs(y - prediction))
    np.testing.assert_almost_equal(mean_abs_dev, 1.7198, 3)


def cb_exponential_regressor_model():
    """Build the CBExponential pipeline used by the exponential regression test.

    ``price_ratio`` is passed as the external column, so it is removed from
    the standard feature groups; a separate list of features is supplied as
    the external feature groups.

    Returns the pipeline produced by ``pipeline_CBExponential``.
    """
    standard_features = get_features()
    standard_features.remove('price_ratio')

    external_features = [
        'L_ID', 'PG_ID_1', 'PG_ID_2', 'PG_ID_3', 'P_ID', 'dayofweek',
    ]

    properties = feature_properties()

    smoother_overrides = {
        ('dayofyear',): SeasonalSmoother(order=3),
        ('price_ratio',): IsotonicRegressor(increasing=False),
    }

    # One observer for iteration 1 and one for iteration -1.
    plotting_observers = [
        observers.PlottingObserver(iteration=it) for it in (1, -1)
    ]

    smoother_choice = common_smoothers.SmootherChoiceGroupBy(
        use_regression_type=True,
        use_normalization=False,
        explicit_smoothers=smoother_overrides,
    )

    return pipeline_CBExponential(
        feature_properties=properties,
        standard_feature_groups=standard_features,
        external_feature_groups=external_features,
        external_colname='price_ratio',
        observers=plotting_observers,
        maximal_iterations=50,
        smoother_choice=smoother_choice,
    )


def test_exponential_regression():
    """Fit the CBExponential pipeline and check the in-sample MAD.

    The mean absolute deviation between target and prediction is compared
    with a pinned reference value.
    """
    np.random.seed(42)

    raw = pd.read_csv("./tests/integration_test_data.csv")
    X, y = prepare_data(raw)

    # NOTE(review): an elementwise ``== np.nan`` comparison is False for
    # every row (NaN never compares equal to anything), so this mask selects
    # nothing and the assignment below changes no values. If the intent was
    # to set missing price ratios to 1., the mask needs ``.isna()`` instead.
    # The reference value asserted at the end was presumably produced with
    # the current (no-op) behaviour and would have to be re-baselined
    # together with such a fix -- confirm before changing.
    nan_mask = raw['price_ratio'] == np.nan
    X.loc[nan_mask, 'price_ratio'] = 1.

    pipeline = cb_exponential_regressor_model()
    pipeline.fit(X.copy(), y)
    # Manual inspection of the first / last iteration plots:
    # plot_CB('analysis_CB_iterfirst',
    #         [pipeline[-1].observers[0]], pipeline[-2])
    # plot_CB('analysis_CB_iterlast',
    #         [pipeline[-1].observers[-1]], pipeline[-2])

    prediction = pipeline.predict(X.copy())

    mean_abs_dev = np.nanmean(np.abs(y - prediction))
    np.testing.assert_almost_equal(mean_abs_dev, 1.7203, 3)


def cb_classifier_model():
features = get_features()

Expand Down Expand Up @@ -217,3 +310,94 @@ def test_location_regression_default_features():

mad = np.nanmean(np.abs(y - yhat))
np.testing.assert_almost_equal(mad, 1.7511, 3)


def cb_width_model():
    """Build the CBNBinomC pipeline used by the width regression test.

    The estimator is configured with ``mean_prediction_column='yhat_mean'``,
    so the input frame must carry that column when the pipeline is used.

    Returns the pipeline produced by ``pipeline_CBNBinomC``.
    """
    feature_groups = ['dayofweek', 'L_ID', 'PG_ID_3', 'PROMOTION_TYPE']
    properties = feature_properties()

    # No per-feature smoother overrides for this model.
    smoother_overrides = {}

    plotting_observers = [observers.PlottingObserver(iteration=-1)]

    smoother_choice = common_smoothers.SmootherChoiceGroupBy(
        use_regression_type=True,
        use_normalization=False,
        explicit_smoothers=smoother_overrides,
    )

    return pipeline_CBNBinomC(
        mean_prediction_column='yhat_mean',
        feature_properties=properties,
        feature_groups=feature_groups,
        observers=plotting_observers,
        maximal_iterations=50,
        smoother_choice=smoother_choice,
    )


def test_width_regression_default_features():
    """Fit a mean model, then the width model on top of its predictions.

    A Poisson pipeline is fitted first; its in-sample predictions are stored
    in the ``yhat_mean`` column, which the pipeline from ``cb_width_model``
    is configured to use as its mean prediction column. The mean of the
    width model's predictions is compared with a pinned reference value.
    """
    np.random.seed(42)

    df = pd.read_csv("./tests/integration_test_data.csv")

    X, y = prepare_data(df)
    # Take an explicit copy of the column subset: a new column is added to
    # this frame further down, and assigning into a frame obtained by
    # selecting from another one can emit pandas' SettingWithCopyWarning.
    X = X[[
        'dayofweek',
        'L_ID',
        'PG_ID_3',
        'P_ID',
        'PROMOTION_TYPE',
        'price_ratio',
        'dayofyear'
    ]].copy()

    fp = feature_properties()
    CB_est = pipeline_CBPoissonRegressor(
        feature_properties=fp
    )
    CB_est.fit(X.copy(), y)
    yhat = CB_est.predict(X.copy())
    X['yhat_mean'] = yhat

    CB_est_width = cb_width_model()
    CB_est_width.fit(X.copy(), y)
    c = CB_est_width.predict(X.copy())
    np.testing.assert_almost_equal(c.mean(), 0.365, 3)


def test_GBS_regression_default_features():
    """Fit the GBS regression pipeline without explicit feature groups.

    The sign of the target is flipped on the slice ``1000:10000`` before
    fitting, and the in-sample mean absolute deviation is compared with a
    pinned reference value.
    """
    np.random.seed(42)

    raw = pd.read_csv("./tests/integration_test_data.csv")
    X, y = prepare_data(raw)

    selected_columns = [
        'dayofweek',
        'L_ID',
        'PG_ID_3',
        'P_ID',
        'PROMOTION_TYPE',
        'price_ratio',
        'dayofyear',
    ]
    X = X[selected_columns]

    # Negate the target on a fixed slice.
    flipped = slice(1000, 10000)
    y[flipped] = -y[flipped]

    # Manual inspection: create
    #     plobs = [observers.PlottingObserver(iteration=-1)]
    # and pass ``observers=plobs`` to the pipeline below.
    pipeline = pipeline_CBGBSRegressor(
        feature_properties=feature_properties()
    )
    pipeline.fit(X.copy(), y)
    # plot_CB('analysis_CB_iterlast',
    #         [pipeline[-1].observers[-1]], pipeline[-2])

    prediction = pipeline.predict(X.copy())

    mean_abs_dev = np.nanmean(np.abs(y - prediction))
    np.testing.assert_almost_equal(mean_abs_dev, 2.5755, 3)

0 comments on commit 3e09a6c

Please sign in to comment.