updates for predict contributions
SimonRobertPike committed Sep 23, 2024
1 parent 3ba04b7 commit 6f24901
Showing 3 changed files with 92 additions and 34 deletions.
105 changes: 72 additions & 33 deletions lightgbmlss/distributions/distribution_utils.py
@@ -43,6 +43,7 @@ class DistributionClass:
     penalize_crossing: bool
         Whether to include a penalty term to discourage crossing of expectiles. Only used for Expectile distribution.
     """
+
     def __init__(self,
                  distribution: torch.distributions.Distribution = None,
                  univariate: bool = True,
@@ -375,51 +376,89 @@ def predict_dist(self,
         Predictions.
         """

+        kwargs = dict()
+        if pred_type == "contributions":
+            kwargs["pred_contrib"] = True
+            n_outputs_per_dist = data.shape[1] + 1
+        else:
+            n_outputs_per_dist = 1
+
         predt = torch.tensor(
-            booster.predict(data, raw_score=True),
+            booster.predict(data, raw_score=True, **kwargs),
            dtype=torch.float32
-        ).reshape(-1, self.n_dist_param)
+        ).reshape(-1, self.n_dist_param * n_outputs_per_dist)
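
Note (editor's sketch, not part of the commit): with pred_contrib=True, LightGBM returns a SHAP-style decomposition: for each model output, one contribution per feature plus a trailing bias column. Each distributional parameter therefore occupies data.shape[1] + 1 columns of the raw prediction, which is why the reshape target becomes n_dist_param * n_outputs_per_dist. A layout sketch with hypothetical sizes:

```python
import numpy as np

n_rows, n_features, n_dist_param = 5, 3, 2       # hypothetical sizes
n_outputs_per_dist = n_features + 1              # per-feature terms + bias

# booster.predict(data, raw_score=True, pred_contrib=True) is assumed to return
# one flat block of (n_features + 1) columns per parameter, concatenated:
raw = np.zeros((n_rows, n_dist_param * n_outputs_per_dist))

first_param_block = raw[:, :n_outputs_per_dist]   # contributions + bias, parameter 0
second_param_block = raw[:, n_outputs_per_dist:]  # contributions + bias, parameter 1
assert first_param_block.shape == (n_rows, n_features + 1)
```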

         # Set init_score as starting point for each distributional parameter.
         init_score_pred = torch.tensor(
-            np.ones(shape=(data.shape[0], 1))*start_values,
+            np.ones(shape=(data.shape[0], 1)) * start_values,
             dtype=torch.float32
         )

-        # The predictions don't include the init_score specified in creating the train data.
-        # Hence, it needs to be added manually with the corresponding transform for each distributional parameter.
-        dist_params_predt = np.concatenate(
-            [
-                response_fun(
-                    predt[:, i].reshape(-1, 1) + init_score_pred[:, i].reshape(-1, 1)).numpy()
-                for i, (dist_param, response_fun) in enumerate(self.param_dict.items())
-            ],
-            axis=1,
-        )
-        dist_params_predt = pd.DataFrame(dist_params_predt)
-        dist_params_predt.columns = self.param_dict.keys()
+        if pred_type == "contributions":
+            CONST_COL = "Const"
+            COLUMN_LEVELS = ["distribution_arg", "FeatureContribution"]
+
+            feature_columns = data.columns.tolist() + [CONST_COL]
+            contributions_predt = pd.DataFrame(
+                predt,
+                columns=pd.MultiIndex.from_product(
+                    [self.distribution_arg_names, feature_columns],
+                    names=COLUMN_LEVELS
+                ),
+                index=data.index,
+            )
+
+            init_score_pred_df = pd.DataFrame(
+                init_score_pred,
+                columns=pd.MultiIndex.from_product(
+                    [self.distribution_arg_names, [CONST_COL]],
+                    names=COLUMN_LEVELS
+                ),
+                index=data.index
+            )
+            contributions_predt[init_score_pred_df.columns] = (
+                contributions_predt[init_score_pred_df.columns] + init_score_pred_df
+            )
+            # The response function can't be applied to individual feature
+            # contributions, so the raw (link-scale) contributions are returned.
+            return contributions_predt
+        else:
+            # The predictions don't include the init_score specified in creating the train data.
+            # Hence, it needs to be added manually with the corresponding transform for each distributional parameter.
+            dist_params_predt = np.concatenate(
+                [
+                    response_fun(
+                        predt[:, i].reshape(-1, 1) + init_score_pred[:, i].reshape(-1, 1)).numpy()
+                    for i, (dist_param, response_fun) in enumerate(self.param_dict.items())
+                ],
+                axis=1,
+            )
+            dist_params_predt = pd.DataFrame(dist_params_predt)
+            dist_params_predt.columns = self.param_dict.keys()
+
-        # Draw samples from predicted response distribution
-        pred_samples_df = self.draw_samples(predt_params=dist_params_predt,
-                                            n_samples=n_samples,
-                                            seed=seed)
-
-        if pred_type == "parameters":
-            return dist_params_predt
-
-        elif pred_type == "expectiles":
-            return dist_params_predt
-
-        elif pred_type == "samples":
-            return pred_samples_df
-
-        elif pred_type == "quantiles":
-            # Calculate quantiles from predicted response distribution
-            pred_quant_df = pred_samples_df.quantile(quantiles, axis=1).T
-            pred_quant_df.columns = [str("quant_") + str(quantiles[i]) for i in range(len(quantiles))]
-            if self.discrete:
-                pred_quant_df = pred_quant_df.astype(int)
-            return pred_quant_df
+            if pred_type == "parameters":
+                return dist_params_predt
+
+            elif pred_type == "expectiles":
+                return dist_params_predt
+
+            else:
+                # Draw samples from predicted response distribution
+                pred_samples_df = self.draw_samples(predt_params=dist_params_predt,
+                                                    n_samples=n_samples,
+                                                    seed=seed)
+
+                if pred_type == "samples":
+                    return pred_samples_df
+
+                elif pred_type == "quantiles":
+                    # Calculate quantiles from predicted response distribution
+                    pred_quant_df = pred_samples_df.quantile(quantiles, axis=1).T
+                    pred_quant_df.columns = [str("quant_") + str(quantiles[i]) for i in range(len(quantiles))]
+                    if self.discrete:
+                        pred_quant_df = pred_quant_df.astype(int)
+                    return pred_quant_df
+
+                else:
+                    raise RuntimeError(f"{pred_type=} not supported")
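
Note (editor's sketch, not part of the commit): contributions are returned on the raw (link) scale because a nonlinear response function cannot be distributed across per-feature terms. Summing a parameter's columns, Const included, and then applying that parameter's response function should reproduce the pred_type="parameters" output. A minimal sketch, assuming the response functions accept torch tensors as elsewhere in this class:

```python
import torch

# `contributions_predt` and `self.param_dict` are as in the code above.
recovered = {}
for name, response_fun in self.param_dict.items():
    # Sum per-feature contributions plus the Const column on the raw scale ...
    raw_sum = contributions_predt.xs(name, level="distribution_arg", axis=1).sum(axis=1)
    # ... then apply the parameter's response function to the total.
    recovered[name] = response_fun(
        torch.tensor(raw_sum.to_numpy(), dtype=torch.float32)
    ).numpy()
```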
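Note: the rewritten dispatch draws samples only when "samples" or "quantiles" are requested, and unknown pred_type values now fail fast with a RuntimeError instead of falling through and returning None. A self-contained sketch of the quantile step, on a hypothetical samples frame:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(123)
pred_samples_df = pd.DataFrame(rng.normal(size=(4, 1000)))  # one row per observation, one column per sample
quantiles = [0.1, 0.5, 0.9]

# quantile(..., axis=1) yields one row per requested quantile; transpose back
# so rows are observations and columns are the requested quantiles.
pred_quant_df = pred_samples_df.quantile(quantiles, axis=1).T
pred_quant_df.columns = [f"quant_{q}" for q in quantiles]
```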

     def compute_gradients_and_hessians(self,
                                        loss: torch.tensor,
@@ -635,7 +674,7 @@ def dist_select(self,
             try:
                 loss, params = dist_sel.calculate_start_values(target=target.reshape(-1, 1), max_iter=max_iter)
                 fit_df = pd.DataFrame.from_dict(
-                    {self.loss_fn: loss.reshape(-1,),
+                    {self.loss_fn: loss.reshape(-1, ),
                      "distribution": str(dist_name),
                      "params": [params]
                      }
1 change: 1 addition & 0 deletions lightgbmlss/model.py
@@ -452,6 +452,7 @@ def predict(self,
- "quantiles" calculates the quantiles from the predicted distribution.
- "parameters" returns the predicted distributional parameters.
- "expectiles" returns the predicted expectiles.
- "contributions" returns constibutions of each feature and a constant by calling booster.predict(pred_contrib=True)
n_samples : int
Number of samples to draw from the predicted distribution.
quantiles : List[float]
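
Note (editor's sketch, not part of the commit): a hypothetical call showing the shape of the new output; lgblss is a fitted LightGBMLSS model and X_test a pandas DataFrame:

```python
contrib = lgblss.predict(X_test, pred_type="contributions")

# Columns form a two-level MultiIndex: (distribution_arg, FeatureContribution);
# each parameter gets one column per feature plus a "Const" column that holds
# the model bias plus the init_score.
print(contrib.columns.names)  # ['distribution_arg', 'FeatureContribution']
print(contrib.xs("loc", level="distribution_arg", axis=1).head())  # e.g. Gaussian "loc"
```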
20 changes: 19 additions & 1 deletion tests/test_model/test_model.py
@@ -1,3 +1,5 @@
+import numpy as np
+
 from lightgbmlss.model import *
 from lightgbmlss.distributions.Gaussian import *
 from lightgbmlss.distributions.Mixture import *
@@ -6,6 +8,7 @@
 from lightgbmlss.datasets.data_loader import load_simulated_gaussian_data
 import pytest
 from pytest import approx
+from lightgbmlss.utils import identity_fn


@pytest.fixture
@@ -109,7 +112,7 @@ def test_model_univ_train_eval(self, univariate_data, univariate_lgblss, univari
         # Assertions
         assert isinstance(lgblss.booster, lgb.Booster)

-    def test_model_hpo(self, univariate_data, univariate_lgblss,):
+    def test_model_hpo(self, univariate_data, univariate_lgblss, ):
         # Unpack
         dtrain, _, _, _ = univariate_data
         lgblss = univariate_lgblss
@@ -155,6 +158,7 @@ def test_model_predict(self, univariate_data, univariate_lgblss, univariate_para
         pred_params = lgblss.predict(X_test, pred_type="parameters")
         pred_samples = lgblss.predict(X_test, pred_type="samples", n_samples=n_samples)
         pred_quantiles = lgblss.predict(X_test, pred_type="quantiles", quantiles=quantiles)
+        pred_contributions = lgblss.predict(X_test, pred_type="contributions")

         # Assertions
         assert isinstance(pred_params, (pd.DataFrame, type(None)))
@@ -173,6 +177,20 @@
         assert not np.isinf(pred_quantiles).any().any()
         assert pred_quantiles.shape[1] == len(quantiles)

+        assert isinstance(pred_contributions, (pd.DataFrame, type(None)))
+        assert not pred_contributions.isna().any().any()
+        assert not np.isinf(pred_contributions).any().any()
+        assert (pred_contributions.shape[1] ==
+                lgblss.dist.n_dist_param * (X_test.shape[1] + 1)
+                )
+
+        for key, func in lgblss.dist.param_dict.items():
+            if func == identity_fn:
+                assert np.allclose(
+                    pred_contributions.xs(key, level="distribution_arg", axis=1).sum(axis=1),
+                    pred_params[key], atol=1e-5
+                )
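
Note (editor's sketch, not part of the commit): the test checks only identity links because the contributions are on the raw score scale; for an identity response function the parameter equals the raw sum directly. For a non-identity link, the parameter would instead be recovered by transforming the summed contributions, e.g. if "scale" used an exponential response function (an assumption, not asserted by this commit):

```python
import numpy as np

# Hypothetical: assumes "scale" uses an exp response function.
raw_sum = pred_contributions.xs("scale", level="distribution_arg", axis=1).sum(axis=1)
assert np.allclose(np.exp(raw_sum), pred_params["scale"], atol=1e-5)
```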

     def test_model_plot(self, univariate_data, univariate_lgblss, univariate_params):
         # Unpack
         dtrain, dtest, _, X_test = univariate_data