Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 70 additions & 0 deletions flaml/automl/automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,11 @@ def custom_metric(
}
```
mlflow_logging: boolean, default=True | Whether to log the training results to mlflow. Not valid if mlflow is not installed.
multioutput_train_size: int, float or None, default=None | For multi-output regression tasks with
"holdout" evaluation, allows manual specification of validation set by concatenating training and
validation data and specifying where to split. If int, represents the number of samples in the
training set. If float (between 0.0 and 1.0), represents the proportion of the dataset to include
in the training set. If None, no split is performed. Only used when X_val and y_val are not provided.

"""
if ERROR:
Expand Down Expand Up @@ -419,6 +424,7 @@ def custom_metric(
settings["custom_hp"] = settings.get("custom_hp", {})
settings["skip_transform"] = settings.get("skip_transform", False)
settings["mlflow_logging"] = settings.get("mlflow_logging", True)
settings["multioutput_train_size"] = settings.get("multioutput_train_size", None)

self._estimator_type = "classifier" if settings["task"] in CLASSIFICATION else "regressor"
self.best_run_id = None
Expand Down Expand Up @@ -1764,6 +1770,42 @@ def metric_constraints(self) -> list:
"""
return self._metric_constraints

def _train_val_split(self, X, y, train_size):
    """Split concatenated training and validation data at a fixed point.

    Args:
        X: Combined training and validation features (sliceable, with len()).
        y: Combined training and validation labels, aligned with X.
        train_size: int or float - if int, number of samples for the training
            set; if float (strictly between 0.0 and 1.0), proportion of
            samples for the training set.

    Returns:
        X_train, X_val, y_train, y_val

    Raises:
        TypeError: If train_size is not an int or float (bool is rejected).
        ValueError: If the resulting split would leave the training or
            validation set empty.
    """
    n_samples = len(X)

    # Validate train_size and normalize it to an absolute sample count.
    # NOTE: bool is a subclass of int, so reject it explicitly up front.
    if isinstance(train_size, bool):
        raise TypeError(f"train_size must be int or float, got {type(train_size).__name__}")
    if isinstance(train_size, float):
        if not 0.0 < train_size < 1.0:
            raise ValueError(f"train_size as a float must be between 0.0 and 1.0, got {train_size}")
        train_size = int(n_samples * train_size)
    elif not isinstance(train_size, int):
        raise TypeError(f"train_size must be int or float, got {type(train_size).__name__}")

    # Check AFTER the float conversion: a small fraction can truncate to 0
    # (e.g. int(10 * 0.05) == 0), which would otherwise yield an empty
    # training set. Both sides of the split must get at least one sample.
    if train_size <= 0 or train_size >= n_samples:
        raise ValueError(f"train_size as an integer must be between 1 and {n_samples - 1}, got {train_size}")

    X_train = X[:train_size]
    X_val = X[train_size:]
    y_train = y[:train_size]
    y_val = y[train_size:]

    return X_train, X_val, y_train, y_val

def _prepare_data(self, eval_method, split_ratio, n_splits):
self._state.task.prepare_data(
self._state,
Expand Down Expand Up @@ -1837,6 +1879,7 @@ def fit(
mlflow_logging=None,
fit_kwargs_by_estimator=None,
mlflow_exp_name=None,
multioutput_train_size=None,
**fit_kwargs,
):
"""Find a model for a given task.
Expand Down Expand Up @@ -2154,6 +2197,11 @@ def cv_score_agg_func(val_loss_folds, log_metrics_folds):
}
}
```
multioutput_train_size: int, float or None, default=None | For multi-output regression tasks with
"holdout" evaluation, allows manual specification of validation set by concatenating training and
validation data and specifying where to split. If int, represents the number of samples in the
training set. If float (between 0.0 and 1.0), represents the proportion of the dataset to include
in the training set. If None, no split is performed. Only used when X_val and y_val are not provided.

**fit_kwargs: Other key word arguments to pass to fit() function of
the searched learners, such as sample_weight. Below are a few examples of
Expand Down Expand Up @@ -2356,6 +2404,28 @@ def cv_score_agg_func(val_loss_folds, log_metrics_folds):
self.mlflow_integration.only_history = True
except KeyError:
logger.info("Not in Fabric, Skipped")

# Handle multioutput_train_size parameter
multioutput_train_size = (
self._settings.get("multioutput_train_size") if multioutput_train_size is None else multioutput_train_size
)
if multioutput_train_size is not None:
if X_val is None and y_val is None:
# Warn if not using holdout evaluation
if eval_method not in ["auto", "holdout", None]:
logger.warning(
f"multioutput_train_size is intended for use with 'holdout' evaluation method, "
f"but eval_method={eval_method}. The split may be overridden during data preparation."
)
# Split the concatenated training data into train and validation sets
X_train, X_val, y_train, y_val = self._train_val_split(X_train, y_train, multioutput_train_size)
logger.info(
f"Split data using multioutput_train_size={multioutput_train_size}: "
f"train size={len(X_train)}, val size={len(X_val)}"
)
else:
logger.warning("multioutput_train_size is ignored because X_val and y_val are already provided.")

task.validate_data(
self,
self._state,
Expand Down
35 changes: 35 additions & 0 deletions test/automl/test_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,41 @@ def test_multioutput():
print(model.predict(X_test))


def test_multioutput_train_size():
    """Test multioutput_train_size parameter for manual validation set specification."""
    from sklearn.multioutput import MultiOutputRegressor

    # Build multi-output regression data: 100 rows, 10 features, 3 targets.
    X, y = make_regression(n_samples=100, n_features=10, n_targets=3, random_state=42)

    # Treat the first 70 rows as training data and the remaining 30 as validation.
    train_size = 70

    # Absolute sample count: AutoML splits the concatenated data itself.
    automl = AutoML(task="regression", time_budget=1, eval_method="holdout", multioutput_train_size=train_size)
    wrapper = MultiOutputRegressor(automl)
    wrapper.fit(X, y)

    predictions = wrapper.predict(X[:10])

    # One prediction row per input sample, one column per target.
    assert predictions.shape == (10, 3), f"Expected shape (10, 3), got {predictions.shape}"
    print(f"Predictions shape: {predictions.shape}")
    print(f"Sample predictions:\n{predictions[:3]}")

    # Fractional form: 0.7 of 100 rows is the same 70/30 split.
    automl_frac = AutoML(task="regression", time_budget=1, eval_method="holdout", multioutput_train_size=0.7)
    wrapper_frac = MultiOutputRegressor(automl_frac)
    wrapper_frac.fit(X, y)
    predictions2 = wrapper_frac.predict(X[:10])
    assert predictions2.shape == (10, 3), f"Expected shape (10, 3), got {predictions2.shape}"
    print("Model with float train_size also works correctly")


@pytest.mark.parametrize(
"estimator",
[
Expand Down
Loading