diff --git a/flaml/automl/automl.py b/flaml/automl/automl.py index cb3fe37857..0aaf7f8e5a 100644 --- a/flaml/automl/automl.py +++ b/flaml/automl/automl.py @@ -361,6 +361,11 @@ def custom_metric( } ``` mlflow_logging: boolean, default=True | Whether to log the training results to mlflow. Not valid if mlflow is not installed. + multioutput_train_size: int, float or None, default=None | For multi-output regression tasks with + "holdout" evaluation, allows manual specification of validation set by concatenating training and + validation data and specifying where to split. If int, represents the number of samples in the + training set. If float (between 0.0 and 1.0), represents the proportion of the dataset to include + in the training set. If None, no split is performed. Only used when X_val and y_val are not provided. """ if ERROR: @@ -419,6 +424,7 @@ def custom_metric( settings["custom_hp"] = settings.get("custom_hp", {}) settings["skip_transform"] = settings.get("skip_transform", False) settings["mlflow_logging"] = settings.get("mlflow_logging", True) + settings["multioutput_train_size"] = settings.get("multioutput_train_size", None) self._estimator_type = "classifier" if settings["task"] in CLASSIFICATION else "regressor" self.best_run_id = None @@ -1764,6 +1770,42 @@ def metric_constraints(self) -> list: """ return self._metric_constraints + def _train_val_split(self, X, y, train_size): + """Split concatenated training and validation data. 
+ + Args: + X: Combined training and validation features + y: Combined training and validation labels + train_size: int or float - if int, number of samples for training set; + if float, proportion of samples for training set + + Returns: + X_train, X_val, y_train, y_val + """ + n_samples = len(X) + + # Validate train_size parameter + if isinstance(train_size, float): + if not 0.0 < train_size < 1.0: + raise ValueError(f"train_size as a float must be between 0.0 and 1.0, got {train_size}") + train_size = int(n_samples * train_size) + elif isinstance(train_size, int): + if train_size <= 0 or train_size >= n_samples: + raise ValueError(f"train_size as an integer must be between 1 and {n_samples - 1}, got {train_size}") + else: + raise TypeError(f"train_size must be int or float, got {type(train_size).__name__}") + + # Guard against an empty train or validation set (a small float can floor to 0 above) + if train_size < 1 or train_size >= n_samples: + raise ValueError(f"train_size ({train_size}) must leave at least one sample in both the train and validation sets ({n_samples} samples total)") + + X_train = X[:train_size] + X_val = X[train_size:] + y_train = y[:train_size] + y_val = y[train_size:] + + return X_train, X_val, y_train, y_val + def _prepare_data(self, eval_method, split_ratio, n_splits): self._state.task.prepare_data( self._state, @@ -1837,6 +1879,7 @@ def fit( mlflow_logging=None, fit_kwargs_by_estimator=None, mlflow_exp_name=None, + multioutput_train_size=None, **fit_kwargs, ): """Find a model for a given task. @@ -2154,6 +2197,11 @@ def cv_score_agg_func(val_loss_folds, log_metrics_folds): } } ``` + multioutput_train_size: int, float or None, default=None | For multi-output regression tasks with + "holdout" evaluation, allows manual specification of validation set by concatenating training and + validation data and specifying where to split. If int, represents the number of samples in the + training set. If float (between 0.0 and 1.0), represents the proportion of the dataset to include + in the training set. If None, no split is performed. 
Only used when X_val and y_val are not provided. **fit_kwargs: Other key word arguments to pass to fit() function of the searched learners, such as sample_weight. Below are a few examples of @@ -2356,6 +2404,28 @@ def cv_score_agg_func(val_loss_folds, log_metrics_folds): self.mlflow_integration.only_history = True except KeyError: logger.info("Not in Fabric, Skipped") + + # Handle multioutput_train_size parameter + multioutput_train_size = ( + self._settings.get("multioutput_train_size") if multioutput_train_size is None else multioutput_train_size + ) + if multioutput_train_size is not None: + if X_train is not None and X_val is None and y_val is None: + # Warn if not using holdout evaluation + if eval_method not in ["auto", "holdout", None]: + logger.warning( + f"multioutput_train_size is intended for use with 'holdout' evaluation method, " + f"but eval_method={eval_method}. The split may be overridden during data preparation." + ) + # Split the concatenated training data into train and validation sets + X_train, X_val, y_train, y_val = self._train_val_split(X_train, y_train, multioutput_train_size) + logger.info( + f"Split data using multioutput_train_size={multioutput_train_size}: " + f"train size={len(X_train)}, val size={len(X_val)}" + ) + else: + logger.warning("multioutput_train_size is ignored because validation data or a dataframe input is already provided.") + task.validate_data( self, self._state, diff --git a/test/automl/test_regression.py b/test/automl/test_regression.py index c451618246..3f05c84015 100644 --- a/test/automl/test_regression.py +++ b/test/automl/test_regression.py @@ -244,6 +244,41 @@ def test_multioutput(): print(model.predict(X_test)) + +def test_multioutput_train_size(): + """Test multioutput_train_size parameter for manual validation set specification.""" + from sklearn.multioutput import MultiOutputRegressor + + # create multi-output regression data + X, y = make_regression(n_samples=100, n_features=10, n_targets=3, random_state=42) + + # Concatenate what would be training and 
validation data + # Simulate having 70 samples for training and 30 for validation + train_size = 70 + + # train the model using multioutput_train_size + model = MultiOutputRegressor( + AutoML(task="regression", time_budget=1, eval_method="holdout", multioutput_train_size=train_size) + ) + model.fit(X, y) + + # predict on a subset + predictions = model.predict(X[:10]) + + # Verify predictions have correct shape + assert predictions.shape == (10, 3), f"Expected shape (10, 3), got {predictions.shape}" + print(f"Predictions shape: {predictions.shape}") + print(f"Sample predictions:\n{predictions[:3]}") + + # Test with float train_size (proportion) + model2 = MultiOutputRegressor( + AutoML(task="regression", time_budget=1, eval_method="holdout", multioutput_train_size=0.7) + ) + model2.fit(X, y) + predictions2 = model2.predict(X[:10]) + assert predictions2.shape == (10, 3), f"Expected shape (10, 3), got {predictions2.shape}" + print("Model with float train_size also works correctly") + + @pytest.mark.parametrize( "estimator", [