Skip to content
Open
17 changes: 17 additions & 0 deletions auto_create_timestamps_pr.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Auto-Create Timestamps in `prettify_prediction()` When `test_data` is None

## Why are these changes needed?

Currently, the `TimeSeriesDataset.prettify_prediction()` method in `flaml/automl/time_series/ts_data.py` fails when `test_data` is `None`: it raises a `ValueError` for `np.ndarray` input and a `NotImplementedError` for `pd.Series` and `pd.DataFrame` input.
This is frustrating for users who want to make predictions without providing explicit test data timestamps.

**This PR implements automatic timestamp generation** by:

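1. Using the training data's last timestamp as the anchor; the first generated timestamp is one period after it.
2. Generating one future timestamp per predicted row, spaced by the dataset's frequency (`self.frequency`).
3. Supporting `np.ndarray`, `pd.Series`, and `pd.DataFrame` inputs for `y_pred`.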

## Checks

- [x] Pre-commit linting (black, ruff).
- [x] Added regression tests demonstrating the fix.
16 changes: 13 additions & 3 deletions flaml/automl/time_series/ts_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,13 +243,23 @@ def prettify_prediction(self, y_pred: Union[pd.DataFrame, pd.Series, np.ndarray]
y_pred[self.time_col] = self.test_data[self.time_col]

else:
# Auto-create timestamps when test_data is None
if isinstance(y_pred, np.ndarray):
raise ValueError("Can't enrich np.ndarray as self.test_data is None")
y_pred = pd.DataFrame(data=y_pred, columns=self.target_names)
elif isinstance(y_pred, pd.Series):
assert len(self.target_names) == 1, "Not enough columns in y_pred"
y_pred = pd.DataFrame({self.target_names[0]: y_pred})
# TODO auto-create the timestamps for the time column instead of throwing
raise NotImplementedError("Need a non-None test_data for this to work, for now")

# Generate timestamps based on training data's end_date and frequency
train_end_date = self.train_data[self.time_col].max()
pred_timestamps = pd.date_range(
start=train_end_date,
periods=len(y_pred) + 1,
freq=self.frequency,
)[
1:
] # Skip the first timestamp (train_end_date itself)
y_pred[self.time_col] = pred_timestamps

assert isinstance(y_pred, pd.DataFrame)
assert self.time_col in y_pred.columns
Expand Down
118 changes: 118 additions & 0 deletions test/automl/test_forecast.py
Original file line number Diff line number Diff line change
Expand Up @@ -724,6 +724,124 @@ def test_log_training_metric_ts_models():
assert automl.best_estimator == estimator


def test_prettify_prediction_auto_timestamps_data_types():
    """Check that prettify_prediction() builds timestamps for every accepted y_pred type.

    Before this PR fix, calling prettify_prediction() with test_data=None raised:
    - ValueError for np.ndarray: "Can't enrich np.ndarray as self.test_data is None"
    - NotImplementedError for pd.Series/DataFrame: "Need a non-None test_data for this to work"

    Each of np.ndarray, pd.Series and pd.DataFrame must come back as a DataFrame
    with one row per predicted step whose first "date" is one daily step after
    the last training timestamp.
    """
    from flaml.automl.time_series import TimeSeriesDataset

    # Thirty daily observations; no test data is handed to the dataset.
    n_train = 30
    history = pd.DataFrame(
        {
            "date": pd.date_range(start="2023-01-01", periods=n_train, freq="D"),
            "value": np.random.randn(n_train),
        }
    )
    dataset = TimeSeriesDataset(history, time_col="date", target_names="value")
    assert len(dataset.test_data) == 0

    horizon = 5
    # A two-element range starting at the last training date; element [1] is
    # therefore the first timestamp a forecast should carry.
    last_train_date = history["date"].max()
    first_forecast_date = pd.date_range(start=last_train_date, periods=2, freq="D")[1]

    # One converter per supported container: ndarray (unchanged), Series, DataFrame.
    converters = (
        np.asarray,
        pd.Series,
        lambda values: pd.DataFrame({"value": values}),
    )
    for convert in converters:
        pretty = dataset.prettify_prediction(convert(np.random.randn(horizon)))
        assert isinstance(pretty, pd.DataFrame)
        assert len(pretty) == horizon
        assert pretty["date"].iloc[0] == first_forecast_date


def test_prettify_prediction_auto_timestamps_frequencies():
    """Check the generated timestamps for daily and month-start training data.

    Before this PR fix, prettify_prediction() raised for an np.ndarray input
    when test_data is None. Here an np.ndarray forecast is passed for a daily
    ("D") and a month-start ("MS") series, and the whole generated time column
    is compared against the expected date range.
    """
    from flaml.automl.time_series import TimeSeriesDataset

    horizon = 6

    # (first training timestamp, number of training rows, pandas frequency alias)
    scenarios = (
        ("2023-01-01", 30, "D"),
        ("2022-01-01", 24, "MS"),
    )
    for first_date, n_train, freq in scenarios:
        history = pd.DataFrame(
            {
                "date": pd.date_range(start=first_date, periods=n_train, freq=freq),
                "value": np.random.randn(n_train),
            }
        )
        dataset = TimeSeriesDataset(history, time_col="date", target_names="value")
        pretty = dataset.prettify_prediction(np.random.randn(horizon))

        # Build horizon + 1 stamps from the last training date and drop the
        # first one, leaving only the timestamps that follow the training data.
        last_train_date = history["date"].max()
        expected = pd.date_range(start=last_train_date, periods=horizon + 1, freq=freq)[1:]
        actual = pd.DatetimeIndex(pretty["date"])
        pd.testing.assert_index_equal(actual, expected, check_names=False)


def test_auto_timestamps_e2e(budget=3):
    """End-to-end check: fit a forecaster, then predict from a step count alone.

    Trains an ARIMA model through AutoML on a synthetic daily series and asks
    for a 10-step forecast without supplying any explicit test-data timestamps.

    Args:
        budget: value forwarded to AutoML.fit() as ``time_budget``.
    """
    # NOTE(review): a collaborator commented that this test already works
    # without the PR; a test that fails on the current release and is fixed by
    # the PR should be added.
    try:
        import statsmodels  # noqa: F401
    except ImportError:
        print("statsmodels not installed, skipping E2E test")
        return

    # Synthetic training series: a sine wave plus small Gaussian noise.
    n_obs = 100
    timestamps = pd.date_range(start="2020-01-01", periods=n_obs, freq="D")
    signal = np.sin(np.linspace(0, 10, n_obs))
    noise = np.random.randn(n_obs) * 0.1
    history = pd.DataFrame({"ds": timestamps, "y": signal + noise})

    # Fit with ARIMA as the only candidate estimator.
    fit_settings = {
        "dataframe": history,
        "label": "y",
        "period": 10,
        "task": "ts_forecast",
        "time_budget": budget,
        "estimator_list": ["arima"],
    }
    automl = AutoML()
    automl.fit(**fit_settings)

    # Predict from a number of steps only (no explicit test_data).
    horizon = 10
    y_pred = automl.predict(horizon)
    assert y_pred is not None
    assert len(y_pred) == horizon
    print("E2E test passed: model trained and predicted without explicit test_data!")


if __name__ == "__main__":
# test_forecast_automl(60)
# test_multivariate_forecast_num(5)
Expand Down