diff --git a/.github/workflows/build_deploy_docs.yml b/.github/workflows/build_deploy_docs.yml new file mode 100644 index 000000000..ddaedfa59 --- /dev/null +++ b/.github/workflows/build_deploy_docs.yml @@ -0,0 +1,66 @@ +# Simple workflow for deploying static content to GitHub Pages generated by Github +# except for added job steps "Copy Static Files" through "Build MkDocs Pages". +name: Deploy static content to Pages + +on: + # Runs on pushes targeting the default branch + push: + branches: ["master"] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. +# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. +concurrency: + group: "pages" + cancel-in-progress: false + +jobs: + # Single deploy job since we're just deploying + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Setup Pages + uses: actions/configure-pages@v3 + - name: Copy Static Files + run: | + cp -R docs/website site/ + - name: Replace GITHUB token + # Use different sed delimiter to avoid clashing with forward slash in URL + run: | + find docs/ -type f -exec sed -i "s@GITHUB@${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}\/blob\/master@g" {} \; + - name: Replace WEBSITE token + # Use different sed delimiter to avoid clashing with forward slash in URL + run: | + WEBSITE_URL="https://${GITHUB_REPOSITORY_OWNER}.github.io/automlbenchmark" + find docs/ -type f -exec sed -i "s@WEBSITE@${WEBSITE_URL}@g" {} \; + sed -i "s@WEBSITE@${WEBSITE_URL}@g" mkdocs.yml + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + - name: Install MkDocs + run: | + python -m pip install mkdocs-material + - name: Build MkDocs Pages + run: | + mkdocs build --site-dir site/docs + - name: Upload artifact + uses: actions/upload-pages-artifact@v2 + with: + # Upload entire repository + path: './site/' + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v2 diff --git a/amlb/benchmark.py b/amlb/benchmark.py index 572dc91a3..b8c531208 100644 --- a/amlb/benchmark.py +++ b/amlb/benchmark.py @@ -379,7 +379,7 @@ def _is_task_enabled(task_def): class TaskConfig: - def __init__(self, name, openml_task_id, test_server, fold, metrics, seed, + def __init__(self, name, openml_task_id, test_server, fold, metrics, quantile_levels, seed, max_runtime_seconds, cores, max_mem_size_mb, min_vol_size_mb, input_dir, output_dir, tag, command, git_info, measure_inference_time: bool = False): self.framework = None @@ -404,6 +404,7 @@ def __init__(self, name, openml_task_id, test_server, fold, metrics, seed, self.git_info = git_info self.measure_inference_time = measure_inference_time self.ext = ns() # used if frameworks require extra config points + self.quantile_levels = list(sorted(quantile_levels)) def __setattr__(self, name, value): if name == 'metrics': @@ -477,9 +478,10 @@ def __init__(self, benchmark: Benchmark, task_def, fold): self.fold = fold self.task_config = TaskConfig( name=task_def.name, - openml_task_id=task_def.openml_task_id, + openml_task_id=task_def["openml_task_id"], fold=fold, metrics=task_def.metric, + quantile_levels=task_def.quantile_levels, 
seed=rget().seed(fold), max_runtime_seconds=task_def.max_runtime_seconds, cores=task_def.cores, diff --git a/amlb/datasets/file.py b/amlb/datasets/file.py index ee92d3b1a..f39e3ca7b 100644 --- a/amlb/datasets/file.py +++ b/amlb/datasets/file.py @@ -1,5 +1,6 @@ from abc import abstractmethod import logging +import math import os import re import tempfile @@ -33,17 +34,17 @@ def __init__(self, cache_dir=None): def load(self, dataset, fold=0): dataset = dataset if isinstance(dataset, ns) else ns(path=dataset) log.debug("Loading dataset %s", dataset) + target = dataset['target'] + type_ = dataset['type'] + features = dataset['features'] + + if type_ and DatasetType[type_] == DatasetType.timeseries: + return TimeSeriesDataset(path=dataset['path'], fold=fold, target=target, features=features, cache_dir=self._cache_dir, config=dataset) + paths = self._extract_train_test_paths(dataset.path if 'path' in dataset else dataset, fold=fold, name=dataset['name'] if 'name' in dataset else None) assert fold < len(paths['train']), f"No training dataset available for fold {fold} among dataset files {paths['train']}" - # seed = rget().seed(fold) - # if len(paths['test']) == 0: - # log.warning("No test file in the dataset, the train set will automatically be split 90%/10% using the given seed.") - # else: assert fold < len(paths['test']), f"No test dataset available for fold {fold} among dataset files {paths['test']}" - target = dataset['target'] - type_ = dataset['type'] - features = dataset['features'] ext = os.path.splitext(paths['train'][fold])[1].lower() train_path = paths['train'][fold] test_path = paths['test'][fold] if len(paths['test']) > 0 else None @@ -139,40 +140,6 @@ def __repr__(self): return repr_def(self) - def extend_dataset_with_timeseries_config(self, dataset, dataset_config): - dataset = deepcopy(dataset) - dataset_config = deepcopy(dataset_config) - if dataset_config['id_column'] is None: - log.warning("Warning: For timeseries task setting undefined `id_column` to `item_id`.") - dataset_config['id_column'] = "item_id" - if dataset_config['forecast_range_in_steps'] is None: - log.warning("Warning: For timeseries task setting undefined `forecast_range_in_steps` to `1`.") - dataset_config['forecast_range_in_steps'] = "1" - - dataset.timestamp_column=dataset_config['timestamp_column'] - dataset.id_column=dataset_config['id_column'] - dataset.forecast_range_in_steps=int(dataset_config['forecast_range_in_steps']) - - train_seqs_lengths = dataset.train.X.groupby(dataset.id_column).count() - test_seqs_lengths = dataset.test.X.groupby(dataset.id_column).count() - forecast_range_in_steps_mean_diff_train_test = int((test_seqs_lengths - train_seqs_lengths).mean()) - forecast_range_in_steps_max_min_train_test = int(min(int(test_seqs_lengths.min()), int(train_seqs_lengths.min()))) - 1 - if not dataset.forecast_range_in_steps == forecast_range_in_steps_mean_diff_train_test: - msg = f"Warning: Forecast range {dataset.forecast_range_in_steps}, does not equal mean difference between test and train sequence lengths {forecast_range_in_steps_mean_diff_train_test}." - log.warning(msg) - if not (test_seqs_lengths - train_seqs_lengths).var().item() == 0.: - msg = f"Error: Not all sequences of train and test set have same sequence length difference." 
- raise ValueError(msg) - if dataset.forecast_range_in_steps > forecast_range_in_steps_mean_diff_train_test: - msg = f"Error: Forecast range {dataset.forecast_range_in_steps} longer than difference between test and train sequence lengths {forecast_range_in_steps_mean_diff_train_test}." - raise ValueError(msg) - if dataset.forecast_range_in_steps > forecast_range_in_steps_max_min_train_test: - msg = f"Error: Forecast range {dataset.forecast_range_in_steps} longer than minimum sequence length + 1, {forecast_range_in_steps_max_min_train_test}." - raise ValueError(msg) - return dataset - - - class FileDataset(Dataset): def __init__(self, train: Datasplit, test: Datasplit, @@ -350,10 +317,88 @@ def __init__(self, train_path, test_path, # todo: handle auto-split (if test_path is None): requires loading the training set, split, save super().__init__(None, None, target=target, features=features, type=type) - self._train = CsvDatasplit(self, train_path, timestamp_column=timestamp_column) - self._test = CsvDatasplit(self, test_path, timestamp_column=timestamp_column) + self._train = CsvDatasplit(self, train_path) + self._test = CsvDatasplit(self, test_path) + self._dtypes = None + + +class TimeSeriesDataset(FileDataset): + def __init__(self, path, fold, target, features, cache_dir, config): + super().__init__(None, None, target=target, features=features, type="timeseries") + if config['forecast_horizon_in_steps'] is None: + raise AssertionError("Task definition for timeseries must include `forecast_horizon_in_steps`") + if config['freq'] is None: + raise AssertionError("Task definition for timeseries must include `freq`") + if config['seasonality'] is None: + raise AssertionError("Task definition for timeseries must include `seasonality`") + + full_data = read_csv(path) + if config['id_column'] is None: + log.warning("Warning: For timeseries task, setting undefined `id_column` to `item_id`") + config['id_column'] = 'item_id' + if config['id_column'] not in full_data.columns: + raise ValueError(f'The id_column with name {config["id_column"]} is missing from the dataset') + if config['timestamp_column'] is None: + log.warning("Warning: For timeseries task, setting undefined `timestamp_column` to `timestamp`") + config['timestamp_column'] = 'timestamp' + if config['timestamp_column'] not in full_data.columns: + raise ValueError(f'The timestamp_column with name {config["timestamp_column"]} is missing from the dataset') + + self.forecast_horizon_in_steps = int(config['forecast_horizon_in_steps']) + self.freq = pd.tseries.frequencies.to_offset(config['freq']).freqstr + self.seasonality = int(config['seasonality']) + self.id_column = config['id_column'] + self.timestamp_column = config['timestamp_column'] + + full_data[self.timestamp_column] = pd.to_datetime(full_data[self.timestamp_column]) + if config['name'] is not None: + file_name = config['name'] + else: + file_name = os.path.splitext(os.path.basename(path))[0] + save_dir = os.path.join(cache_dir, file_name, str(fold)) + train_path, test_path = self.save_train_and_test_splits(full_data, fold=fold, save_dir=save_dir) + + self._train = CsvDatasplit(self, train_path, timestamp_column=self.timestamp_column) + self._test = CsvDatasplit(self, test_path, timestamp_column=self.timestamp_column) self._dtypes = None + # Store repeated item_id & in-sample seasonal error for each time step in the forecast horizon - needed later for metrics like MASE. + # We need to store this information here because Result object has no access to past time series values. 
+ self.repeated_item_id = self.test.data[self.id_column].astype("category").cat.codes.to_numpy() + self.repeated_abs_seasonal_error = self.compute_seasonal_error() + + def save_train_and_test_splits(self, full_data, fold, save_dir): + full_data = full_data.sort_values(by=[self.id_column, self.timestamp_column]) + shortest_ts_length = full_data.groupby(self.id_column).size().min() + min_expected_ts_length = (fold + 1) * self.forecast_horizon_in_steps + 1 + if shortest_ts_length < min_expected_ts_length: + raise ValueError( + f'All time series in the dataset must have length > `(fold + 1) * forecast_horizon_in_steps` ' + f'(at least {min_expected_ts_length + 1}), but shortest time series has length {shortest_ts_length}' + ) + # Remove the last `steps_to_remove` steps from each time series to obtain the correct fold + if fold > 0: + steps_to_remove = (fold + 1) * self.forecast_horizon_in_steps + full_data = full_data.groupby(self.id_column, as_index=False).nth(slice(None, -steps_to_remove)) + train_data = full_data.groupby(self.id_column, as_index=False).nth(slice(None, -self.forecast_horizon_in_steps)) + test_data = full_data.groupby(self.id_column, as_index=False).nth(slice(-self.forecast_horizon_in_steps, None)) + + if not os.path.exists(save_dir): + os.makedirs(save_dir) + train_path = os.path.join(save_dir, "train.csv") + test_path = os.path.join(save_dir, "test.csv") + + train_data.to_csv(train_path, index=False) + test_data.to_csv(test_path, index=False) + return train_path, test_path + + def compute_seasonal_error(self): + train_data_with_index = self.train.data.set_index(self.id_column) + seasonal_diffs = train_data_with_index[self.target.name].groupby(level=self.id_column).diff(self.seasonality).abs() + abs_seasonal_error = seasonal_diffs.groupby(level=self.id_column).mean().fillna(1.0).values + # Repeat seasonal error for each time step in the forecast horizon + return np.repeat(abs_seasonal_error, self.forecast_horizon_in_steps) + class CsvDatasplit(FileDatasplit): @@ -396,8 +441,7 @@ def load_metadata(self): else 'string' if pat.is_string_dtype(dt) else 'datetime' if pat.is_datetime64_dtype(dt) else 'object') - features = [Feature(i, col, to_feature_type(dtypes[i])) - for i, col in enumerate(self._ds.columns)] + features = [Feature(i, col, to_feature_type(dtypes[i])) for i, col in enumerate(self._ds.columns)] for f in features: col = self._ds.iloc[:, f.index] diff --git a/amlb/datasets/openml.py b/amlb/datasets/openml.py index 678d1854d..e924774be 100644 --- a/amlb/datasets/openml.py +++ b/amlb/datasets/openml.py @@ -25,6 +25,12 @@ from ..utils import as_list, lazy_property, path_from_split, profile, split_path, unsparsify +# https://github.com/openml/automlbenchmark/pull/574#issuecomment-1646179921 +try: + set_openml_cache = oml.config.set_cache_directory +except AttributeError: + set_openml_cache = oml.config.set_root_cache_directory + log = logging.getLogger(__name__) # hack (only adding a ? to the regexp pattern) to ensure that '?' values remain quoted when we save dataplits in arff format. @@ -39,7 +45,7 @@ class OpenmlLoader: def __init__(self, api_key, cache_dir=None): oml.config.apikey = api_key if cache_dir: - oml.config.set_cache_directory(cache_dir) + set_openml_cache(cache_dir) if oml.config.retry_policy != "robot": log.debug("Setting openml retry_policy from '%s' to 'robot'." 
% oml.config.retry_policy) diff --git a/amlb/datautils.py b/amlb/datautils.py index a002a236a..d8a24d2ef 100644 --- a/amlb/datautils.py +++ b/amlb/datautils.py @@ -37,19 +37,21 @@ def read_csv(path, nrows=None, header=True, index=False, as_data_frame=True, dty :param header: if the columns header should be read. :param as_data_frame: if the result should be returned as a data frame (default) or a numpy array. :param dtype: data type for columns. - :param timestamp_column: column name for timestamp, to ensure dates are correctly parsed by pandas. + :param timestamp_column: name of the column that should be parsed as date. :return: a DataFrame """ - if dtype is not None and timestamp_column is not None and timestamp_column in dtype: - dtype = dtype.copy() # to avoid outer context manipulation - del dtype[timestamp_column] - + if timestamp_column is None: + parse_dates = None + else: + if dtype is not None: + dtype.pop(timestamp_column, None) + parse_dates = [timestamp_column] df = pd.read_csv(path, nrows=nrows, header=0 if header else None, index_col=0 if index else None, dtype=dtype, - parse_dates=[timestamp_column] if timestamp_column is not None else None) + parse_dates=parse_dates) return df if as_data_frame else df.values diff --git a/amlb/defaults.py b/amlb/defaults.py index 6d0bf35c5..3031be71b 100644 --- a/amlb/defaults.py +++ b/amlb/defaults.py @@ -1,9 +1,15 @@ import pathlib -from openml.config import cache_directory +import openml from amlb.utils import Namespace as ns +# https://github.com/openml/automlbenchmark/pull/574#issuecomment-1646179921 +try: + cache_directory = openml.config.cache_directory +except AttributeError: + cache_directory = openml.config.get_cache_directory() + default_dirs = ns( input_dir=cache_directory, output_dir=str(pathlib.Path(__file__).parent.parent / "results"), diff --git a/amlb/resources.py b/amlb/resources.py index 808a7f954..f3667b891 100644 --- a/amlb/resources.py +++ b/amlb/resources.py @@ -210,7 +210,7 @@ def _validate_task(self, task, lenient=False): if not lenient and len(missing) > 0: raise ValueError("{missing} mandatory properties as missing in task definition {taskdef}.".format(missing=missing, taskdef=task)) - for conf in ['max_runtime_seconds', 'cores', 'folds', 'max_mem_size_mb', 'min_vol_size_mb']: + for conf in ['max_runtime_seconds', 'cores', 'folds', 'max_mem_size_mb', 'min_vol_size_mb', 'quantile_levels']: if task[conf] is None: task[conf] = self.config.benchmarks.defaults[conf] log.debug("Config `{config}` not set for task {name}, using default `{value}`.".format(config=conf, name=task.name, value=task[conf])) @@ -310,4 +310,3 @@ def output_dirs(root, session=None, subdirs=None, create=False): TransformRule(from_key='aws.query_frequency_seconds', to_key='aws.query_interval_seconds'), TransformRule(from_key='aws.ec2.monitoring.cpu.query_frequency_seconds', to_key='aws.ec2.monitoring.cpu.query_interval_seconds'), ] - diff --git a/amlb/results.py b/amlb/results.py index 6e1a5bc60..cdaa56725 100644 --- a/amlb/results.py +++ b/amlb/results.py @@ -2,7 +2,6 @@ **results** module provides the logic to format, save and read predictions generated by the *automl frameworks* (cf. ``TaskResult``), as well as logic to compute, format, save, read and merge scores obtained from those predictions (cf. ``Result`` and ``Scoreboard``). 
""" -from functools import partial import collections import io import logging @@ -16,6 +15,7 @@ from numpy import nan, sort import pandas as pd import scipy as sci +import scipy.sparse from .data import Dataset, DatasetType, Feature from .datautils import accuracy_score, auc, average_precision_score, balanced_accuracy_score, confusion_matrix, fbeta_score, log_loss, \ @@ -244,7 +244,7 @@ def load_predictions(predictions_file): if rconfig().test_mode: TaskResult.validate_predictions(df) - if 'y_past_period_error' in df.columns: + if 'repeated_item_id' in df.columns: return TimeSeriesResult(df) else: if df.shape[1] > 2: @@ -293,6 +293,8 @@ def save_predictions(dataset: Dataset, output_file: str, predictions = predictions.squeeze() if isinstance(predictions, S): predictions = predictions.values + if scipy.sparse.issparse(truth) and truth.shape[1] == 1: + truth = pd.DataFrame(truth.todense()) if isinstance(truth, DF): truth = truth.squeeze() if isinstance(truth, S): @@ -750,6 +752,108 @@ def ncrps(self): weighted_losses = quantile_losses.sum(0) / denom # shape [num_quantiles] return weighted_losses.mean() +class TimeSeriesResult(RegressionResult): + def __init__(self, predictions_df, info=None): + super().__init__(predictions_df, info) + required_columns = {'truth', 'predictions', 'repeated_item_id', 'repeated_abs_seasonal_error'} + if required_columns - set(self.df.columns): + raise ValueError(f'Missing columns for calculating time series metrics: {required_columns - set(self.df.columns)}.') + + quantile_columns = [column for column in self.df.columns if column.startswith('0.')] + unrecognized_columns = [column for column in self.df.columns if column not in required_columns and column not in quantile_columns] + if len(unrecognized_columns) > 0: + raise ValueError(f'Predictions contain unrecognized columns: {unrecognized_columns}.') + + self.type = DatasetType.timeseries + self.truth = self.df['truth'].values.astype(float) + self.item_ids = self.df['repeated_item_id'].values + self.abs_seasonal_error = self.df['repeated_abs_seasonal_error'].values.astype(float) + # predictions = point forecast, quantile_predictions = quantile forecast + self.predictions = self.df['predictions'].values.astype(float) + self.quantile_predictions = self.df[quantile_columns].values.astype(float) + self.quantile_levels = np.array(quantile_columns, dtype=float) + + if (~np.isfinite(self.predictions)).any() or (~np.isfinite(self.quantile_predictions)).any(): + raise ValueError('Predictions contain NaN or Inf values') + + _, unique_item_ids_counts = np.unique(self.item_ids, return_counts=True) + if len(set(unique_item_ids_counts)) != 1: + raise ValueError(f'Error: Predicted sequences have different lengths {unique_item_ids_counts}.') + + def _itemwise_mean(self, values): + """Compute mean for each time series.""" + return pd.Series(values).groupby(self.item_ids, sort=False).mean().values + + def _safemean(self, values): + """Compute mean, while ignoring nan, +inf, -inf values.""" + return np.mean(values[np.isfinite(values)]) + + @metric(higher_is_better=False) + def smape(self): + """Symmetric Mean Absolute Percentage Error""" + num = np.abs(self.truth - self.predictions) + denom = (np.abs(self.truth) + np.abs(self.predictions)) / 2 + return self._safemean(num / denom) + + @metric(higher_is_better=False) + def mape(self): + """Mean Absolute Percentage Error""" + num = np.abs(self.truth - self.predictions) + denom = np.abs(self.truth) + return self._safemean(num / denom) + + @metric(higher_is_better=False) + def 
wape(self): + """Weighted Average Percentage Error""" + return np.sum(np.abs(self.truth - self.predictions)) / np.sum(np.abs(self.truth)) + + @metric(higher_is_better=False) + def mase(self): + """Mean Absolute Scaled Error + + Error for each item is normalized by the in-sample error of the naive forecaster. + This makes scores comparable across different items. + """ + error = np.abs(self.truth - self.predictions) + error_per_item = self._itemwise_mean(error / self.abs_seasonal_error) + return self._safemean(error_per_item) + + def _quantile_loss_per_step(self): + # Array of shape [len(self.predictions), len(self.quantile_levels)] + return 2 * np.abs( + (self.quantile_predictions - self.truth[:, None]) + * ((self.quantile_predictions >= self.truth[:, None]) - self.quantile_levels) + ) + + @metric(higher_is_better=False) + def mql(self): + """Quantile Loss, also known as Pinball Loss, averaged across all quantile levels & time steps. + + Equivalent to the Weighted Interval Score if the quantile_levels are symmetric around 0.5 + + Approximates the Continuous Ranked Probability Score + """ + return np.mean(self._quantile_loss_per_step()) + + @metric(higher_is_better=False) + def wql(self): + """Weighted Quantile Loss. + + Defined as total quantile loss normalized by the total abs value of target time series. + """ + return self._quantile_loss_per_step().mean(axis=1).sum() / np.sum(np.abs(self.truth)) + + @metric(higher_is_better=False) + def sql(self): + """Scaled Quantile Loss, also known as Scaled Pinball Loss. + + Similar to MASE, the quantile loss for each item is normalized by the in-sample error of the naive forecaster. + This makes scores comparable across different items. + """ + pl_per_item = self._itemwise_mean(self._quantile_loss_per_step().mean(axis=1) / self.abs_seasonal_error) + return self._safemean(pl_per_item) + + _encode_predictions_and_truth_ = False save_predictions = TaskResult.save_predictions diff --git a/amlb/runners/aws.py b/amlb/runners/aws.py index 6aae9cff1..221fa7fb8 100644 --- a/amlb/runners/aws.py +++ b/amlb/runners/aws.py @@ -1314,4 +1314,3 @@ def _download_resources(self): def _upload_results(self): pass - diff --git a/amlb/utils/config.py b/amlb/utils/config.py index 59bf7db30..5bf80412e 100644 --- a/amlb/utils/config.py +++ b/amlb/utils/config.py @@ -1,4 +1,4 @@ -from collections import namedtuple +from __future__ import annotations from copy import deepcopy from dataclasses import dataclass from importlib.util import find_spec @@ -59,19 +59,15 @@ def config_load(path, verbose=False): return loader(file, as_namespace=True) -# TransformRule = namedtuple('TransformRule', -# ['from_key', 'to_key', 'fn', 'keep_from'], -# defaults=[None, identity, False], -# module=__name__) @dataclass class TransformRule: from_key: Union[str, List[str]] - to_key: str = None + to_key: str | None = None # if not provided, used for transformations on same key fn: Callable = identity keep_from: bool = False -def transform_config(config: Namespace, transform_rules: [TransformRule], inplace=True) -> Namespace: +def transform_config(config: Namespace, transform_rules: list[TransformRule], inplace=True) -> Namespace: """ Allows to modify a configuration namespace (for example if the configuration format is modified) by applying a list of transformation rules. 
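
> Reviewer note (not part of the patch): the pinball-loss formula implemented by `TimeSeriesResult._quantile_loss_per_step` in the `amlb/results.py` hunk above is easy to sanity-check on toy data. The sketch below is a standalone illustration with made-up arrays; it simply mirrors the same expressions used for the `mql` and `wql` metrics.

```python
# Toy re-computation of the quantile (pinball) loss used by TimeSeriesResult.
# Arrays are invented for illustration only; they are not amlb data structures.
import numpy as np

truth = np.array([10.0, 12.0, 9.0])            # actual values, one per forecast step
quantile_levels = np.array([0.1, 0.5, 0.9])    # as configured via `quantile_levels`
quantile_predictions = np.array([              # shape [num_steps, num_quantiles]
    [8.0, 10.5, 13.0],
    [9.0, 11.5, 14.0],
    [7.0,  9.5, 12.0],
])

# Same formula as in the diff: 2 * |(q_hat - y) * (1{q_hat >= y} - tau)|
per_step_loss = 2 * np.abs(
    (quantile_predictions - truth[:, None])
    * ((quantile_predictions >= truth[:, None]) - quantile_levels)
)

mql = per_step_loss.mean()                                      # mean quantile loss
wql = per_step_loss.mean(axis=1).sum() / np.abs(truth).sum()    # weighted quantile loss
print(f"mql={mql:.4f}, wql={wql:.4f}")
```
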
diff --git a/amlb/utils/process.py b/amlb/utils/process.py index d50d8615d..8849da05a 100644 --- a/amlb/utils/process.py +++ b/amlb/utils/process.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import gc from concurrent.futures import ThreadPoolExecutor from contextlib import contextmanager @@ -18,7 +20,7 @@ import threading import _thread import traceback -from typing import Dict, List, Union, Tuple +from typing import Dict, List, Union, Tuple, cast import psutil @@ -118,7 +120,7 @@ def live_output_windows(process: subprocess.Popen, **_) -> Tuple[str, str]: queue=queue.Queue(), lines=[], ), - ) + ) # type: ignore # no reasonable type annotation, should refactor def forward_output(stream, queue_): if isinstance(stream, io.TextIOWrapper): @@ -135,12 +137,14 @@ def forward_output(stream, queue_): for output in outputs.values(): while True: try: - line = output["queue"].get(timeout=0.5) - output["lines"].append(line) + line = cast(queue.Queue, output["queue"]).get(timeout=0.5) + cast(list[str], output["lines"]).append(line) print(line.rstrip()) except queue.Empty: break - return ''.join(outputs["out"]["lines"]), ''.join(outputs["err"]["lines"]) + stdout = ''.join(cast(list[str], outputs["out"]["lines"])) + stderr = ''.join(cast(list[str], outputs["err"]["lines"])) + return stdout, stderr def live_output_unix(process, input=None, timeout=None, activity_timeout=None, mode='line', **_): @@ -448,7 +452,7 @@ class InterruptTimeout(Timeout): def __init__(self, timeout_secs, message=None, log_level=logging.WARNING, interrupt='thread', sig=signal.SIGINT, id=None, - interruptions: Union[Dict, List[Dict]] = None, wait_retry_secs=1, + interruptions: Union[Dict, List[Dict]] | None = None, wait_retry_secs=1, before_interrupt=None): def interruption(): inter_iter = iter(self._interruptions) diff --git a/amlb/utils/serialization.py b/amlb/utils/serialization.py index dffa4c53c..65f0c817d 100644 --- a/amlb/utils/serialization.py +++ b/amlb/utils/serialization.py @@ -23,7 +23,7 @@ def _import_data_libraries(): except ImportError: pd = None try: - import scipy.sparse as sp + import scipy.sparse as sp # type: ignore # https://github.com/scipy/scipy/issues/17158 except ImportError: sp = None return np, pd, sp diff --git a/amlb/utils/time.py b/amlb/utils/time.py index 416688bcc..cbb409f89 100644 --- a/amlb/utils/time.py +++ b/amlb/utils/time.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import datetime as dt import logging import math @@ -41,8 +43,8 @@ def datetime_iso(datetime=None, date=True, time=True, micros=False, date_sep='-' return datetime.strftime(strf) -def countdown(timeout_secs, on_timeout: Callable = None, message: str = None, interval=1, log_level=logging.INFO, - interrupt_event: threading.Event = None, interrupt_cond: Callable = None): +def countdown(timeout_secs, on_timeout: Callable | None = None, message: str = "", interval=1, log_level=logging.INFO, + interrupt_event: threading.Event | None = None, interrupt_cond: Callable | None = None): timeout_epoch = time.time() + timeout_secs remaining = timeout_secs interrupt = interrupt_event or threading.Event() diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index 491eeb627..000000000 --- a/docs/README.md +++ /dev/null @@ -1,399 +0,0 @@ -# OpenML AutoML Benchmark - -The OpenML AutoML Benchmark provides a framework for evaluating and comparing open-source AutoML systems. 
The system is *extensible* because you can [add your own](https://github.com/openml/automlbenchmark/blob/master/docs/extending.md) AutoML frameworks and datasets. For a thorough explanation of the benchmark, and evaluation of results, you can read our [paper](https://openml.github.io/automlbenchmark/paper.html) which was accepted at the [2019 ICML AutoML Workshop](https://sites.google.com/view/automl2019icml/). - -_**NOTE:**_ _This benchmarking framework currently features binary and multiclass classification; extending to regression is a work in progress. Please file an issue with any concerns/questions._ - - * [Installation](#installation) - * [Pre-requisites](#pre-requisites) - * [Setup](#setup) - * [Quickstart](#quickstart) - * [Running benchmarks](#running-benchmarks) - * [In Docker image](#in-docker-image) - * [In local environment](#in-local-environment) - * [On AWS](#on-aws) - * [Output](#output) - * [Advanced configuration](#advanced-configuration) - * [Issues](#issues) - * [Frequently Asked Questions](#frequently-asked-questions) - -Automatic Machine Learning (AutoML) systems automatically build machine learning pipelines or neural architectures in a data-driven, objective, and automatic way. They automate a lot of drudge work in designing machine learning systems, so that better systems can be developed, faster. However, AutoML research is also slowed down by two factors: - -* We currently lack standardized, easily-accessible benchmarking suites of tasks (datasets) that are curated to reflect important problem domains, practical to use, and sufficiently challenging to support a rigorous analysis of performance results. - -* Subtle differences in the problem definition, such as the design of the hyperparameter search space or the way time budgets are defined, can drastically alter a task’s difficulty. This issue makes it difficult to reproduce published research and compare results from different papers. - -This toolkit aims to address these problems by setting up standardized environments for in-depth experimentation with a wide range of AutoML systems. - -Documentation: - -### Features: -* Curated suites of [benchmarking datasets](https://openml.github.io/automlbenchmark/benchmark_datasets.html) from [OpenML](https://www.openml.org/s/218/data). -* Includes code to benchmark a number of [popular AutoML systems](https://openml.github.io/automlbenchmark/automl_overview.html) on regression and classification tasks. -* [New AutoML systems can be added](./HOWTO.md#add-an-automl-framework) -* Experiments can be run in Docker or Singularity containers -* Execute experiments locally or on AWS (see below) - - -## Installation -### Pre-requisites -To run the benchmarks, you will need: -* Python 3.9+. -* PIP3: ensure you have a recent version. If necessary, upgrade your pip using `python -m pip install -U pip`. -* The Python libraries listed in [requirements.txt](../requirements.txt): it is strongly recommended to first create a [Python virtual environment](https://docs.python.org/3/library/venv.html#venv-def) (cf. also [Pyenv](https://github.com/pyenv/pyenv): quick install using `curl https://pyenv.run | bash` or `brew install pyenv`) and work in it if you don't want to mess up your global Python environment. -* [Docker](https://docs.docker.com/install/), if you plan to run the benchmarks in a container. 
- -### Setup -Clone the repo (in development environment, you should of course remove the `--depth 1` argument): -```bash -git clone https://github.com/openml/automlbenchmark.git --branch stable --depth 1 -cd automlbenchmark -``` -Optional: create a Python3 virtual environment. - -- _**NOTE**: we don't recommend creating your virtual environment with `virtualenv` library here as the application may create additional virtual environments for some frameworks to run in isolation._ -_Those virtual environments are created internally using `python -m venv` and we encountered issues with `pip` when `venv` is used on top of a `virtualenv` environment._ -_Therefore, we rather suggest one of the method below:_ - -using venv on Linux/macOS: -```bash -python3 -m venv ./venv -source venv/bin/activate -# remember to call `deactivate` once you're done using the application -``` -using venv on Windows: -```bash -python3 -m venv ./venv -venv\Scripts\activate -# remember to call `venv\Scripts\deactivate` once you're done using the application -``` - -or using pyenv: -```bash -pyenv install {python_version: 3.9.16} -pyenv virtualenv ve-automl -pyenv local ve-automl -``` -Then pip install the dependencies: - -```bash -python -m pip install -r requirements.txt -``` - -- _**NOTE**: in case of issues when installing Python requirements, you may want to try the following:_ - - _on some platforms, we need to ensure that requirements are installed sequentially:_ `xargs -L 1 python -m pip install < requirements.txt`. - - _enforce the `python -m pip` version above in your virtualenv:_ `python -m pip install --upgrade pip==19.3.1`. - -## Quickstart -To run a benchmark call the `runbenchmark.py` script with at least the following arguments: - -1. The AutoML framework that should be evaluated, see [frameworks.yaml](../resources/frameworks.yaml) for supported frameworks. If you want to add a framework see [HOWTO](./HOWTO.md#add-an-automl-framework). -2. The benchmark suite to run should be one implemented in [benchmarks folder](../resources/benchmarks), or an OpenML study or task (formatted as `openml/s/X` or `openml/t/Y` respectively). -3. (Optional) The constraints applied to the benchmark as defined by default in [constraints.yaml](../resources/constraints.yaml). Default constraint is `test` (2 folds for 10 min each). -4. (Optional) If the benchmark should be run `local` (default, tested on Linux and macOS only), in a `docker` container or on `aws` using multiple ec2 instances. - -Examples: -```bash -python3 runbenchmark.py -python3 runbenchmark.py constantpredictor -python3 runbenchmark.py tpot test -python3 runbenchmark.py autosklearn openml/t/59 -m docker -python3 runbenchmark.py h2oautoml validation 1h4c -m aws -python3 runbenchmark.py autogluon:latest validation -python3 runbenchmark.py tpot:2020Q2 -``` - -For the complete list of supported arguments, run: -```bash -python3 runbenchmark.py --help -``` - -```text -usage: runbenchmark.py [-h] [-m {local,aws,docker,singularity}] - [-t [task_id [task_id ...]]] - [-f [fold_num ...]] [-i input_dir] - [-o output_dir] [-u user_dir] [-p parallel_jobs] - [-s {auto,skip,force,only}] [-k [true|false]] - [-e] [--logging LOGGING] - [--openml-run-tag OPENML_RUN_TAG] - framework [benchmark] [constraint] - -positional arguments: - framework The framework to evaluate as defined by default in resources/frameworks.yaml. - To use a labelled framework (i.e. a framework defined in resources/frameworks-{label}.yaml), - use the syntax {framework}:{label}. 
- benchmark The benchmark type to run as defined by default in resources/benchmarks/{benchmark}.yaml, - a path to a benchmark description file, or an openml suite or task. - OpenML references should be formatted as 'openml/s/X' and 'openml/t/Y', - for studies and tasks respectively. Use 'test.openml/s/X' for the - OpenML test server. - (default: 'test') - constraint The constraint definition to use as defined by default in resources/constraints.yaml. - (default: 'test') - -optional arguments: - -h, --help show this help message and exit - -m {local,aws,docker,singularity}, --mode {local,aws,docker,singularity} - The mode that specifies how/where the benchmark tasks will be running. - (default: 'local') - -t [task_id ...], --task [task_id ...] - The specific task name (as defined in the benchmark file) to run. - When an OpenML reference is used as benchmark, the dataset name should be used instead. - If not provided, then all tasks from the benchmark will be run. - -f [fold_num ...], --fold [fold_num ...] - If task is provided, the specific fold(s) to run. - If fold is not provided, then all folds from the task definition will be run. - -i input_dir, --indir input_dir - Folder from where the datasets are loaded by default. - (default: '~/.openml') - -o output_dir, --outdir output_dir - Folder where all the outputs should be written.(default: './results') - -u user_dir, --userdir user_dir - Folder where all the customizations are stored.(default: '~/.config/automlbenchmark') - -p parallel_jobs, --parallel parallel_jobs - The number of jobs (i.e. tasks or folds) that can run in parallel. - A hard limit is defined by property `job_scheduler.max_parallel_jobs` - in `resources/config.yaml`. - Override this limit in your custom `config.yaml` file if needed. - Supported only in aws mode or container mode (docker, singularity). - (default: 1) - -s {auto,skip,force,only}, --setup {auto,skip,force,only} - Framework/platform setup mode. Available values are: - • auto: setup is executed only if strictly necessary. - • skip: setup is skipped. - • force: setup is always executed before the benchmark. - • only: only setup is executed (no benchmark). - (default: 'auto') - -k [true|false], --keep-scores [true|false] - Set to true (default) to save/add scores in output directory. - -e, --exit-on-error If set, terminates on the first task that does not complete with a model. - --logging LOGGING Set the log levels for the 3 available loggers: - • console - • app: for the log file including only logs from amlb (.log extension). - • root: for the log file including logs from libraries (.full.log extension). - Accepted values for each logger are: notset, debug, info, warning, error, fatal, critical. - Examples: - --logging=info (applies the same level to all loggers) - --logging=root:debug (keeps defaults for non-specified loggers) - --logging=console:warning,app:info - (default: 'console:info,app:debug,root:info') - --openml-run-tag OPENML_RUN_TAG - Tag that will be saved in metadata and OpenML runs created during upload, must match '([a-zA-Z0-9_\-\.])+'. -``` - -The script will produce output that records task metadata and the result. -The result is the score on the test set, where the score is a specific model performance metric (e.g. "AUC") defined by the benchmark. 
-```text - task framework fold result mode version utc acc auc logloss -0 iris H2OAutoML 0 1.000000 local 3.22.0.5 2019-01-21T15:19:07 1.000000 NaN 0.023511 -1 iris H2OAutoML 1 1.000000 local 3.22.0.5 2019-01-21T15:20:12 1.000000 NaN 0.091685 -2 kc2 H2OAutoML 0 0.811321 local 3.22.0.5 2019-01-21T15:21:11 0.811321 0.859307 NaN -3 kc2 H2OAutoML 1 0.886792 local 3.22.0.5 2019-01-21T15:22:12 0.886792 0.888528 NaN -``` - -## Running benchmarks -The `automlbenchmark` app currently allows running benchmarks in various environments: -* in a docker container (running locally or on multiple AWS instances). -* completely locally, if the framework is supported on the local system. -* on AWS, possibly distributing the tasks to multiple EC2 instances, each of them running the benchmark either locally or in a docker container. - -### In Docker image -The [Docker] image is automatically built before running the benchmark if it doesn't already exist locally or in a public repository (by default in ). -Especially, without docker image, the application will need to download and install all the dependencies when building the image, so this may take some time. - -The generated image is usually named `automlbenchmark/{framework}:{tag}`, but this is customizable per framework: cf. `resources/frameworks.yaml` and [HOWTO](HOWTO.md#framework-definition) for details. - -For example, this will build a Docker image for the `RandomForest` framework and then immediately start a container to run the `validation` benchmark, using all folds, allocating 1h and 4 cores for each task: -```bash -python3 runbenchmark.py RandomForest validation 1h4c -m docker -``` - -If the corresponding image already exists locally and you want it to be rebuilt before running the benchmark, then the setup needs to be forced: -```bash -python3 runbenchmark.py {framework} {benchmark} {constraint} -m docker -s force -``` - -The image can also be built without running any benchmark: -```bash -python3 runbenchmark.py {framework} -m docker -s only -``` - -In rare cases, mainly for development, you may want to specify the docker image: -```bash -python3 runbenchmark.py {framework} {benchmark} {constraint} -m docker -Xdocker.image={image} -``` - -### In local environment -If docker allows portability, it is still possible to run the benchmarks locally without container on some environments (currently Linux, and macOS for most frameworks). - -A minimal example would be to run the test benchmarks with a random forest: -```bash -python3 runbenchmark.py RandomForest test -``` - -The majority of frameworks though require a `setup` step before being able to run a benchmark. Please note that this step may take some time depending on the framework. -This setup is executed by default on first run of the framework, but in this case, it is not guaranteed that the benchmark run following immediately will manage to complete successfully (for most frameworks though, it does). - -In case of error, just run the benchmark one more time. - -If it still fails, you may need to rerun the setup step manually: -```bash -python3 runbenchmark.py {framework} -s only -``` -You can then run the benchmarks as many times as you wish. - -When testing a framework or a new dataset, you may want to run only a single task and a specific fold, for example: -```bash -python3 runbenchmark.py TPOT validation -t bioresponse -f 0 -``` - -### On AWS -To run a benchmark on AWS you additionally need to have a configured AWS account. 
-The application is using the [boto3] Python package to exchange files through S3 and create EC2 instances. - - If this is your first time setting up your AWS account on the machine that will run the `automlbenchmark` app, you can use the [AWS CLI](http://aws.amazon.com/cli/) tool and run: - ```bash - aws configure - ``` -You will need your AWS Access Key ID, AWS Secret Access Key, and pick a default [EC2 region](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-regions-availability-zones.html#concepts-available-regions). - -- _**NOTE:** Currently the AMI is only configured for the following regions so you'll have to set your default region as one of these_: - - us-east-1 - - us-west-1 - - eu-west-1 - - eu-central-1 - -On first use, it is recommended to simply copy the `config.yaml` from [examples/aws] to your user `~/.config/automlbenchmark` folder (or merge it if you already have a `config.yaml` in this user folder) and follow the instructions in that file. - -To run a test to see if the benchmark framework is working on AWS, do the following: -```bash -python3 runbenchmark.py constantpredictor test -m aws -``` -This will create and start an EC2 instance for each benchmark job and run the 4 jobs (2 OpenML tasks * 2 folds) from the `test` benchmark sequentially, each job running for 1mn in this case (excluding setup time for the EC2 instances). - -For longer benchmarks, you'll probably want to run multiple jobs in parallel and distribute the work to several EC2 instances, for example: -```bash -python3 runbenchmark.py AUTOWEKA validation 1h4c -m aws -p 4 -``` -will keep 4 EC2 instances running, monitor them in a dedicated thread, and finally collect all outputs from s3. - -- _**NOTE**: each EC2 instance is provided with a time limit at startup to ensure that in any case, the instance is stopped even if there is an issue when running the benchmark task. In this case the instance is stopped, not terminated, and we can therefore inspect the machine manually (ideally after resetting its UserData field to avoid re-triggering the benchmark on the next startup)._ - -The console output is still showing the instances starting, outputs the progress and then the results for each dataset/fold combination: -```text -Running `H2OAutoML_nightly` on `validation` benchmarks in `aws` mode -Loading frameworks definitions from ['/Users/me/repos/automlbenchmark/resources/frameworks.yaml']. -Loading benchmark definitions from /Users/me/repos/automlbenchmark/resources/benchmarks/validationt.yaml. -Uploading `/Users/me/repos/automlbenchmark/resources/benchmarks/validation.yaml` to `ec2/input/validation.yaml` on s3 bucket automl-benchmark. -... -Starting new EC2 instance with params: H2OAutoML_nightly /s3bucket/input/validation.yaml -t micro-mass -f 0 -Started EC2 instance i-0cd081efc97c3bf6f -[2019-01-22T11:51:32] checking job aws_validation_micro-mass_0_H2OAutoML_nightly on instance i-0cd081efc97c3bf6f: pending -Starting new EC2 instance with params: H2OAutoML_nightly /s3bucket/input/validation.yaml -t micro-mass -f 1 -Started EC2 instance i-0251c1655e286897c -... 
-[2019-01-22T12:00:32] checking job aws_validation_micro-mass_1_H2OAutoML_nightly on instance i-0251c1655e286897c: running -[2019-01-22T12:00:33] checking job aws_validation_micro-mass_0_H2OAutoML_nightly on instance i-0cd081efc97c3bf6f: running -[2019-01-22T12:00:48] checking job aws_validation_micro-mass_1_H2OAutoML_nightly on instance i-0251c1655e286897c: running -[2019-01-22T12:00:48] checking job aws_validation_micro-mass_0_H2OAutoML_nightly on instance i-0cd081efc97c3bf6f: running -... -[ 731.511738] cloud-init[1521]: Predictions saved to /s3bucket/output/predictions/h2oautoml_nightly_micro-mass_0.csv -[ 731.512132] cloud-init[1521]: H2O session _sid_96e7 closed. -[ 731.512506] cloud-init[1521]: Loading predictions from /s3bucket/output/predictions/h2oautoml_nightly_micro-mass_0.csv -[ 731.512890] cloud-init[1521]: Metric scores: {'framework': 'H2OAutoML_nightly', 'version': 'nightly', 'task': 'micro-mass', 'fold': 0, 'mode': 'local', 'utc': '2019-01-22T12:00:02', 'logloss': 0.6498889633819804, 'acc': 0.8793103448275862, 'result': 0.6498889633819804} -[ 731.513275] cloud-init[1521]: Job local_micro-mass_0_H2OAutoML_nightly executed in 608.534 seconds -[ 731.513662] cloud-init[1521]: All jobs executed in 608.534 seconds -[ 731.514089] cloud-init[1521]: Scores saved to /s3bucket/output/scores/H2OAutoML_nightly_task_micro-mass.csv -[ 731.514542] cloud-init[1521]: Loaded scores from /s3bucket/output/scores/results.csv -[ 731.515006] cloud-init[1521]: Scores saved to /s3bucket/output/scores/results.csv -[ 731.515357] cloud-init[1521]: Summing up scores for current run: -[ 731.515782] cloud-init[1521]: task framework ... acc logloss -[ 731.516228] cloud-init[1521]: 0 micro-mass H2OAutoML_nightly ... 0.87931 0.649889 -[ 731.516671] cloud-init[1521]: [1 rows x 9 columns] -... -EC2 instance i-0cd081efc97c3bf6f is stopped -Job aws_validation_micro-mass_0_H2OAutoML_nightly executed in 819.305 seconds -[2019-01-22T12:01:34] checking job aws_validation_micro-mass_1_H2OAutoML_nightly on instance i-0251c1655e286897c: running -[2019-01-22T12:01:49] checking job aws_validation_micro-mass_1_H2OAutoML_nightly on instance i-0251c1655e286897c: running -EC2 instance i-0251c1655e286897c is stopping -Job aws_validation_micro-mass_1_H2OAutoML_nightly executed in 818.463 seconds -... -Terminating EC2 instances i-0251c1655e286897c -Terminated EC2 instances i-0251c1655e286897c with response {'TerminatingInstances': [{'CurrentState': {'Code': 32, 'Name': 'shutting-down'}, 'InstanceId': 'i-0251c1655e286897c', 'PreviousState': {'Code': 64, 'Name': 'stopping'}}], 'ResponseMetadata': {'RequestId': 'd09eeb0c-7a58-4cde-8f8b-2308a371a801', 'HTTPStatusCode': 200, 'HTTPHeaders': {'content-type': 'text/xml;charset=UTF-8', 'transfer-encoding': 'chunked', 'vary': 'Accept-Encoding', 'date': 'Tue, 22 Jan 2019 12:01:53 GMT', 'server': 'AmazonEC2'}, 'RetryAttempts': 0}} -Instance i-0251c1655e286897c state: shutting-down -All jobs executed in 2376.891 seconds -Deleting uploaded resources `['ec2/input/validation.yaml', 'ec2/input/config.yaml', 'ec2/input/frameworks.yaml']` from s3 bucket automl-benchmark. -``` - -### Output -By default, a benchmark run creates the following subdirectories and files in the output directory (by default a subdirectory of `./results` with unique name identifying the benchmark run): -* `scores`: this subdirectory contains - * `results.csv`: a global scoreboard, keeping scores from all benchmark runs. 
- For safety reasons, this file is automatically backed up to `scores/backup/results.{currentdate}.csv` by the application before any modification. - * individual score files keeping scores for each framework+benchmark combination (not backed up). -* `predictions`, this subdirectory contains the last predictions in a standardized format made by each framework-dataset combination. - Those last predictions are systematically backed up with current data to `predictions/backup` subdirectory before a new prediction is written. -* `logs`: this subdirectory contains logs produced by the `automlbenchmark` app, including when it's been run in Docker container or on AWS. - - -### Uploading results to OpenML -The `upload_results.py` script can be used to upload results to OpenML with the following usage: -```text ->python upload_results.py --help -usage: Script to upload results from the benchmark to OpenML. [-h] [-i INPUT_DIRECTORY] [-a APIKEY] [-m MODE] [-x] [-v] [-t TASK] - -optional arguments: - -h, --help show this help message and exit - -i INPUT_DIRECTORY, --input-directory INPUT_DIRECTORY - Directory that stores results from the runbenchmark.py invocation. By default use the most recent folder in the results folder as - specified in the configuration. - -a APIKEY, --api-key APIKEY - OpenML API key to use for uploading results. - -m MODE, --mode MODE Run mode (default=check). - • check: only report whether results can be uploaded. - • upload: upload all complete results. - -x, --fail-fast Stop as soon as a task fails to upload due to an error during uploading. - -v, --verbose Output progress to console. - -t TASK, --task TASK Only upload results for this specific task. -``` - -Note that the default behavior does not upload data but only verifies data is complete. -We strongly encourage you to only upload your data after verifying all expected results are complete. -The OpenML Python package is used for uploading results, so to ensure your API credentials are configured, please refer to their [configuration documentation](https://openml.github.io/openml-python/master/usage.html#installation-set-up). -Results obtained on tasks on the test server (e.g. through the `--test-server` option of `runbenchmark.py`) are uploaded to the test server and don't require additional authentication. - -## Advanced configuration -If you need to create your own benchmark, add a framework, create a plugin for a proprietary framework, or simply want to use some advanced options (e.g. run some frameworks with non-default parameters), see the [HOWTO]. - -## Issues -If you face any issue, please first have a look at the [Troubleshooting guide] and check the [existing issues](https://github.com/openml/automlbenchmark/issues). -Any new issue should also be reported there. - - -[HOWTO]: ./HOWTO.md -[Troubleshooting guide]: ./HOWTO.md#troubleshooting-guide -[examples/aws]: ../examples/aws/config.yaml - -[Docker]: https://docs.docker.com/ -[boto3]: https://boto3.readthedocs.io/ - - -## Frequently Asked Questions - -**When will results be updated, also for the new/updated frameworks?** - -We don't perform a benchmark evaluation for each new package or update. -Due to budget constraints, we can only do a limited number of evaluations. -The next full evaluation will be performed before the end of the year 2020. -We hope to find funding to guarantee regular evaluations. - ---- -**(When) will you add framework X?** - -We are currently not focused on integrating additional AutoML systems. 
-However, we process any pull requests that add frameworks and will assist with the integration. -The best way to make sure framework X gets included is to start with the integration yourself or encourage the package authors to do so (for technical details see [HOWTO]). - -It is also possible to open a Github issue indicating the framework you would like added. -Please use a clear title (e.g. "Add framework: X") and provide some relevant information (e.g. a link to the documentation). -This helps us keep track of which frameworks people are interested in seeing included. diff --git a/docs/about.md b/docs/about.md deleted file mode 100644 index c63eaa53c..000000000 --- a/docs/about.md +++ /dev/null @@ -1,50 +0,0 @@ ---- -title: About -layout: category -sidebar_sort_order: 10 ---- - -## Goals - -We want to provide an ongoing benchmark with up-to-date results on realistic and current machine learning problems. -By making it open-source and open to contributions, we hope that all packages will be used as intended and evaluated fairly. -Fair results for each framework are enabled by allowing authors to contribute directly to the repository. -To ensure the benchmark accurately reflects the state of AutoML, evaluations will be rerun when frameworks get major updates, -and the selection of problems will be updated1. - -Currently, we limit the datasets to involve single-label classification problems on i.i.d. tabular data optimizing for one of two metrics. -We would like to extend the types of tasks to include e.g. regression, multi-label classification and temporal data, -but also to include problem-specific metrics (e.g. have a false negative incur a higher cost than a false positive for a disease diagnosis problem). - -## Open Science -Open science is important to us. -This is a transparent benchmark: no favorites, no cheating. -We require that all evaluated AutoML systems are open-source and all data to be freely available on [OpenML](https://www.openml.org/). -All the code required to run the benchmark is available on [Github](https://github.com/openml/automlbenchmark). - -## Limitations -It is important to note that the current benchmark has some limitations. - -First, we evaluate the AutoML systems by their default settings, only specifying the resources to be used (number of cores, wallclock time and memory). -We do not tune their search space or optimization hyperparameters, even though all packages allow at least some tuning. -There are of course valid reasons to tune these settings, such as only allowing a subset of models that are most interpretable. -However, in a general sense we feel that requiring tuning of AutoML frameworks defeats the purpose of AutoML, and thus opt not to do so. -That said, tuning the search space or hyperparameter values may drastically change the results. -Our hope is that authors of AutoML packages put more thought in picking good default settings, possibly dependent on the task at hand. -Over time, we hope this becomes a non-issue. - -We must stress that this benchmark does *not* tell us what optimization technique is best. -For each package, the search space from which to construct a model is very different. -These differences are caused by many design differences. -These are differences in their representation of machine learning pipelines (e.g. fixed-length vs. unlimited-length), -by the underlying machine learning packages (e.g. scikit-learn vs. WEKA), -and even the selection of included algorithms and allowed hyperparameter values. 
-Finally some packages use meta-learning for warm-starting, or post-processing techniques to improve results. - -There are also qualities of frameworks which are not evaluated. -Perhaps the most interesting one is the convergence rate, or how good the any-time stopping performance is of each framework along the optimization process. -But other qualities, such as ease of use or level of support can also be important to some users. - - ---- -1 Due to the high (computational) cost involved, we need to find a balance here. \ No newline at end of file diff --git a/docs/automl_overview.md b/docs/automl_overview.md deleted file mode 100644 index 890bcfed5..000000000 --- a/docs/automl_overview.md +++ /dev/null @@ -1,334 +0,0 @@ ---- -layout: category -title: AutoML Systems -sidebar_sort_order: 2 ---- - -There is more to an AutoML system than just its performance. -An AutoML framework may only be available through an API for a specific programming language, while others can work stand-alone. -Some systems might output models which can be used without further dependency on the AutoML package, -in other cases the AutoML system is still required to use the model. -Some systems might be developed with a specific domain in mind. -When choosing an AutoML system, it is essential to consider things that are important to you. - -On this page a brief description and further references for the AutoML systems in the benchmark is provided. - -List of AutoML systems in the benchmark, in alphabetical order: - -- [auto-sklearn](#auto-sklearn) -- [Auto-WEKA](#auto-weka) -- [H2O AutoML](#h2o-automl) -- [TPOT](#tpot) - -There are many more AutoML frameworks, and unfortunately we could not yet evaluate them all. -While we hope to cover them in the comparison in the future, for now we will -Some other frameworks worth mentioning are, again in alphabetical order: - -- [autoxgboost](#autoxgboost) -- [FLAML](#flaml) -- [GAMA](#gama) -- [hyperopt-sklearn](#hyperopt-sklearn) -- [ML-Plan](#ml-plan) -- [mlr3automl](#mlr3automl) -- [oboe](#oboe) - -For completeness, the baseline methods are also described: - -- [Constant Predictor](#constant-predictor) -- [Random Forest](#random-forest) -- [Tuned Random Forest](#tuned-random-forest) - -##### Statement To Authors -We did our best to provide a reasonable description which highlights some unique or important aspects of each package. -If you want to change or add to the description and references of your AutoML package, please submit a pull request with your proposed changes. - -The description needs to be kept brief and factual. -The goal is to get an impression, based on which the reader can delve more in-depth in the provided documentation. - -If your AutoML framework is not on this page and feel it should be, please open a PR with the proposed addition. -Keep the formatting consistent with the rest of the page. - ------ - -# Included AutoML Frameworks - -## auto-sklearn -[source](https://github.com/automl/auto-sklearn) | -[documentation](http://automl.github.io/auto-sklearn/stable/) | -Python | -Optimization: Bayesian Optimization | -3-clause BSD - -> auto-sklearn is an automated machine learning toolkit and a drop-in replacement for a scikit-learn estimator. - -Auto-sklearn is declared the overall winner of the [ChaLearn AutoML](http://automl.chalearn.org/) Challenge -[1](https://docs.google.com/a/chalearn.org/viewer?a=v&pid=sites&srcid=Y2hhbGVhcm4ub3JnfGF1dG9tbHxneDoyYThjZjhhNzRjMzI3MTg4) -in 2015-2016 and -[2](https://www.4paradigm.com/competition/pakdd2018) -in 2017-2018. 
-It provides a scikit-learn-like interface in Python and uses Bayesian optimization to find good machine learning pipelines. - -It features automatic ensemble construction. -Meta-learning is used to warm-start the search procedure, this means that the search is more likely to start with good pipelines. - -#### Papers - -Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter (2015). -[Efficient and Robust Automated Machine Learning](http://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf) -*Advances in Neural Information Processing Systems 28 (NIPS 2015)*. - -## Auto-WEKA -[source](https://github.com/automl/autoweka) | -[documentation](http://www.cs.ubc.ca/labs/beta/Projects/autoweka/manual.pdf) | -Java, CLI, GUI | -Optimization: Bayesian Optimization | -GPLv3 - -> Our hope is that Auto-WEKA will help non-expert users to more effectively identify machine learning algorithms and -> hyperparameter settings appropriate to their applications, and hence to achieve improved performance. - -Auto-WEKA is built on the Java machine learning package [WEKA](http://www.cs.waikato.ac.nz/ml/weka/). -Auto-WEKA can be used through a graphical user interface, which means there is no need to use a terminal or programming language. -It is one of the first systems to consider joint algorithm selection and hyperparameter optimization in addition to preprocessing steps. - - - -#### Papers - -Lars Kotthoff, Chris Thornton, Holger Hoos, Frank Hutter, and Kevin Leyton-Brown (2017). -[Auto-WEKA 2.0: Automatic model selection and hyperparameter optimization in WEKA](http://www.cs.ubc.ca/labs/beta/Projects/autoweka/papers/16-599.pdf) -*JMLR. 18(25):1−5, 2017* - -Chris Thornton, Frank Hutter, Holger Hoos, and Kevin Leyton-Brown (2013). -[Auto-WEKA: Combined Selection and Hyperparameter Optimization of Classification Algorithms](http://www.cs.ubc.ca/labs/beta/Projects/autoweka/papers/autoweka.pdf) -*Proceedings of KDD 2013*. - - -## H2O AutoML -[source](https://github.com/h2oai/h2o-3) | -[documentation](http://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html) | -Python, R | -Optimization: Random Search | -Apache-2.0 - -> H2O’s AutoML can be used for automating the machine learning workflow, -> which includes automatic training and tuning of many models within a user-specified time-limit. - - -H2O AutoML performs Random Search followed by a stacking stage. -By default it uses the H2O machine learning package, which supports distributed training. - -#### Papers - -\- - - -## TPOT -[source](https://github.com/EpistasisLab/tpot) | -[documentation](https://epistasislab.github.io/tpot/) | -Python, CLI | -Optimization: Genetic Programming | -LGPL-3.0 - -> Consider TPOT your Data Science Assistant. -> TPOT is a Python Automated Machine Learning tool that optimizes machine learning pipelines using genetic programming. - -TPOT provides a scikit-learn-like interface for use in Python, but can be called from the command line as well. -It constructs machine learning pipelines of arbitrary length using scikit-learn algorithms and, optionally, xgboost. -In its search, preprocessing and stacking are both considered. -After the search, it is able to export python code so that you may reconstruct the pipeline without dependencies on TPOT. - -While technically pipelines can be of any length, TPOT performs multi-objective optimization: -it aims to keep the number of components in the pipeline small while optimizing the main metric. 
-TPOT features support for sparse matrices, multiprocessing and custom pipeline components. - -#### Papers - -Randal S. Olson, Ryan J. Urbanowicz, Peter C. Andrews, Nicole A. Lavender, La Creis Kidd, and Jason H. Moore (2016). -[Automating biomedical data science through tree-based pipeline optimization](http://dx.doi.org/10.1007/978-3-319-31204-0_9). -*Applications of Evolutionary Computation*, pages 123-137. - -Randal S. Olson, Nathan Bartley, Ryan J. Urbanowicz, and Jason H. Moore (2016). -[Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science](http://doi.acm.org/10.1145/2908812.2908918). -*Proceedings of GECCO 2016*, pages 485-492. - - - -# Other AutoML Frameworks - -## autoxgboost -[source](https://github.com/ja-thomas/autoxgboost) | -[documentation](https://github.com/ja-thomas/autoxgboost/blob/master/poster_2018.pdf) | -R | -Optimization: Bayesian Optimization | - - -> autoxgboost aims to find an optimal xgboost model automatically using the machine learning framework mlr and the bayesian optimization framework mlrMBO. - -Autoxgboost is different from most frameworks on this page in that it does not search over multiple learning algorithms. -Instead, it restricts itself to finding a good hyperparameter configuration for xgboost. -The exception to this is a preprocessing step for categorical variables, where the specific encoding strategy to use is tuned as well. - -#### Papers - -Janek Thomas, Stefan Coors and Bernd Bischl (2018). -[Automatic Gradient Boosting](https://arxiv.org/pdf/1807.03873v2.pdf) -*International Workshop on Automatic Machine Learning at ICML 2018* - -## FLAML -[source](https://github.com/microsoft/FLAML) | -[documentation](https://microsoft.github.io/FLAML/) | -Python | -Optimization: Configurable | -License MIT - -> FLAML is a lightweight Python library that finds accurate machine learning models efficiently and economically. - -FLAML is powered by a new, cost-effective hyperparameter optimization and learner selection method invented by Microsoft Research. FLAML leverages the structure of the search space to choose a search order optimized for both cost and error. -FLAML is fast and economical. The simple and lightweight design makes it easy to extend, such as adding customized learners or metrics. - -#### Papers - -Chi Wang, Qingyun Wu, Markus Weimer, and Erkang Zhu (2021). -[FLAML: A Fast and Lightweight AutoML Library](https://www.microsoft.com/en-us/research/publication/flaml-a-fast-and-lightweight-automl-library/) -*Proceedings of MLSys 2021* - -Qingyun Wu, Chi Wang, and Silu Huang (2021). -[Frugal Optimization for Cost-related Hyperparameters](https://www.microsoft.com/en-us/research/publication/frugal-optimization-for-cost-related-hyperparameters/) -*Proceedings of AAAI 2021* - -Chi Wang, Qingyun Wu, Silu Huang, and Amin Saied (2021). -[Economical Hyperparameter Optimization With Blended Search Strategy](https://www.microsoft.com/en-us/research/publication/economical-hyperparameter-optimization-with-blended-search-strategy/) -*The Ninth International Conference on Learning Representations (ICLR 2021)* - -## GAMA -[source](https://github.com/PGijsbers/gama) | -[documentation](https://pgijsbers.github.io/gama/) | -Python | -Optimization: Configurable | -License MIT - -> GAMA is an AutoML tool for end-users and AutoML researchers with a configurable AutoML pipeline. - -GAMA is a new framework under active development. 
-GAMA supports AutoML researchers through a configurable AutoML pipeline, extensive logging and visualization of the logs. -The configurable AutoML pipeline allows selection of the optimization and post-processing algorithms. - -By default GAMA searches over linear machine learning pipelines and create an ensemble of them as a post-processing step. -Currently pipelines can be optimized with an asynchronous evolutionary algorithm or [ASHA](https://arxiv.org/abs/1810.05934). - -#### Papers - -Pieter Gijsbers, Joaquin Vanschoren (2019). -[GAMA: Genetic Automated Machine learning Assistant](https://joss.theoj.org/papers/10.21105/joss.01132). -*Journal of Open Source Software, 4(33), 1132* - -## hyperopt-sklearn -[source](https://github.com/hyperopt/hyperopt-sklearn) | -[documentation](http://hyperopt.github.io/hyperopt-sklearn/) | -Python | -Optimization: Random Search, various SMBO | -3-clause BSD - -> Hyperopt-sklearn is Hyperopt-based model selection among machine learning algorithms in scikit-learn. - -Hyperopt-sklearn allows for different search strategies through a scikit-learn-like interface. -Besides random search, various sequential model based optimization (SMBO) techniques are available. -Amongst these are Tree of Parzen Estimators (TPE), Annealing and Gaussian Process Trees. - -#### Papers - -Komer, Brent, James Bergstra, and Chris Eliasmith (2014). -[Hyperopt-sklearn: automatic hyperparameter configuration for scikit-learn.](http://compneuro.uwaterloo.ca/files/publications/komer.2014b.pdf) -*ICML workshop on AutoML 2014*. - -## ML-Plan -[source](https://github.com/starlibs/AILibs) | -[documentation](https://starlibs.github.io/AILibs/projects/mlplan/) | -Java | -Optimization: Best-First Search on a search graph induced through Hierachical Task Network Planning | AGPL-3.0 - -> a new approach to AutoML based on hierarchical planning - -ML-Plan organizes the search space of possible solution candidates via Hierarchical Task Network (HTN) planning. -It works with both WEKA and scikit-learn backends and can be used to deal with classification, regression, multi-label classification, and remaining useful lifetime estimation tasks. -ML-Plan is under active development. - -#### Papers - -Felix Mohr, Marcel Wever and Eyke Hüllermeier (2018). -[ML-Plan: Automated machine learning via hierarchical planning](https://link.springer.com/article/10.1007/s10994-018-5735-z) -*Machine Learning 107(8):1495–1515* - -Marcel Wever, Felix Mohr and Eyke Hüllermeier (2018). -[ML-Plan for Unlimited-Length Machine Learning Pipelines](https://ris.uni-paderborn.de/download/3852/3853/38.pdf) -* ICML workshop on AutoML 2018*. - -Marcel Wever, Felix Mohr and Eyke Hüllermeier (2018). -[Automated Multi-Label Classification based on ML-Plan](https://arxiv.org/abs/1811.04060) -*arXiv preprint* - -Marcel Wever, Felix Mohr, Alexander Tornede and Eyke Hüllermeier (2019). -[Automating Multi-Label Classification Extending ML-Plan](https://ris.uni-paderborn.de/download/10232/13177/Automating_MultiLabel_Classification_Extending_ML-Plan.pdf) -* ICML workshop on AutoML 2019*. - -## mlr3automl -[source](https://github.com/a-hanf/mlr3automl) | -[documentation](https://github.com/a-hanf/mlr3automl/blob/master/vignettes/mlr3automl.md) | -R | -Optimization: Hyperband | License LGPL-3.0 - -> mlr3automl combines a static portfolio with Hyperband tuning. - -mlr3automl is built on top of mlr3. It combines a static portfolio of known successful pipelines -with Hyperband tuning. 
mlr3automl currently supports classification and regression tasks. - -#### Papers -\- - -## OBOE -[source](https://github.com/udellgroup/oboe) | -[documentation](https://github.com/udellgroup/oboe) | -Python | -Optimization: Collaborative Filtering | -License N/A - -> Oboe is a data-driven Python algorithmic system for automated machine learning, and is based on matrix factorization and classical experiment design. - -OBOE is still in early stages of development. -It focuses on finding a good initial set of pipelines from which to start further optimization. -The focus is on time-constrained model selection and hyperparameter tuning, using meta-learning to find good pipelines. - -OBOE searches for a good set of algorithm configurations to create an ensemble from, using meta-learning. -With collaborative filtering they estimate which algorithms are likely to do well on the new dataset. - -#### Papers - -Chengrun Yang, Yuji Akimoto, Dae Won Kim, Madeleine Udell (2018). -[OBOE: Collaborative Filtering for AutoML Initialization](https://arxiv.org/pdf/1808.03233.pdf). -*arXiv preprint*. - - -## Baselines - -We compare the performance of AutoML frameworks not only to each other, but also to three baseline methods, these are: - -## Constant Predictor -[source](https://github.com/openml/automlbenchmark/tree/master/frameworks/constantpredictor) - -Always predicts the class probabilities according to their occurrence in the dataset. - -## Random Forest -[source](https://github.com/openml/automlbenchmark/tree/master/frameworks/RandomForest) - -The [Random Forest Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html) of scikit-learn 0.20. -All hyperparameters are set to their default value, except for the number of estimators, which is set to *2000*. - -## Tuned Random Forest -[source](https://github.com/openml/automlbenchmark/tree/master/frameworks/TunedRandomForest) - -Uses the Random Forest setup as described above, but first optimizes the hyperparameter `max_features`. -It tries up to *11* different values of `max_features`. -Five values uniformly picked from `[1, sqrt(p))`, five values from `(sqrt(p), p]` and finally `sqrt(p)`, where `p` if the number of features in the dataset. - -It first evaluates `max_features=sqrt(p)` and then evaluates the other values in ascending order, until it completes them all or runs out of time. -Finally the model is fit to the entire training dataset with the best value for `max_features` according to the above cross-validation results. diff --git a/docs/benchmark_datasets.md b/docs/benchmark_datasets.md deleted file mode 100644 index b0ebfd36d..000000000 --- a/docs/benchmark_datasets.md +++ /dev/null @@ -1,61 +0,0 @@ ---- -layout: category -title: Benchmark Datasets -sidebar_sort_order: 3 ---- - -The benchmark aims to consist of datasets that represent real-world data science problems. -This means we want to include datasets of all sizes (including *big* ones), of different problem domains and with various levels of difficulty. - -We also want to prevent AutoML tools from overfitting to our benchmark. -For this reason, we plan to change the selection of benchmark problems over time. -This should help prevent (some of the) bias that can be introduced by static benchmarks. - -In our selection for the [paper](#paper.md), we drew datasets from [OpenML100](https://www.openml.org/s/14), [OpenML-CC18](https://www.openml.org/s/98) and [AutoML Challenges](http://automl.chalearn.org/data). 
-However, we did not include all datasets. -One reason was that some did not meet our criteria (more on that below), another that we wanted to keep some datasets of the future. -There are also a few datasets which we wanted to include, but could not include in the paper due to time constraints. - -## Criteria -As stated before, we did not adopt all proposed datasets but made a selection. -Our criteria for adopting a dataset were as follows: - -**difficulty** of the dataset has to be a sufficient. -If a problem is easily solved by just about any algorithm, it will not be able to differentiate the various AutoML frameworks. -This was the case for many of the OpenML 100 problems (see e.g. [this Github Issue](https://github.com/openml/OpenML/issues/491)), -but also some of the OpenML-CC18 problems (see e.g. [this task](https://www.openml.org/t/15)). - -**representative of real-world** data science problems to be solved with the tool. -In particular we **limit artificial** problems. -We included some, either based on their widespread use ([kr-vs-kp](https://www.openml.org/d/3)) or because they pose difficult problems. -But we do not want them to be a large part of the benchmark. -We also **limit image problems** because those problems are typically solved with solutions in the deep learning domain. -However they still make for realistic, interesting and hard problems, so we did not want to exclude them altogether. - -**diversity** in the problem domains. -We do not want the benchmark to skew towards any domain in particular. -There are various software quality problems in the OpenML-CC18 ( -[jm1](https://www.openml.org/d/1053), -[kc1](https://www.openml.org/d/1067), -[kc2](https://www.openml.org/d/1063), -[pc1](https://www.openml.org/d/1068), -[pc3](https://www.openml.org/d/1050), -[pc4](https://www.openml.org/d/1049)), but adopting them all would lead to a bias in the benchmark to this domain. - -*We want to note however that being notified of new interesting problems in a domain that is already well-represented is still useful, -because we want to eventually replace datasets in the benchmark.* - -**miscellaneous** reasons to *exclude* a dataset included label-leakage, near-duplicates (e.g. different only in categorical encoding or imputation) or violation of the i.i.d. assumption. - - - -## Final List -The first iteration of our benchmark as presented in the paper contained 39 classification datasets. -For the full list of datasets and their characteristics see [OpenML Study 218](https://www.openml.org/s/218) or its [table view](https://www.openml.org/search?q=tags.tag%3Astudy_218&type=data&table=1&size=39). - -## The Future -As stated before, we want the selection of benchmark problems to change over time. -If you find a good candidate dataset, you can [help us make it part of the benchmark](extending.md#adding-a-dataset). -While we are interested in all interesting datasets that match our criteria, we are particularly interested in bigger datasets (>100k rows). - -We greatly appreciate any help to find new and interesting problems for the AutoML benchmark. \ No newline at end of file diff --git a/docs/bib_workshop.md b/docs/bib_workshop.md deleted file mode 100644 index 5eeb9a23d..000000000 --- a/docs/bib_workshop.md +++ /dev/null @@ -1,14 +0,0 @@ ---- -title: BibTeX - AutoML @ ICML 2019 Paper ---- -``` -@article{amlb2019, - title={An Open Source AutoML Benchmark}, - author={Gijsbers, P. and LeDell, E. and Poirier, S. and Thomas, J. and Bischl, B. 
and Vanschoren, J.}, - journal={arXiv preprint arXiv:1907.00909 [cs.LG]}, - url={https://arxiv.org/abs/1907.00909}, - note={Accepted at AutoML Workshop at ICML 2019}, - year={2019} -} - -``` \ No newline at end of file diff --git a/docs/documentation.md b/docs/documentation.md deleted file mode 100644 index 45a863e88..000000000 --- a/docs/documentation.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -layout: category -title: Documentation -sidebar_sort_order: 4 ---- - - - [Running the Benchmark](README.md#running-benchmarks) - - [Adding a dataset](extending.md#adding-a-dataset) - - [Adding an AutoML Framework](extending.md#adding-an-automl-framework) \ No newline at end of file diff --git a/docs/extending.md b/docs/extending.md deleted file mode 100644 index 33aa01048..000000000 --- a/docs/extending.md +++ /dev/null @@ -1,98 +0,0 @@ ---- -title: Extending the benchmark ---- - -Whether you want to add a dataset or a framework to the benchmark, you will first have to [fork our repository](https://help.github.com/en/articles/fork-a-repo). -By forking our repository, you can make and test changes without affecting the benchmark. -If you feel your changes should be included in the benchmark, set up a [pull request](https://help.github.com/en/articles/about-pull-requests). -When creating a pull request, indicate clearly the changes and why they are made. - -## Adding a dataset - -### What makes a good dataset -Before discussing on *how* to add your dataset to the benchmark, we want to briefly elaborate on what we think makes for an interesting dataset. - -In our benchmark we aim to include machine learning problems which are representative of those encountered in practice. -In particular, problems of different domains, mixed data types and dataset sizes. -Currently, we would love some additional *big* datasets. - -Another important aspect for inclusion in the benchmark would be that it is a hard problem. -Even if the data is interesting, if a (near-)perfect model can be created with a decision tree, it is not going to be useful to profile the AutoML systems with. -Ideal datasets are those where only certain algorithms (with certain hyperparameter configurations) work, or require non-trivial data preprocessing. -Running a few different algorithms, with and without preprocessing, with different configurations, is encouraged to show the problem is sufficiently difficult. - -Perhaps your dataset does not match with the above description, or you lack the resources or know-how to evaluate the problem with different machine learning approaches. -If you think the problem is interesting regardless, do not hesitate to contact us anyway. -If possible, do this through a pull request as laid out in the following sections. -Otherwise, open an [issue](https://github.com/openml/automlbenchmark/issues). -Please title the issue '[DATA ADD] DATASETNAME' (replacing 'DATASETNAME' with the name of your dataset), -provide a link to the dataset on OpenML as well as motivation as to why you think the dataset is an interesting addition. -Following the steps below will make it more likely that we'll be able to review (and add) the dataset quickly. - -### Uploading to OpenML -To add a dataset to the benchmark, it needs to be uploaded to OpenML. -This requires the dataset in [ARFF format](https://www.cs.waikato.ac.nz/ml/weka/arff.html). 
-Read [here](https://docs.openml.org/#data) for more information on OpenML data, -and [here](https://www.openml.org/new/data) on how to actually upload it (this requires you to [sign up](https://www.openml.org/register) for OpenML). - -After uploading the dataset, visit its page on OpenML and create a [task](https://docs.openml.org/#tasks) for it. -An OpenML task specifies the evaluation procedure (e.g. splits of a 10-fold cross-validation) and the target of the problem. -To create a task for your OpenML dataset, visit its webpage and find the 'define new task' button at the bottom. -After these steps we are ready to add the problem to a benchmark. - -### Testing the task -First, to make sure everything was set up right, create a single-problem benchmark. -The easiest is to modify the [example benchmark](https://github.com/openml/automlbenchmark/blob/master/resources/benchmarks/example.yaml) by replacing the iris task information with your own. -Then run the benchmark: `python runbenchmark.py constantpredictor_enc example`. - - -Check results for errors. -If your task fails and it is unclear why, you can open an [issue](https://github.com/openml/automlbenchmark/issues). -If you do, please clearly indicate the related OpenML task id and steps to recreate it -and title the issue '[DATA HELP] DATASETNAME', replacing 'DATASETNAME' with the name of your dataset. - -### Adding it to the real thing -If you've made sure everything works, modify one of the existing benchmark or create a new one with your task. -When extending an existing benchmark, make sure not to modify any of the existing problems for the task. -Finally commit your changes and set up a pull request. - - -**Please make sure the PR does not include the changes made to `example.yaml`** - - -In your PR include: - - a link to the task and dataset on OpenML, where the OpenML dataset has meaningful meta-data (e.g. description) - - a motivation as to why this is an interesting addition to the benchmark. - Preferably address the points from the [What makes a good dataset](#what-makes-a-good-dataset) section. - The higher quality your motivation, the better we can come to a conclusion on whether to include the dataset or not. - - -## Adding an AutoML framework - -To add a new framework, create a new folder in the [frameworks folder](https://github.com/openml/automlbenchmark/tree/master/frameworks) (`/frameworks`). -In the package include at least a `__init__.py` file which exposes the method `run(Dataset, TaskConfig)` and optionally also `setup(*args)` and/or `docker_commands()` as documented [here](https://github.com/openml/automlbenchmark/blob/master/frameworks/__init__.py). - -For an example using a python-based AutoML tool, see e.g. the [TPOT](https://github.com/openml/automlbenchmark/tree/master/frameworks/TPOT) folder. -For an example using a non-python-based AutoML tool, see e.g. the [Auto-WEKA](https://github.com/openml/automlbenchmark/tree/master/frameworks/AutoWEKA) folder. - -Note that, as can be seen in the TPOT example, imputing the data before passing it to the framework is (currently) allowed. -The data is available in its regular form, but also in a numeric-only form (where string values are encoded with integers). - -Finally, add your framework to the [`framework.yaml`](https://github.com/openml/automlbenchmark/blob/master/resources/frameworks.yaml) file. 
-If at any point you run into issues or questions that aren't answered by the benchmark's documentation, -please open an [issue](https://github.com/openml/automlbenchmark/issues), the title of the issue should start with '[FW ADD]'. - -### Testing an AutoML framework - -To test if the implementation is successful, it is recommended to run the validation benchmark: -`python runbenchmark.py your_framework validation`. -This benchmark has tasks with a variety of interesting properties (e.g. missing values, different data types). - - -### Adding it to the real thing -If everything seems to work correctly, you're almost ready to set up a pull request. -But first, make sure you all documentation is up-to-date with your latest additions. -In particular, add or update the section on your AutoML framework in [the AutoML overview](https://github.com/openml/automlbenchmark/blob/master/docs/automl_overview.md). - -The title of your pull request when adding a new framework should be 'Add FRAMEWORK' where 'FRAMEWORK' should be replaced by the name of your framework. -If you are updating a framework, please title the pull request 'Update FRAMEWORK' similarly. \ No newline at end of file diff --git a/docs/extending/benchmark.md b/docs/extending/benchmark.md new file mode 100644 index 000000000..5331662fe --- /dev/null +++ b/docs/extending/benchmark.md @@ -0,0 +1,319 @@ +# Benchmark + +Benchmarks are collections of machine learning tasks, where each task is a dataset +with associated information on train/test splits used to evaluate the model. +These tasks can be defined in a `yaml` file or on [OpenML](https://www.openml.org). +Both options allow for defining a benchmark of one or more datasets. +It is even possible to reference to OpenML tasks from a benchmark file. + +!!! note "Supported Datasets" + + Currently, the AutoML benchmark only supports definitions of tabular datasets for + classification, regression, and time series forecasting. The time series forecasting + support is in an early stage, subject to change, and not supported through OpenML. + +## Defining a Benchmark on OpenML +Especially when performing a benchmark evaluation to be used in a publication, we +recommend the use of OpenML for the definition of the benchmark if possible. This +ensures that other users can run your benchmark out of the box, without any required +additional files. OpenML also provides a lot of meta-data about the datasets which is +also accessible through [APIs](https://www.openml.org/apis) in various programming +languages. We recommend using the [`openml-python`](https://openml.github.io/openml-python) +Python library as it is the most comprehensive of the OpenML libraries. + +Defining a benchmark on OpenML requires the following steps: + + - [Upload a dataset](https://openml.github.io/openml-python/main/examples/30_extended/create_upload_tutorial.html#sphx-glr-examples-30-extended-create-upload-tutorial-py). + A dataset is the tabular data, alongside meta-data like its name, + authors, and license. OpenML will also automatically extract meta-data about the + datasets, such as feature types, class balance, or dataset size. After uploading the + dataset, it will receive an identifier (`ID`) and should be visible on the OpenML + website on `www.openml.org/d/ID`. + - [Define a task](https://openml.github.io/openml-python/main/generated/openml.tasks.create_task.html#openml.tasks.create_task). + A task defines how to evaluate a model on a given dataset, for example + "10-fold cross-validation optimizing AUC". 
OpenML will generate splits for the 10-fold + cross-validation procedure which means that anyone using this task definition can + perform the experiment with the exact same splits easily. + - [Define a benchmark suite](https://openml.github.io/openml-python/main/examples/30_extended/suites_tutorial.html#sphx-glr-examples-30-extended-suites-tutorial-py). + On a technical level, a benchmarking suite is nothing more than a collection of tasks. + You can add a description that details the purpose of the benchmarking suite, or any + information that users should be aware of before using the suite. + +When a task or benchmark suite is available on OpenML, it can be directly referred to +for the `benchmark` parameter of `runbenchmark.py` as `openml/s/ID` for suites and +`openml/t/ID` for tasks, where `ID` is to be replaced with the OpenML identifier of the +object. For example, `openml/t/59` refers to [task 59](https://www.openml.org/t/59), +which is 10-fold cross-validation on the [iris dataset](https://www.openml.org/d/61). + +## Defining a Benchmark with a File + +When defining a benchmark with a `yaml` file, the `yaml` will contain information about +tasks that are located either on disk or on OpenML. We make a few default benchmarks +available in our [`resources/benchmarks`](GITHUB/resources/benchmarks) folder: + + * `test`: a collection of three small datasets covering regression, binary classification, + and multiclass classification. This makes it incredibly useful for small tests and + fast feedback on whether the software runs without error. + * `validation`: a collection of datasets which have different edge cases, such as a + very wide dataset, datasets with missing or non-numerical values, and more. This + typically produces most errors you might also encounter when running larger + benchmarks. + * `timeseries`: a benchmark for testing time series forecasting integration (experimental). + +Below is an excerpt from the `test.yaml` file: + +```yaml +- name: kc2 + openml_task_id: 3913 + description: "binary test dataset" +``` + +When writing your own benchmark definition, it needs to be discoverable by the benchmark. +A good place to do this would be adding a `benchmarks` directory to your benchmark +configuration directory (`~/.config/automlbenchmark` by default) and updating your +[custom configuration](../../using/configuration/#custom-configurations) by adding: + +```yaml +benchmarks: + definition_dir: + - '{root}/resources/benchmarks' + - '{user}/resources/benchmarks' +``` + +Each task must have a name that is unique in the definition file (case-insensitive), +this name will also be used as identifier (e.g., in the results files). +Additionally, the file must have a description of where to find the dataset files +and splits. When you have a task already on OpenML, you can directly reference it with +`openml_task_id` to define the dataset and splits. Alternatively, you can use local files. + +It is also possible to benchmark your own datasets that you can not or do not want to +upload to OpenML. The data files should be in `arff` or `csv` format and contain at least +one file for training data and one file for test data. When working with multiple files, +it is useful to use an archive (`.zip`, `.tar`, `.tgz`, `.tbz`) or directory structure. 
+Use the following naming convention to allow the AutoML benchmark to infer what each file represents: + + - if there's only one file for training and one for test, they should be named `{name}_train.csv` and `{name}_test.csv` (in case of CSV files). + - if there are multiple `folds`, they should follow a similar convention: `{name}_train_0.csv`, `{name}_test_0.csv``, {name}_train_1.csv`, `{name}_test_1.csv`, ... + +Examples: + +=== "Single Fold CSV" + + ```yaml + - name: example_csv + dataset: + train: /path/to/data/ExampleTraining.csv + test: /path/to/data/ExampleTest.csv + target: TargetColumn + folds: 1 + ``` + +=== "Multiple Folds CSV" + + ```yaml + - name: example_multi_folds + dataset: + train: + - /path/to/data/ExampleTraining_0.csv + - /path/to/data/ExampleTraining_1.csv + test: + - /path/to/data/ExampleTest_0.csv + - /path/to/data/ExampleTest_1.csv + target: TargetColumn + folds: 2 + ``` + +=== "Directory" + + It is important that the files in the directory follow the naming convention described above. + + ```yaml + - name: example_dir + dataset: + path: /path/to/data + target: TargetColumn + folds: 1 + ``` + +=== "Archive" + + It is important that the files in the archive follow the naming convention described above. + + ```yaml + - name: example_archive + dataset: + path: /path/to/archive.zip + target: TargetColumn + folds: 3 + ``` + +=== "Remote Files" + + The remote file may also be an archive. If that is the case, it is important that + the files in the archive follow the naming convention described above. + + ```yaml + - name: example_csv_http + dataset: + train: https://my.domain.org/data/ExampleTraining.csv + test: https://my.domain.org/data/ExampleTest.csv + target: TargetColumn + folds: 1 + ``` + + Remote files are downloaded to the `input_dir` folder and archives are decompressed + there as well. You can change the value of this folder in your + [custom config.yaml file](../../using/configuration/#custom-configurations) + or specify it at the command line with the `-i` or `--indir` argument + (by default, it points to the `~/.openml/cache` folder). + + +The `target` attribute is optional but recommended. If not set, it will resolve to the +column `target` or `class` if present, and the last column otherwise. + +You can even make use of the [special directives](../../using/configuration/#custom-configurations) like `{user}`. + +```yaml +- name: example_relative_to_user_dir + dataset: + train: "{user}/data/train.csv" + test: "{user}/data/test.csv" +``` + +After creating a benchmark definition, e.g. `~/.config/automlbenchmark/benchmarks/my_benchmark.yaml`, +it can then be referenced when running `runbenchmark.py`: `python runbenchmark.py FRAMEWORK my_benchmark`. + +## Defining a Time Series Forecasting Dataset + +!!! warning "Time Series Forecasting should be considered experimental" + + Time series forecasting support should be considered experimental and is currently + only supported with the AutoGluon integration. + +Benchmark definitions for time series datasets work in much the same way, but there are +some additional fields and requirements to a valid time series dataset. + +First, the dataset must be stored as a single csv file in +[long format](https://doc.dataiku.com/dss/latest/time-series/data-formatting.html#long-format) +and must include 3 columns: + + - `id_column`: An indicator column that specifies to which time series the sample belongs by a unique id. + The default expected name of this column is "item_id". 
+ - `timestamp_column`: A column with the timestamp of the observation. + The default expected name of this column is "timestamp". + - `target`: A column with the target value of the time series + +Additionally, the data must satisfy the following criteria: + + - The shortest time series in the dataset must have length of at least `folds * forecast_horizon_in_step + 1` (see [Generated Folds](#generated-folds)). + - Time series may have different lengths or have different starting timestamps, + but must have the same frequency. + - All time series must have regular timestamp index, i.e., it must have an observation + for each time step from start to end. + +If the `id_column` or `timestamp_column` names are not the default expected ones, +they must be explicitly stated in the definition, as can be seen in the examples below. +Moreover, the definition must also contain the following fields: + + - `path`: a local or remote path to the CSV file with time series data. + - `freq`: a [pandas-compatible frequency string](https://pandas.pydata.org/docs/user_guide/timeseries.html#offset-aliases) + that denotes the frequency of the time series. For example, `D` for daily, `H` for hourly, or `15min` for 15-minute frequency. + - `forecast_horizon_in_steps`: a positive integer denoting how many future time series values need to be predicted. + - `seasonality`: a positive integer denoting the seasonal period of the data, measured in steps. + This parameter is used for computing metrics like [mean absolute scaled error](https://en.wikipedia.org/wiki/Mean_absolute_scaled_error#Seasonal_time_series) (denoted as *m* on Wikipedia). + + +=== "Default Column Names" + + Given a file at `path.to/data.csv` that contains two time series with daily frequency, + `A` with three observations and `B` with four observations: + + | item_id | timestamp | target | + |---------|-----------|--------:| + | A | 2020-01-01| 2.0 | + | A | 2020-01-02| 1.0 | + | A | 2020-01-03| 5.0 | + | B | 2019-05-02| 8.0 | + | B | 2019-05-03| 2.0 | + | B | 2019-05-04| 1.0 | + | B | 2019-05-05| 9.0 | + + When we specify the fields outlined above, then the respective task definition may + look like the one below. Note that we do not have to specify `id_column` or + `timestamp_column` as their names match the default expected value. + + ```yaml + - name: example_time_series_data + dataset: + path: /path/to/data.csv + freq: D + forecast_horizon_in_steps: 1 + seasonality: 7 + target: target + folds: 1 + ``` + + + +=== "Non-default Column Names" + + Given a file at `path.to/data.csv` that contains two time series with daily frequency, + `A` with three observations and `B` with four observations. It is identical to + the example "default column values", but the header provides different column names: + + | Product | Date | Value | + |---------|-----------|--------:| + | A | 2020-01-01| 2.0 | + | A | 2020-01-02| 1.0 | + | A | 2020-01-03| 5.0 | + | B | 2019-05-02| 8.0 | + | B | 2019-05-03| 2.0 | + | B | 2019-05-04| 1.0 | + | B | 2019-05-05| 9.0 | + + When we specify the fields outlined above, then the respective task definition may + look like the one below. Note that we do *have to* specify `id_column` or + `timestamp_column` as their names do not match the default expected value. If left + unspecified, the benchmark tool will raise an error. 
+
+    ```yaml
+    - name: example_time_series_data
+      dataset:
+        path: /path/to/data.csv
+        freq: D
+        forecast_horizon_in_steps: 1
+        seasonality: 7
+        id_column: Product
+        timestamp_column: Date
+        target: Value
+      folds: 1
+    ```
+
+
+
+### Generated Folds
+
+AMLB automatically generates the train and test splits from the raw data depending
+on the chosen `forecast_horizon_in_steps` and `folds` parameters. Assuming
+`forecast_horizon_in_steps = K` and `folds = n`, and each time series has length `n * K`,
+the folds will be generated as follows:
+
+ rows | fold 0 | fold 1 | ... | fold (n-2) | fold (n-1)
+ -- | -- | -- | -- | -- | --
+ 1..K | train | train | ... | train | train
+ K..2K | train | train | ... | train | test
+ 2K..3K | train | train | ... | test |
+ ... | | | | |
+ (n-2)K..(n-1)K | train | test | | |
+ (n-1)K..nK | test | | | |
+
+As a consequence, the shortest time series in the dataset must have a length of at least
+`folds * forecast_horizon_in_steps + 1`.
+
+!!! warning "This is still batch learning!"
+
+    It is important to note that the model does not carry over between folds; for each fold,
+    the model is trained from scratch on the available training data. As such, it is
+    still batch learning, as opposed to [train-then-test](https://scikit-multiflow.readthedocs.io/en/stable/user-guide/core-concepts.html)
+    (or prequential) evaluation where a single model is continuously updated instead.
+ 
\ No newline at end of file
diff --git a/docs/extending/constraint.md b/docs/extending/constraint.md
new file mode 100644
index 000000000..7d6497d37
--- /dev/null
+++ b/docs/extending/constraint.md
@@ -0,0 +1,71 @@
+# Constraints
+
+Constraint definitions allow a set of common constraints to be applied to all tasks in
+a benchmark. Default constraint definitions are available in
+[`resources/constraints.yaml`](GITHUB/resources/constraints.yaml).
+When no constraint is specified at the command line, the `test` constraint definition is used by default.
+
+A constraint definition can consist of the following constraints:
+
+- `folds` (default=10): The number of folds to evaluate for the task. It has to be less than or equal to the number of folds defined by the task.
+- `max_runtime_seconds` (default=3600): maximum time in seconds for each individual fold of a benchmark task.
+  This parameter is usually passed directly to the framework. If it doesn't respect the
+  constraint, the application will abort the task after `2 * max_runtime_seconds`.
+  In any case, the _actual_ time used is always recorded and available in the results.
+- `cores` (default=-1): number of cores used for each AutoML task. If non-positive, it will try to use all cores.
+- `max_mem_size_mb` (default=-1): amount of memory assigned to each AutoML task.
+  If non-positive, the amount of memory is computed from the memory available to the OS.
+- `min_vol_size_mb` (default=-1): minimum amount of free space required on the volume. If non-positive, verification is skipped. If the requirement is not fulfilled, a warning message will be printed, but the task will still be attempted.
+- `ec2_volume_type`: The volume type to use for the task when using EC2 instances; if unset, it defaults to the value of `aws.ec2.volume_type` in your configuration file.
+
+!!! warning "Constraints are not enforced!"
+
+    These constraints are forwarded to the AutoML framework if possible but are
+    generally not enforced. Not all AutoML frameworks allow, e.g., memory limits
+    to be set, and not all implementations that do treat them as a hard constraint.
+    For that reason, only `max_runtime_seconds` is enforced as described above.
+    When benchmarking, it is advised to use an environment that mimics the given constraints.
+
+??? info "Constraints can be overridden by `benchmark`"
+
+    A benchmark definition can override constraints on a task level.
+    This is useful if you want to define a benchmark which has different constraints
+    for different tasks. The default "test" benchmark does this to limit runtime to
+    60 seconds instead of 600 seconds, which is useful to get quick results for its
+    small datasets. For more information, see [defining a benchmark](#ADD-link-to-adding-benchmark).
+
+
+When writing your own constraint definition, it needs to be discoverable by the benchmark.
+A good place to do this would be adding a `constraints.yaml` file to your benchmark
+configuration directory (`~/.config/automlbenchmark` by default) and updating your
+[custom configuration](../../using/configuration/#custom-configurations) by adding:
+
+```yaml
+benchmarks:
+  constraints_file:
+    - '{root}/resources/constraints.yaml'
+    - '{user}/constraints.yaml'
+```
+
+You can then define multiple constraints in your constraint file, for example:
+```yaml title="{user}/constraints.yaml"
+---
+
+test:
+  folds: 1
+  max_runtime_seconds: 120
+
+8h16c:
+  folds: 10
+  max_runtime_seconds: 28800
+  cores: 16
+  min_vol_size_mb: 65536
+  ec2_volume_type: gp3
+```
+
+The new constraints can now be passed on the command line when executing the benchmark:
+```bash
+python runbenchmark.py randomforest validation 8h16c
+```
+*Note*: The above example is _allowed_ to run for 8 hours, but will stop earlier as
+`RandomForest` stops early after training 2000 trees.
\ No newline at end of file
diff --git a/docs/extending/framework.md b/docs/extending/framework.md
new file mode 100644
index 000000000..ffd66f60d
--- /dev/null
+++ b/docs/extending/framework.md
@@ -0,0 +1,415 @@
+# Adding an AutoML Framework
+
+!!! warning "Rewrite in progress"
+
+    Most information on this page is accurate, and it should be complete enough to use.
+    However, it hasn't been updated to make use of `mkdocs-material` features, and
+    _might_ have some outdated examples. Contributions welcome.
+
+## Add an AutoML framework
+
+Adding an AutoML framework consists of several steps:
+
+ 1. create a Python module that will contain everything related to the integration of this framework.
+ 1. define the framework in a [Framework definition](#framework-definition) file.
+ 1. write some integration code
+    - to download/set up the framework dynamically: by convention, this is done by a `setup.sh` script defined in the module.
+    - to run the framework using the data and constraints/parameters provided by the benchmark application: by convention, this is done by an `exec.py` script in the module, but it may require more files depending on the framework, for example if it runs on Python, R, Java, ...
+
+
+### Framework definition
+
+The framework definition consists of an entry in a `yaml` file with the framework name and some properties:
+
+ 1. to describe the framework and define which version will be used: `project`, `version`.
+ 1. to indicate the Python module with the integration code: `module` or `extends`.
+ 1. to pass optional parameters to the framework and/or the integration code: `params`.
+
+Default framework definitions are defined in lexicographic order in the file `resources/frameworks.yaml`,
+where `version` should be set to `stable`, which will point dynamically to the most recent official release available.
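+
+For illustration, a minimal default definition could look like the sketch below. This is a
+hypothetical entry (the framework name, project URL and module are placeholders), not an
+actual entry from `resources/frameworks.yaml`:
+
+```yaml
+MyFramework:
+  module: frameworks.MyFramework     # integration module under the frameworks/ folder
+  project: https://github.com/example/myframework
+  version: 'stable'                  # default definitions point to the latest official release
+```
+
+With such an entry in place, the framework can be selected by name, e.g. `python runbenchmark.py myframework`.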
+ +Frameworks that offer the possibility to test cutting edge version (e.g. nightly builds, +`dev`/`master` repo, ...) can add an entry to `resources/frameworks_latest.yaml`, where `version` should be set to `latest`. + +Maintainers of this repository try to regularly — ideally, every quarter — create a +framework definition using frozen framework versions in order to favour the reproducibility of the published benchmarks. + +Following the [custom configuration](../using/configuration.md#custom-configurations), +it is possible to override and/or add a framework definitions by creating a `frameworks.yaml` file in your `user_dir`. + +See for example the `examples/custom/frameworks.yaml`: + +```yaml +--- + +GradientBoosting: + module: extensions.GradientBoosting + project: https://scikit-learn.org/stable/modules/ensemble.html#gradient-boosting + params: + n_estimators: 500 + +Stacking: + module: extensions.Stacking + project: https://scikit-learn.org/stable/modules/ensemble.html#stacking + params: + _rf_params: {n_estimators: 200} + _gbm_params: {n_estimators: 200} + _linear_params: {penalty: elasticnet, loss: log} +# _svc_params: {tol: 1e-3, max_iter: 1e5} +# _final_params: {penalty: elasticnet, loss: log} # sgd linear + _final_params: {max_iter: 1000} # logistic/linear + +autosklearn_latest: + extends: autosklearn + version: latest + description: "this will use master branch from the autosklearn repository instead of the fixed version" + +autosklearn_mybranch: + extends: autosklearn + version: mybranch + description: "this will use mybranch branch from the autosklearn repository instead of the fixed version" + +autosklearn_oldgen: + extends: autosklearn + version: "0.7.1" + description: "this will use the latest autosklearn version from the old generation" + +H2OAutoML_nightly: + module: frameworks.H2OAutoML + setup_cmd: 'LATEST_H2O=`curl http://h2o-release.s3.amazonaws.com/h2o/master/latest` && pip install --no-cache-dir -U "http://h2o-release.s3.amazonaws.com/h2o/master/${{LATEST_H2O}}/Python/h2o-3.29.0.${{LATEST_H2O}}-py2.py3-none-any.whl"' + version: 'nightly' + +H2OAutoML_custom: + extends: H2OAutoML + params: + nfolds: 3 + stopping_tolerance: 0.05 +``` + +This example shows + +- the definitions for 2 new frameworks: `GradientBoosting` and `Stacking`. + - Those definitions (optionally) externalize some parameters (e.g. `n_estimators`): the `params` property always appears in json format in the results, so that we can clearly see what has been tuned when analyzing the results later. + - Note that the module is case sensitive and should point to the module containing the integration code. + - The application will search for modules from the sys path, which includes the application `root_dir` and the `user_dir`: + - that's why the default frameworks use `module: frameworks.autosklearn` for example, + - and the example above can use `module: extensions.GradientBoosting` because those examples must be run by setting the `user_dir` to `examples/config`, e.g. + > `python runbenchmark.py gradientboosting -u examples/custom`. +- a custom definition (`H2OAutoML_nightly`) for the existing `frameworks.H2OAutoML` module, allowing to reuse the module for a dynamic version of the module: + - the `setup_cmd` is executed after the default setup of the module, so it can be used to make additional setup. To customize the setup, it is possible to use: + - `setup_args: my_version` (only if the `setup.sh` in the framework module supports new arguments). + - `setup_cmd` (as shown in this example). 
+ - `setup_script: my_additional_setup.sh`. +- 2 custom definitions (`H2OAutoML_blending` and `H2OAutoML_custom`) simply extending the existing `H2OAutoML` definition (therefore inheriting from all its properties, including the `module` one), but overriding the `params` property, thus allowing to provide multiple "flavours" of the same framework. + +The frameworks defined in this example can then be used like any other framework as soon as both the framework module and the definition file are made available to the application: in our case, this is done by the creation of the integration modules under `examples/custom/extensions` and by exposing the definitions in `examples/custom/frameworks.yaml` thanks to the entry in `examples/custom/config.yaml`: +```yaml +frameworks: + definition_file: # this allows to add custom framework definitions (in {user}/frameworks.yaml) on top of the default ones. + - '{root}/resources/frameworks.yaml' + - '{user}/frameworks.yaml' +``` + +By pointing the `user_dir` to `examples/custom`, our `config.yaml` is also loaded, and we can use the new frameworks: +```bash +python runbenchmark.py gradientboosting -u examples/custom +python runbenchmark.py stacking -u examples/custom +python runbenchmark.py h2oautoml_blending -u examples/custom +``` + +*Note:* + +By default, when generating a docker image, the image name is created as `automlbenchmark/{framework}:{version}-{branch}` with the framework name in lowercase, and `branch` being the branch of the `automlbenchmark` app (usually `stable`). +However, it is possible to customize this image name as follow: +```yaml +MyFramework: + version: 1.0 + module: extensions.MyFramework + docker: + author: my_docker_repo + image: my_image + tag: my_tag +``` +which will result in the docker image name `my_docker_repo/my_image:my_tag-{branch}`, with `branch` still being the branch of the application. + + +### Framework integration + +If the framework definition allows to use the new framework from the application, the (not so) hard part is to integrate it. + +There are already several frameworks already integrated under `frameworks` directory (+ the examples under `examples/custom`), so the best starting point when adding a new framework is to first look at the existing ones. + +Among the existing frameworks, we can see different type of integrations: + +- trivial integration: these are frameworks running on Python and using dependencies (`numpy`, `sklearn`) already required by the application itself. These are not really AutoML toolkits, but rather integrations using `sklearn` to provide a reference when analyzing the results: cf. `constantpredictor`, `DecisionTree`. +- Python API integration: these are frameworks that can be run directly from Python: cf. `autosklearn`, `H2OAutoML`, `TPOT`, `RandomForest`, `TunedRandomForest`. + - contrary to the trivial integration, those require a `setup` phase. + - Most of them currently run using the same dependencies as the application, which is not recommended due to potential version conflicts (especially with `sklearn`). This was not a major constraint with the first frameworks implemented, but now, those integrations can and will be slightly changed to [run in their dedicated virtual environment], using their own dependencies: cf. `RandomForest` and `examples/custom/extensions/Stacking` for examples. +- non-Python frameworks: those frameworks typically run in `R` or `Java` and don't provide any Python API. 
The integration is then still done by spawning the `Java` or `R` process from the `exec.py`: cf. `AutoWEKA` or `ranger`, respectively. + +#### Recommended structure + +By convention, the integration is done using the following structure: + +```text +frameworks/autosklearn/ +|-- __init__.py +|-- exec.py +|-- requirements.txt +`-- setup.sh +``` + +Please note however, that this structure is not a requirement, the only requirement is the contract exposed by the integration module itself, i.e. by the `__init__.py` file. + +A simple `__init__.py` would look like this: + +```python +from amlb.utils import call_script_in_same_dir + + +def setup(*args, **kwargs): + call_script_in_same_dir(__file__, "setup.sh", *args, **kwargs) + + +def run(*args, **kwargs): + from .exec import run + return run(*args, **kwargs) + +``` + +where we see that the module should expose (only `run` is actually required) the following functions: + +- `setup` (optional): called by the application to setup the given framework, usually by simply running a `setup.sh` script that will be responsible for potentially creating a local virtual env, downloading and installing the dependencies. + The `setup` function can also receive the optional `setup_args` param from the [framework definition](#framework-definition) as an argument. +- `run`: called by the benchmark application to execute a task against the framework, using the selected dataset and constraints. We will describe the parameters in detail below, for now, just note that by convention, we just load the `exec.py` file from the module and delegate the execution to its `run` function. +- `docker_commands` (optional): called by the application to collect docker instructions that are specific to the framework. If the framework requires a `setup` phase, then the string returned by this function should at least ensure that the setup is also executed during the docker image creation, that's one reason why it is preferable to do all the setup in a `setup.sh` script, to allow the docker support above. + +#### Frameworks with Python API + +##### Frameworks requiring a dedicated virtual env + +For frameworks with Python API, we may worry about version conflicts between the packages used by the application (e.g. `sklearn`, `numpy`, `pandas`) and the ones required by the framework. + +In this case, the integration is slightly different as you can see with the `RandomForest` integration allowing to use any version of `sklearn`. + +This is the basic structure after the creation of the dedicated Python virtual environment during setup: +```text +frameworks/RandomForest/ +|-- __init__.py +|-- exec.py +|-- requirements.txt +|-- setup.sh +`-- venv/ + `-- (this local virtual env is created by the frameworks/shared/setup.sh) +``` + +Noticeable differences with a basic integration: + +- the `venv` is created in `setup.sh` by passing the current dir when sourcing the `shared/setup.sh` script: `. $HERE/../shared/setup.sh $HERE`. +- the `run` function in `__init__.py` prepares the data (in the application environment) before executing the `exec.py` in the dedicated `venv`. The call to `run_in_venv` is in charge of serializing the input, calling `exec.py` and deserializing + saving the results from `exec`. +- `exec.py`, when calls in the subprocess (function `__main__`), calls `call_run(run)` which deserializes the input (dataset + config) and passes it to the `run` function that just need to return a `result` object. 
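+
+To make this contract concrete, below is a minimal, hypothetical `exec.py` sketch for such a
+venv-based integration. The import path of the shared helpers (`call_run`, `result`) and the exact
+fields accepted by `result` follow the existing integrations (e.g. `frameworks/RandomForest/exec.py`),
+so double-check them there; the `DummyClassifier` merely stands in for the real AutoML framework call.
+
+```python
+from frameworks.shared.callee import call_run, result  # shared helpers used by existing integrations
+from sklearn.dummy import DummyClassifier  # stand-in for the actual AutoML framework
+
+
+def run(dataset, config):
+    # `dataset` and `config` are deserialized by `call_run` from the payload
+    # sent by `run_in_venv` in the module's `__init__.py`.
+    X_train, y_train = dataset.train.X, dataset.train.y
+    X_test, y_test = dataset.test.X, dataset.test.y
+
+    # Replace this with the real framework call, honouring config.max_runtime_seconds,
+    # config.cores, config.metric, config.seed, ...
+    model = DummyClassifier(strategy="prior").fit(X_train, y_train)
+    predictions = model.predict(X_test)
+
+    # `result` gathers what the application needs to score and save the predictions;
+    # see the existing integrations for the complete list of accepted fields.
+    return result(
+        output_file=config.output_predictions_file,
+        predictions=predictions,
+        truth=y_test,
+    )
+
+
+if __name__ == '__main__':
+    call_run(run)
+```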
+ +*Note A*: + +As the serialization/deserialization of `numpy` arrays can be costly for very large datasets, it is recommended to use dataset serialization only if the framework itself doesn't support loading datasets from files. + +This means that, in the `__init__.py` instead of implementing `run` as: +```python +data = dict( + train=dict( + X=dataset.train.X, + y=dataset.train.y + ), + test=dict( + X=dataset.test.X, + y=dataset.test.y + ) +) + +return run_in_venv(__file__, "exec.py", + input_data=data, dataset=dataset, config=config) +``` +it could simply expose the dataset paths (the application avoids loading the data if not explicitly needed by the framework): +```python +data = dict( + target=dict(name=dataset.target.name), + train=dict(path=dataset.train.path), + test=dict(path=dataset.test.path) +) +return run_in_venv(__file__, "exec.py", + input_data=data, dataset=dataset, config=config) +``` + +*Note B*: + +The serialization/deserialization of data between the main process and the framework process can be customized using the `options` parameter: +The allowed options for (de)serialization are defined by the object `amlb.utils.serialization.ser_config`. + +For example: +```python +data = dict( + train=dict( + X=dataset.train.X, + y=dataset.train.y + ), + test=dict( + X=dataset.test.X, + y=dataset.test.y + ) +) + +options = dict( + serialization=dict(sparse_dataframe_deserialized_format='dense') +) +return run_in_venv(__file__, "exec.py", + input_data=data, dataset=dataset, config=config, options=options) +``` + + + +#### Other Frameworks + +Integration of frameworks without any Python API is done in similar way, for example: + +```text +frameworks/AutoWEKA/ +|-- __init__.py +|-- exec.py +|-- requirements.txt +|-- setup.sh +`-- lib/ + `-- (this is where the framework dependencies go, usually created by setup.sh) +``` +or +```text +frameworks/ranger/ +|-- __init__.py +|-- exec.R +|-- exec.py +|-- requirements.txt +`-- setup.sh +``` + +Here are the main differences: +- the `setup` phase is identical, but if at runtime, some executable file or library is required that need to be installed locally (as opposed to globally: for example, `R` or `java` executable are usually installed globally), we just recommend to put everything under the integration module (for example in `lib` and/or `bin` subfolders as for `AutoWEKA`). This is also true for some Python frameworks (cf. `hyperoptsklearn` integration for example, where the modules are loaded from `frameworks/hyperoptsklearn/lib/hyperopt-sklearn`). +- the framework is then executed by building a command manually in `exec.py`, running it in a subprocess, and finally collecting the results generated by the subprocess. For example, in `ranger/exec.py`: + ```python + with Timer() as training: + run_cmd(("Rscript --vanilla -e \"" + "source('{script}'); " + "run('{train}', '{test}', '{output}', cores={cores}, meta_results_file='{meta_results}', task_type='{task_type}')" + "\"").format( + script=os.path.join(here, 'exec.R'), + train=dataset.train.path, + test=dataset.test.path, + output=config.output_predictions_file, + meta_results=meta_results_file, + task_type=config.type, + cores=config.cores + ), _live_output_=True) + ``` + Here, the `exec.R` script is also responsible to save the predictions in the expected format. 
+
+
+#### Add a default framework
+
+A "default framework" is an AutoML framework whose integration is available on the `master` branch under the `frameworks` folder, and which has a simple definition in `resources/frameworks.yaml`.
+
+*NOTE:*
+There are a few requirements when integrating a new default framework:
+
+- The code snippet triggering the training should use only defaults (no AutoML hyperparameters), plus possibly a generic `**kwargs` in order to support the `params` section in custom framework definitions. In other words, one of the requirements for being included in the benchmark is that the framework is submitted without any tweaks to default settings. This is to prevent submissions (systems) from overfitting or tuning to the benchmark.
+- There must be a way to limit the runtime of the algorithm (a maximum runtime parameter).
+- Exceptions:
+  - the problem type ("classification", "regression", "binary", "multiclass"): this is available through `config.type` or `dataset.type`.
+  - information about data, for example the column types: available through the `dataset` object.
+  - time, CPU and memory constraints: those must be provided by the benchmark application through the `config` object.
+  - the objective function: provided by `config.metric` (usually requires a translation for a given framework).
+  - seed: provided by `config.seed`.
+  - paths to folders (output, temporary...): if possible, use `config.output_dir` or a subfolder (see existing integrations).
+- The default framework definition in `resources/frameworks.yaml` shouldn't have any `params` section: this `params` section is intended for custom definitions, not default ones.
+```yaml
+good_framework:
+  version: "0.0.1"
+  project: "http://go.to/good_framework"
+
+bad_framework:
+  version: "0.0.1"
+  project: "http://go.to/bad_framework"
+  params:
+    enable_this: true
+    use: ['this', 'that']
+```
+
+Using the instructions above:
+
+ 1. verify that there is an issue created for the framework you want to add, or create one.
+ 1. create a private branch for your integration changes.
+ 1. create the framework module (e.g. `MyFramework`) under the `frameworks` folder.
+ 1. define the module (if possible without any `params`) in `resources/frameworks.yaml`.
+ 1. try to set up the framework:
+    > python runbenchmark.py myframework -s only
+ 1. fix the framework setup until it works: since the setup is usually a simple `setup.sh` script, you should be able to test it directly without using the application.
+ 1. try to run a simple test against one fold using defaults (`test` benchmark and `test` constraints) with the `-Xtest_mode` flag, which triggers additional validations:
+    > python runbenchmark.py myframework -f 0 -Xtest_mode
+ 1. fix the module integration code until the test produces all results with no error (if the integration generated an error, it is visible in the results).
+ 1. if this works, validate it against the `validation` benchmark using one fold:
+    > python runbenchmark.py myframework validation 1h4c -f 0 -Xtest_mode
+ 1. if this works, try to run it in docker to validate the docker image setup:
+    > python runbenchmark.py myframework -m docker
+ 1. if this works, try to run it in AWS:
+    > python runbenchmark.py myframework -m aws
+ 1. add a brief description of the framework to the documentation in [docs/website/frameworks.html](GITHUB/docs/website/frameworks.html) following the same formatting as the other entries.
+ 1. 
create a pull request, and ask a review from authors of `automlbenchmark`: they'll also be happy to help you during this integration. + +#### Add a custom framework + +You may want to integrate a framework without wanting to make this publicly available. + +In this case, as we've seen above, there's always the possibility to integrate your framework in a custom `user_dir`. + +Using the instructions above: + + 1. define what is (or will be) your custom `user_dir` for this framework. + 1. ensure it contains a `config.yaml`, otherwise create one (for example copy [this one](../using/configuration.md#custom-configurations) or `examples/custom/config.yaml`). + 1. create the framework module somewhere under this `user_dir`, e.g. `{user_dir}/extensions/MyFramework`. + 1. define the module in `{user_dir}/frameworks.yaml` (create the file if needed). + 1. follow the same steps as for a "default" framework to implement the integration: setup, test, ... except that you always need to specify the `user_dir`, e.g. for testing: + > python runbenchmark.py myframework -f 0 -u {user_dir} + 1. there may be some issues when trying to build the docker image when the framework is in a custom folder, as all the files should be under the docker build context: solving this probably requires a multi-stage build, needs more investigation. For now, if you really need a docker image, you can either build it manually, or simply copy the `extensions` folder temporarily under `automlbenchmark`. + 1. even without docker image, you can run the framework on AWS, as soon as the custom `config.yaml`, `frameworks.yaml` and `extensions` folder are made available as AWS resources: cf. again the [custom configuration](../using/configuration.md#custom-configurations). The application will copy those files to the EC2 instances into a local `user_dir` and will be able to setup the framework there. + + +## Using a Different Hyperparameter Configuration + +When you want to use an existing framework integration with a different hyperparameter +configuration, it is often enough to write only a custom framework definition without +further changes. + +Framework definitions accept a `params` dictionary for pass-through parameters, +i.e., parameters that are directly accessible from the `exec.py` file in the framework +integration executing the AutoML training. *Most* integration scripts use this to +overwrite any (default) hyperparameter value. Use the `extends` field to indicate +which framework definition to copy default values from, and then add any fields to +overwrite. In the example below the `n_estimators` and `verbose` params are passed +directly to the `RandomForestClassifier`, which will now train only 200 trees +(default is 2000): + +```yaml +RandomForest_custom: + extends: RandomForest + params: + n_estimators: 200 + verbose: true +``` + +This new definition can be used as normal: +``` +python runbenchmark.py randomforest_custom ... +``` + +!!! note + By convention, param names starting with `_` are filtered out (they are not passed + to the framework) but are used for custom logic in the `exec.py`. For example, the + `_save_artifact` field is often used to allow additional artifacts, such as logs or + models, to be saved. diff --git a/docs/extending/index.md b/docs/extending/index.md new file mode 100644 index 000000000..54f1a4939 --- /dev/null +++ b/docs/extending/index.md @@ -0,0 +1,8 @@ +# Extending the Benchmark Tool + +You can extend the benchmark tool in multiple ways. 
+[Benchmarks](benchmark.md) define collections of tasks on which to evaluate AutoML
+frameworks. [Constraints](constraint.md) specify the resource constraints forwarded
+to the AutoML framework, such as a time or memory limit. Finally, it is possible to
+[add AutoML frameworks](framework.md#add-a-custom-framework) or to
+[use an integrated AutoML framework with non-default configuration](framework.md#using-a-different-hyperparameter-configuration).
\ No newline at end of file
diff --git a/docs/faq.md b/docs/faq.md
new file mode 100644
index 000000000..481bb4b0d
--- /dev/null
+++ b/docs/faq.md
@@ -0,0 +1,40 @@
+# Frequently Asked Questions
+
+If your question is not answered here, please check our Github [issue tracker](https://github.com/openml/automlbenchmark/issues) and [discussion board](https://github.com/openml/automlbenchmark/discussions).
+If you still cannot find an answer, please [open a Q&A discussion on Github](https://github.com/openml/automlbenchmark/discussions/new?category=q-a).
+
+## (When) will you add framework X?
+
+We are currently not focused on integrating additional AutoML systems.
+However, we process any pull requests that add frameworks and will assist with the integration.
+The best way to make sure framework X gets included is to start with the integration
+yourself or encourage the package authors to do so. For technical details see
+[Adding an AutoML Framework](./extending/framework.md).
+
+It is also possible to open a Github issue indicating the framework you would like added.
+Please use a clear title (e.g. "Add framework: X") and provide some relevant information
+(e.g. a link to the documentation).
+This helps us keep track of which frameworks people are interested in seeing included.
+
+
+## Framework setup is not executed
+First, it is important to note that we only officially support Ubuntu 22.04 LTS,
+though other versions and MacOS generally work too. If that does not work, for
+example on Windows, use docker mode as per [the installation instructions](getting_started.md#installation).
+For MacOS, it may be required to have [brew](https://brew.sh) installed.
+
+If you are experiencing issues with the framework setup not being executed, please
+try the following steps before opening an issue:
+
+ - delete the `.marker_setup_safe_to_delete` file from the framework module and try to run
+   the benchmark again. This marker file is automatically created after a successful
+   setup to avoid having to execute it each time (the setup phase can be time-consuming);
+   the marker then prevents auto-setup, unless the `-s only` or `-s force` args described below are used.
+
+ - force the setup using the `--setup=only` arg on the command line. This forces the
+   setup to take place. If the setup is now done correctly, you can run the commands
+   as normal to start the benchmark. If not, continue.
+
+ - manually clean the installation files by deleting the `lib`, `venv` and `.setup` folders
+   in the given framework folder (e.g. `frameworks/MyFramework`), and try again.
+
diff --git a/docs/getting_started.md b/docs/getting_started.md
new file mode 100644
index 000000000..e1d8d02dc
--- /dev/null
+++ b/docs/getting_started.md
@@ -0,0 +1,267 @@
+---
+title: Getting Started
+description: A short tutorial on installing the software and running a simple benchmark.
+---
+
+# Getting Started
+
+The AutoML Benchmark is a tool for benchmarking AutoML frameworks on tabular data.
+It automates the installation of AutoML frameworks, passing them data, and evaluating
+their predictions.
+[Our paper](https://arxiv.org/pdf/2207.12560.pdf) describes the design and showcases +results from an evaluation using the benchmark. +This guide goes over the minimum steps needed to evaluate an +AutoML framework on a toy dataset. + +## Installation +These instructions assume that [Python 3.9 (or higher)](https://www.python.org/downloads/) +and [git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) are installed, +and are available under the alias `python` and `git`, respectively. We recommend +[Pyenv](https://github.com/pyenv/pyenv) for managing multiple Python installations, +if applicable. We support Ubuntu 22.04, but many linux and MacOS versions likely work +(for MacOS, it may be necessary to have [`brew`](https://brew.sh) installed). + +First, clone the repository: + +```bash +git clone https://github.com/openml/automlbenchmark.git --branch stable --depth 1 +cd automlbenchmark +``` + +Create a virtual environments to install the dependencies in: + +=== ":simple-linux: Linux" + + ```bash + python -m venv venv + source venv/bin/activate + ``` + +=== ":material-apple: MacOS" + + ```bash + python -m venv venv + source venv/bin/activate + ``` + +=== ":simple-windows: Windows" + + ```bash + python -m venv ./venv + venv/Scripts/activate + ``` + +Then install the dependencies: + +```bash +python -m pip install --upgrade pip +python -m pip install -r requirements.txt +``` + + +??? windows "Note for Windows users" + + The automated installation of AutoML frameworks is done using shell script, + which doesn't work on Windows. We recommend you use + [Docker](https://docs.docker.com/desktop/install/windows-install/) to run the + examples below. First, install and run `docker`. + Then, whenever there is a `python runbenchmark.py ...` + command in the tutorial, add `-m docker` to it (`python runbenchmark.py ... -m docker`). + +??? question "Problem with the installation?" + + On some platforms, we need to ensure that requirements are installed sequentially. + Use `xargs -L 1 python -m pip install < requirements.txt` to do so. If problems + persist, [open an issue](https://github.com/openml/automlbenchmark/issues/new) with + the error and information about your environment (OS, Python version, pip version). + + +## Running the Benchmark + +To run a benchmark call the `runbenchmark.py` script specifying the framework to evaluate. +See [integrated frameworks](WEBSITE/frameworks.html) for a list of supported frameworks, or the [adding a frameworking](extending/framework.md) page on how to add your own. + +### Example: a test run with Random Forest +Let's try evaluating the `RandomForest` baseline, which uses [scikit-learn](https://scikit-learn.org/stable/)'s random forest: + +=== ":simple-linux: Linux" + + ```bash + python runbenchmark.py randomforest + ``` + +=== ":material-apple: MacOS" + + ```bash + python runbenchmark.py randomforest + ``` + +=== ":simple-windows: Windows" + As noted above, we need to install the AutoML frameworks (and baselines) in + a container. Add `-m docker` to the command as shown: + ```bash + python runbenchmark.py randomforest -m docker + ``` + + !!! warning "Important" + Future example usages will only show invocations without `-m docker` mode, + but Windows users will need to run in some non-local mode. + +After running the command, there will be a lot of output to the screen that reports +on what is currently happening. 
After a few minutes final results are shown and should +look similar to this: + +``` +Summing up scores for current run: + id task fold framework constraint result metric duration seed +openml.org/t/3913 kc2 0 RandomForest test 0.865801 auc 11.1 851722466 +openml.org/t/3913 kc2 1 RandomForest test 0.857143 auc 9.1 851722467 + openml.org/t/59 iris 0 RandomForest test -0.120755 neg_logloss 8.7 851722466 + openml.org/t/59 iris 1 RandomForest test -0.027781 neg_logloss 8.5 851722467 +openml.org/t/2295 cholesterol 0 RandomForest test -44.220800 neg_rmse 8.7 851722466 +openml.org/t/2295 cholesterol 1 RandomForest test -55.216500 neg_rmse 8.7 851722467 +``` + +The result denotes the performance of the framework on the test data as measured by +the metric listed in the metric column. The result column always denotes performance +in a way where higher is better (metrics which normally observe "lower is better" are +converted, which can be observed from the `neg_` prefix). + +While running the command, the AutoML benchmark performed the following steps: + + 1. Create a new virtual environment for the Random Forest experiment. + This environment can be found in `frameworks/randomforest/venv` and will be re-used + when you perform other experiments with `RandomForest`. + 2. It downloaded datasets from [OpenML](https://www.openml.org) complete with a + "task definition" which specifies [cross-validation](https://scikit-learn.org/stable/modules/cross_validation.html) folds. + 3. It evaluated `RandomForest` on each (task, fold)-combination in a separate subprocess, where: + 1. The framework (`RandomForest`) is initialized. + 2. The training data is passed to the framework for training. + 3. The test data is passed to the framework to make predictions on. + 4. It passes the predictions back to the main process + 4. The predictions are evaluated and reported on. They are printed to the console and + are stored in the `results` directory. There you will find: + 1. `results/results.csv`: a file with all results from all benchmarks conducted on your machine. + 2. `results/randomforest.test.test.local.TIMESTAMP`: a directory with more information about the run, + such as logs, predictions, and possibly other artifacts. + +!!! info "Docker Mode" + + When using docker mode (with `-m docker`) a docker image will be made that contains + the virtual environment. Otherwise, it functions much the same way. + +### Important Parameters + +As you can see from the results above, the default behavior is to execute a short test +benchmark. However, we can specify a different benchmark, provide different constraints, +and even run the experiment in a container or on AWS. There are many parameters +for the `runbenchmark.py` script, but the most important ones are: + +`Framework (required)` + +: The AutoML framework or baseline to evaluate and is not case-sensitive. See + [integrated frameworks](WEBSITE/frameworks.html) for a list of supported frameworks. + In the above example, this benchmarked framework `randomforest`. + +`Benchmark (optional, default='test')` + +: The benchmark suite is the dataset or set of datasets to evaluate the framework on. + These can be defined as on [OpenML](https://www.openml.org) as a [study or task](extending/benchmark.md#defining-a-benchmark-on-openml) + (formatted as `openml/s/X` or `openml/t/Y` respectively) or in a [local file](extending/benchmark.md#defining-a-benchmark-with-a-file). + The default is a short evaluation on two folds of `iris`, `kc2`, and `cholesterol`. 
+ +`Constraints (optional, default='test')` + +: The constraints applied to the benchmark as defined by default in [constraints.yaml](GITHUB/resources/constraints.yaml). + These include time constraints, memory constrains, the number of available cpu cores, and more. + Default constraint is `test` (2 folds for 10 min each). + + !!! warning "Constraints are not enforced!" + These constraints are forwarded to the AutoML framework if possible but, except for + runtime constraints, are generally not enforced. It is advised when benchmarking + to use an environment that mimics the given constraints. + + ??? info "Constraints can be overriden by `benchmark`" + A benchmark definition can override constraints on a task level. + This is useful if you want to define a benchmark which has different constraints + for different tasks. The default "test" benchmark does this to limit runtime to + 60 seconds instead of 600 seconds, which is useful to get quick results for its + small datasets. For more information, see [defining a benchmark](#ADD-link-to-adding-benchmark). + +`Mode (optional, default='local')` + +: The benchmark can be run in four modes: + + * `local`: install a local virtual environment and run the benchmark on your machine. + * `docker`: create a docker image with the virtual environment and run the benchmark in a container on your machine. + If a local or remote image already exists, that will be used instead. Requires [Docker](https://docs.docker.com/desktop/). + * `singularity`: create a singularity image with the virtual environment and run the benchmark in a container on your machine. Requires [Singularity](https://docs.sylabs.io/guides/3.5/user-guide/introduction.html). + * `aws`: run the benchmark on [AWS EC2](https://aws.amazon.com/free/?trk=b3f93e34-c1e0-4aa9-95f8-6d2c36891d8a&sc_channel=ps&ef_id=CjwKCAjw-7OlBhB8EiwAnoOEk0li05IUgU9Ok2uCdejP22Yr7ZuqtMeJZAdxgL5KZFaeOVskCAsknhoCSjUQAvD_BwE:G:s&s_kwcid=AL!4422!3!649687387631!e!!g!!aws%20ec2!19738730094!148084749082&all-free-tier.sort-by=item.additionalFields.SortRank&all-free-tier.sort-order=asc&awsf.Free%20Tier%20Types=*all&awsf.Free%20Tier%20Categories=*all) instances. + It is possible to run directly on the instance or have the EC2 instance run in `docker` mode. + Requires valid AWS credentials to be configured, for more information see [Running on AWS](#ADD-link-to-aws-guide). + + +For a full list of parameters available, run: + +``` +python runbenchmark.py --help +``` + +### Example: AutoML on a specific task and fold + +The defaults are very useful for performing a quick test, as the datasets are small +and cover different task types (binary classification, multiclass classification, and +regression). We also have a ["validation" benchmark](GITHUB/resources/benchmarks/validation.yaml) +suite for more elaborate testing that also includes missing data, categorical data, +wide data, and more. The benchmark defines 9 tasks, and evaluating two folds with a +10-minute time constraint would take roughly 3 hours (=9 tasks * 2 folds * 10 minutes, +plus overhead). Let's instead use the `--task` and `--fold` parameters to run only a +specific task and fold in the `benchmark` when evaluating the +[flaml](https://microsoft.github.io/FLAML/) AutoML framework: + +``` +python runbenchmark.py flaml validation test -t eucalyptus -f 0 +``` + +This should take about 10 minutes plus the time it takes to install `flaml`. 
+Results should look roughly like this: + +``` +Processing results for flaml.validation.test.local.20230711T122823 +Summing up scores for current run: + id task fold framework constraint result metric duration seed +openml.org/t/2079 eucalyptus 0 flaml test -0.702976 neg_logloss 611.0 1385946458 +``` + +Similarly to the test run, you will find additional files in the `results` directory. + + +### Example: Benchmarks on OpenML + +In the previous examples, we used benchmarks which were defined in a local file +([test.yaml](GITHUB/resources/benchmarks/test.yaml) and +[validation.yaml](GITHUB/resources/benchmarks/validation.yaml), respectively). +However, we can also use tasks and +benchmarking suites defined on OpenML directly from the command line. When referencing +an OpenML task or suite, we can use `openml/t/ID` or `openml/s/ID` respectively as +argument for the benchmark parameter. Running on the [iris task](https://openml.org/t/59): + +``` +python runbenchmark.py randomforest openml/t/59 +``` + +or on the entire [AutoML benchmark classification suite](https://openml.org/s/271) (this will take hours!): + +``` +python runbenchmark.py randomforest openml/s/271 +``` + +!!! info "Large-scale Benchmarking" + + For large scale benchmarking it is advised to parallelize your experiments, + as otherwise it may take months to run the experiments. + The benchmark currently only supports native parallelization in `aws` mode + (by using the `--parallel` parameter), but using the `--task` and `--fold` parameters + it is easy to generate scripts that invoke individual jobs on e.g., a SLURM cluster. + When you run in any parallelized fashion, it is advised to run each process on + separate hardware to ensure experiments can not interfere with each other. diff --git a/docs/index.md b/docs/index.md index d884d6b24..82ea1189c 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,18 +1,21 @@ ---- -layout: index -title: Home ---- +# AutoML Benchmark -# An Open Source AutoML Benchmark +These are the AutoML Benchmark documentation pages with information on how to +configure and use the AutoML Benchmark tool. For first time users, we advise +visiting the [getting started](getting_started.md) page. -This the homepage for the open and extensible AutoML Benchmark. -The AutoML Benchmark provides an overview and comparison of open-source AutoML systems. -It is *open* because the benchmark infrastructure is [open-source](https://github.com/openml/automlbenchmark/) -and *extensible* because you can [add your own](extending.md) problems and datasets. +This documentation is accompanied by [our website](WEBSITE) +which has information on [our papers](WEBSITE/papers.html), +[integrated frameworks](WEBSITE/frameworks.html), +and [evaluation results](WEBSITE/results.html). -A brief overview and further references for each AutoML system can be found on the [AutoML systems](automl_overview.md) page. -For a thorough explanation of the benchmark, and evaluation of results, you can read our [paper](paper.md). -If you want to analyze the results yourself, you can do this on the [results](results.md) pages. +!!! note "Help Wanted!" -Because the benchmark infrastructure is open-source, you can rerun the benchmark yourself, use custom datasets or your own AutoML platform as explained in our [project documentation](documentation.md). -We also invite you to [submit your own AutoML](documentation.md) system to be evaluated against the benchmark and included in the overview. 
+ We recently switched to generating doc pages with `mkdocs-material`. In the process, + we did our best to make sure to use the additional functionalities to better + present the information and make it easy to find through notes, tabs, and other features. + + It is possible to find parts of the documentation are not updated (correctly), or + are not clear. We welcome all help to improve the documentation. If you have a + suggestion on how to improve the documentation, please open an issue. If you find + an error, please open an issue or open a pull request directly. Thanks! :pray: \ No newline at end of file diff --git a/docs/modifications.md b/docs/modifications.md deleted file mode 100644 index c90dff209..000000000 --- a/docs/modifications.md +++ /dev/null @@ -1,42 +0,0 @@ ---- ---- -# Required Modifications - -Each method is given, unless otherwise specified or unavailable, information about resources: -* Memory -* Runtime -* Number of cores - -And, additionally: -* Metric to optimize - -## auto-sklearn - -### Data preprocessing -Encode string data to numeric (labelencoding). - -### Non-default arguments - -## Auto-WEKA -logloss metric is specified as kBInformation. - -### Data preprocessing -None, ARFF file used directly. -Output is rewritten so it fits `docker/common/evaluate.py` expectations. - -### Non-default arguments - -## H2O AutoML - -### Data preprocessing -None, ARFF file used directly. - -### Non-default arguments - -## TPOT - -### Data preprocessing -Encode string data to numeric (labelencoding). - -### Non-default arguments - diff --git a/docs/paper.md b/docs/paper.md deleted file mode 100644 index 8011f3628..000000000 --- a/docs/paper.md +++ /dev/null @@ -1,17 +0,0 @@ ---- -layout: category -title: Paper -sidebar_sort_order: 9 ---- -[PDF](https://www.automl.org/wp-content/uploads/2019/06/automlws2019_Paper45.pdf) \| [arXiv](https://arxiv.org/abs/1907.00909) \| [BibTeX](bib_workshop.md) - -> First look of the benchmark submitted to [AutoML Workshop at ICML 2019](https://sites.google.com/view/automl2019icml). - -**abstract:** In recent years, an active field of research has developed around automated machine learning(AutoML). -Unfortunately, comparing different AutoML systems is hard and often done in correctly. -We introduce an open, ongoing, and extensible benchmark framework which follows best practices and avoids common mistakes. -The framework is open-source, uses public datasets and has a website with up-to-date results. -We use the framework to conduct a thorough comparison of 4 AutoML systems across 39 datasets and analyze the results. - ---- - diff --git a/docs/results.md b/docs/results.md deleted file mode 100644 index 7c9aa541a..000000000 --- a/docs/results.md +++ /dev/null @@ -1,40 +0,0 @@ ---- -layout: page -title: Results -sidebar_link: true -sidebar_sort_order: 1 ---- - -### Complete Results -[Complete results][reports] are also available in [csv] format or as simple [visualizations] for now. -We hope to provide interactive visualization in the future. - -### Binary Results -A sample of the results obtained by running each framework over 10 folds for various durations each: for binary tasks, the plotted metric is AUC. -Smaller and medium datasets were trained for 1h and 4h. Larger datasets have been trained for 4h and 8h. 
-
-![Binary Results Stripplot 1h][binary_1h]
-
-![Binary Results Stripplot 4h][binary_4h]
-
-![Binary Results Stripplot 8h][binary_8h]
-
-### Multiclass Results
-A sample of the results obtained by running each framework over 10 folds for various durations each: for multiclass tasks, the plotted metric is logloss.
-Smaller and medium datasets were trained for 1h and 4h. Larger datasets have been trained for 4h and 8h.
-
-![Multiclass Results Stripplot 1h][multiclass_1h]
-
-![Multiclass Results Stripplot 4h][multiclass_4h]
-
-![Multiclass Results Stripplot 8h][multiclass_8h]
-
-[binary_1h]:https://raw.github.com/openml/automlbenchmark/master/reports/graphics/1h/binary_results_stripplot.png
-[multiclass_1h]:https://raw.github.com/openml/automlbenchmark/master/reports/graphics/1h/multiclass_results_stripplot.png
-[binary_4h]:https://raw.github.com/openml/automlbenchmark/master/reports/graphics/4h/binary_results_stripplot.png
-[multiclass_4h]:https://raw.github.com/openml/automlbenchmark/master/reports/graphics/4h/multiclass_results_stripplot.png
-[binary_8h]:https://raw.github.com/openml/automlbenchmark/master/reports/graphics/8h/binary_results_stripplot.png
-[multiclass_8h]:https://raw.github.com/openml/automlbenchmark/master/reports/graphics/8h/multiclass_results_stripplot.png
-[reports]:https://github.com/openml/automlbenchmark/tree/master/reports
-[csv]:https://github.com/openml/automlbenchmark/tree/master/reports/tables
-[visualizations]:https://github.com/openml/automlbenchmark/tree/master/reports/graphics
\ No newline at end of file
diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css
new file mode 100644
index 000000000..7727c4398
--- /dev/null
+++ b/docs/stylesheets/extra.css
@@ -0,0 +1,25 @@
+:root {
+    --md-primary-fg-color: #1971c2;
+    --md-primary-fg-color--light: white;
+    --md-primary-fg-color--dark: #90030C;
+
+    --md-admonition-icon--windows: url('data:image/svg+xml;charset=utf-8,')
+}
+.md-typeset .admonition.windows,
+.md-typeset details.windows {
+    border-color: rgb(25, 113, 194);
+}
+.md-typeset .windows > .admonition-title,
+.md-typeset .windows > summary {
+    background-color: rgba(25, 113, 194, 0.1);
+}
+.md-typeset .windows > .admonition-title::before,
+.md-typeset .windows > summary::before {
+    background-color: rgb(25, 113, 194);
+    -webkit-mask-image: var(--md-admonition-icon--windows);
+    mask-image: var(--md-admonition-icon--windows);
+}
+
+.md-typeset .limit_max_height code {
+    max-height: 20rem;
+}
\ No newline at end of file
diff --git a/docs/using/aws.md b/docs/using/aws.md
new file mode 100644
index 000000000..1e9c76a2e
--- /dev/null
+++ b/docs/using/aws.md
@@ -0,0 +1,162 @@
+# AWS
+
+The AutoML benchmark supports running experiments on [AWS EC2](https://aws.amazon.com/ec2/).
+
+!!! danger "AMLB does not limit expenses!"
+
+    The AWS integration lets you easily conduct massively parallel evaluations.
+    The AutoML Benchmark does not in any way restrict the _total_ costs you can incur on AWS.
+    However, there are some tips for [reducing costs](#reducing-costs).
+
+    ??? danger "Example Costs"
+
+        For example, benchmarking one framework on the classification and regression suites
+        on a one hour budget takes 1 hour * 10 folds * 100 datasets = 1,000 hours, plus
+        overhead. Even when using spot instance pricing on `m5.2xlarge` instances (the default),
+        this probably costs at least $100 US (costs depend on overhead and fluctuating prices).
+        A full evaluation with multiple frameworks and/or time budgets can cost
+        thousands of dollars.
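+
+A quick back-of-the-envelope calculation can help before launching a large run. The sketch below is illustrative only: the overhead factor and the spot price are assumptions, so check current EC2 pricing for your region.
+
+```python
+# Rough cost estimate for one framework on a 1h budget (illustrative numbers only).
+frameworks = 1
+datasets = 100
+folds = 10
+hours_per_job = 1.0
+overhead_factor = 1.2          # assumed extra time for setup, data transfer, retries
+spot_price_per_hour = 0.15     # assumed m5.2xlarge spot price in USD; check current pricing
+
+instance_hours = frameworks * datasets * folds * hours_per_job * overhead_factor
+estimated_cost = instance_hours * spot_price_per_hour
+print(f"~{instance_hours:.0f} instance-hours, ~${estimated_cost:.0f} (excluding storage and transfer)")
+```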
+
+
+## Setup
+
+To run a benchmark on AWS, you additionally need a configured AWS account.
+The application uses the [boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html)
+Python package to exchange files through S3 and create EC2 instances.
+
+If this is your first time setting up your AWS account on the machine that will run the
+`automlbenchmark` app, you can use the [AWS CLI](http://aws.amazon.com/cli/) tool and run:
+    ```bash
+    aws configure
+    ```
+You will need your AWS Access Key ID and AWS Secret Access Key, and you will need to pick a default [EC2 region](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-regions-availability-zones.html#concepts-available-regions).
+
+!!! note "Selecting a Region"
+    To use a region, an AMI must be configured in the AutoML benchmark configuration file
+    under `aws.ec2.regions`. The default configuration has AMIs for `us-east-1`,
+    `us-east-2`, `us-west-1`, `eu-west-1`, and `eu-central-1`. If your default EC2
+    region is different from these, you will need to add the AMI to your [custom configuration](configuration.md#custom-configurations).
+
+On first use, it is recommended to use the following configuration file, or to extend
+your custom configuration file with these options. Follow the instructions in the file
+and make any necessary adjustments before running the benchmark.
+
+```yaml title="Starting AWS Configuration"
+--8<-- "examples/aws/config.yaml"
+```
+
+To run a test to see if the benchmark framework is working on AWS, do the following:
+
+```bash
+python3 runbenchmark.py constantpredictor test -m aws
+```
+
+This will create and start an EC2 instance for each benchmark job and run the 6 jobs
+(3 OpenML tasks * 2 folds) from the `test` benchmark sequentially.
+In this case, each job is constrained to a one-minute time limit, excluding setup
+time for the EC2 instances (though `constantpredictor` will likely only take seconds).
+
+For longer benchmarks, you'll probably want to run multiple jobs in parallel and
+distribute the work to several EC2 instances, for example:
+```bash
+python3 runbenchmark.py autosklearn validation 1h4c -m aws -p 4
+```
+will keep 4 EC2 instances running, monitor them in a dedicated thread, and finally collect all outputs from S3.
+
+??? info "EC2 instances are always stopped eventually (by default)"
+
+    Each EC2 instance is given a time limit at startup to ensure that the instance is
+    stopped in any case, even if there is an issue while running the benchmark task.
+    In that case the instance is stopped, not terminated, and we can therefore inspect
+    the machine manually (ideally after resetting its UserData field to avoid
+    re-triggering the benchmark on the next startup).
+
+The console output still shows the instances starting, reports the progress, and then
+the results for each dataset/fold combination (the log excerpt below is from a different command):
+
+```{.text .limit_max_height title="Example output benchmarking H2O on AWS"}
+Running `H2OAutoML_nightly` on `validation` benchmarks in `aws` mode!
+Loading frameworks definitions from ['/Users/me/repos/automlbenchmark/resources/frameworks.yaml'].
+Loading benchmark definitions from /Users/me/repos/automlbenchmark/resources/benchmarks/validationt.yaml.
+Uploading `/Users/me/repos/automlbenchmark/resources/benchmarks/validation.yaml` to `ec2/input/validation.yaml` on s3 bucket automl-benchmark.
+...
+Starting new EC2 instance with params: H2OAutoML_nightly /s3bucket/input/validation.yaml -t micro-mass -f 0 +Started EC2 instance i-0cd081efc97c3bf6f +[2019-01-22T11:51:32] checking job aws_validation_micro-mass_0_H2OAutoML_nightly on instance i-0cd081efc97c3bf6f: pending +Starting new EC2 instance with params: H2OAutoML_nightly /s3bucket/input/validation.yaml -t micro-mass -f 1 +Started EC2 instance i-0251c1655e286897c +... +[2019-01-22T12:00:32] checking job aws_validation_micro-mass_1_H2OAutoML_nightly on instance i-0251c1655e286897c: running +[2019-01-22T12:00:33] checking job aws_validation_micro-mass_0_H2OAutoML_nightly on instance i-0cd081efc97c3bf6f: running +[2019-01-22T12:00:48] checking job aws_validation_micro-mass_1_H2OAutoML_nightly on instance i-0251c1655e286897c: running +[2019-01-22T12:00:48] checking job aws_validation_micro-mass_0_H2OAutoML_nightly on instance i-0cd081efc97c3bf6f: running +... +[ 731.511738] cloud-init[1521]: Predictions saved to /s3bucket/output/predictions/h2oautoml_nightly_micro-mass_0.csv +[ 731.512132] cloud-init[1521]: H2O session _sid_96e7 closed. +[ 731.512506] cloud-init[1521]: Loading predictions from /s3bucket/output/predictions/h2oautoml_nightly_micro-mass_0.csv +[ 731.512890] cloud-init[1521]: Metric scores: {'framework': 'H2OAutoML_nightly', 'version': 'nightly', 'task': 'micro-mass', 'fold': 0, 'mode': 'local', 'utc': '2019-01-22T12:00:02', 'logloss': 0.6498889633819804, 'acc': 0.8793103448275862, 'result': 0.6498889633819804} +[ 731.513275] cloud-init[1521]: Job local_micro-mass_0_H2OAutoML_nightly executed in 608.534 seconds +[ 731.513662] cloud-init[1521]: All jobs executed in 608.534 seconds +[ 731.514089] cloud-init[1521]: Scores saved to /s3bucket/output/scores/H2OAutoML_nightly_task_micro-mass.csv +[ 731.514542] cloud-init[1521]: Loaded scores from /s3bucket/output/scores/results.csv +[ 731.515006] cloud-init[1521]: Scores saved to /s3bucket/output/scores/results.csv +[ 731.515357] cloud-init[1521]: Summing up scores for current run: +[ 731.515782] cloud-init[1521]: task framework ... acc logloss +[ 731.516228] cloud-init[1521]: 0 micro-mass H2OAutoML_nightly ... 0.87931 0.649889 +[ 731.516671] cloud-init[1521]: [1 rows x 9 columns] +... +EC2 instance i-0cd081efc97c3bf6f is stopped +Job aws_validation_micro-mass_0_H2OAutoML_nightly executed in 819.305 seconds +[2019-01-22T12:01:34] checking job aws_validation_micro-mass_1_H2OAutoML_nightly on instance i-0251c1655e286897c: running +[2019-01-22T12:01:49] checking job aws_validation_micro-mass_1_H2OAutoML_nightly on instance i-0251c1655e286897c: running +EC2 instance i-0251c1655e286897c is stopping +Job aws_validation_micro-mass_1_H2OAutoML_nightly executed in 818.463 seconds +... +Terminating EC2 instances i-0251c1655e286897c +Terminated EC2 instances i-0251c1655e286897c with response {'TerminatingInstances': [{'CurrentState': {'Code': 32, 'Name': 'shutting-down'}, 'InstanceId': 'i-0251c1655e286897c', 'PreviousState': {'Code': 64, 'Name': 'stopping'}}], 'ResponseMetadata': {'RequestId': 'd09eeb0c-7a58-4cde-8f8b-2308a371a801', 'HTTPStatusCode': 200, 'HTTPHeaders': {'content-type': 'text/xml;charset=UTF-8', 'transfer-encoding': 'chunked', 'vary': 'Accept-Encoding', 'date': 'Tue, 22 Jan 2019 12:01:53 GMT', 'server': 'AmazonEC2'}, 'RetryAttempts': 0}} +Instance i-0251c1655e286897c state: shutting-down +All jobs executed in 2376.891 seconds +Deleting uploaded resources `['ec2/input/validation.yaml', 'ec2/input/config.yaml', 'ec2/input/frameworks.yaml']` from s3 bucket automl-benchmark. 
+``` + + +## Configurable AWS Options + +When using AWS mode, the application will use `on-demand` EC2 instances from the `m5` +series by default. However, it is also possible to use `Spot` instances, specify a +`max_hourly_price`, or customize your experience when using this mode in general. +All configuration points are grouped and documented under the `aws` yaml namespace in +the main [config](GITHUB/resources/config.yaml) file. +When setting your own configuration, it is strongly recommended to first create your +own `config.yaml` file as described in [Custom configuration](configuration.md#custom-configurations). +Here is an example of a config file using Spot instances on a non-default region: +```yaml + +aws: + region: 'us-east-1' + resource_files: + - '{user}/config.yaml' + - '{user}/frameworks.yaml' + + ec2: + subnet_id: subnet-123456789 # subnet for account on us-east-1 region + spot: + enabled: true + max_hourly_price: 0.40 # comment out to use default +``` + +### Reducing Costs + +The most important thing you can do to reduce costs is to critically evaluate which +experimental results can be re-used from previous publications. That said, when +conducting new experiments on AWS we have the following recommendations to reduce costs: + + - Use spot instances with a fixed maximum price: set `aws.ec2.spot.enabled: true` and `aws.ec2.spot.max_hourly_price`. + Check which region has [the lowest spot instance prices](https://aws.amazon.com/ec2/spot/) + and configure `aws.region` accordingly. + - Skip the framework installation process by providing a docker image and setting `aws.docker_enabled: true`. + - Set up [AWS Budgets](https://aws.amazon.com/aws-cost-management/aws-budgets/) + to get alerts early if forecasted usage exceeds the budget. It should also be + technically possibly to automatically shut down all running instances in a region + if a budget is exceeded, but this naturally leads to a loss of experimental results, so + it is best avoided. \ No newline at end of file diff --git a/docs/using/configuration.md b/docs/using/configuration.md new file mode 100644 index 000000000..e95350c91 --- /dev/null +++ b/docs/using/configuration.md @@ -0,0 +1,51 @@ +# Configuration + +The AutoML benchmark has a host of settings that can be configured from a `yaml` file. +It is possible to write your own configuration file that overrides the default behavior +in a flexible manner. + +## Configuration Options + +The default configuration options can be found in the +[`resources/config.yaml`](GITHUB/resources/config.yaml) file. + +```{ .yaml title="resources/config.yaml" .limit_max_height } +--8<-- "resources/config.yaml" +``` + +## Custom Configurations +To override default configuration, create your custom `config.yaml` file under the +`user_dir` (specified by the `--userdir` parameter of `runbenchmark.py`). +The application will automatically load this custom file and apply it on top of the defaults. + +When specifying filepaths, configurations support the following placeholders: + +| Placeholder | Replaced By Value Of | Default | Function | +|:------------|:---------------------|:----------------------------|:-----------------------------------------------------------------------| +| `{input}` | `input_dir` | `~/.openml/cache` | Folder from which datasets are loaded (and/or downloaded) | +| `{output}` | `output_dir` | `./results` | Folder where all outputs (results, logs, predictions, ...) are stored. 
| +| `{user}` | `user_dir` | `~/.config/automlbenchmark` | Folder containing custom configuration files. | +| `{root}` | `root_dir` | Detected at runtime | The root folder of the `automlbenchmark` application. | + +For example, including the following snippet in your custom configuration when +`user_dir` is `~/.config/automlbenchmark` (which it is by default) changes your +input directory to `~/.config/automlbenchmark/data` : + +```yaml title="examples/custom/config.yaml" +--8<-- "examples/custom/config.yaml:6:7" +``` + +!!! tip "Multiple Configuration Files" + It is possible to have multiple configuration files: + just create a folder for each `config.yaml` file and use that folder as your + `user_dir` using `--userdir /path/to/config/folder` when invoking `runbenchmark.py`. + + +Below is an example of a configuration file which **1.** changes the directory the +datasets are loaded from, **2.** specifies additional paths to look up framework, +benchmark, and constraint definitions, **3.** also makes those available in an S3 bucket +when running in AWS mode, and **4.** changes the default EC2 instance type for AWS mode. + +```yaml title="examples/custom/config.yaml" +--8<-- "examples/custom/config.yaml:3" +``` diff --git a/docs/using/parameters.md b/docs/using/parameters.md new file mode 100644 index 000000000..e0494e8db --- /dev/null +++ b/docs/using/parameters.md @@ -0,0 +1,94 @@ +# Parameters of `runbenchmark.py` + +The parameters of the `runbenchmark.py` script can be shown with: + +```{ .text title="python runbenchmark.py --help" .limit_max_height } +usage: runbenchmark.py [-h] [-m {local,aws,docker,singularity}] [-t [task_id ...]] [-f [fold_num ...]] [-i input_dir] [-o output_dir] [-u user_dir] [-p parallel_jobs] [-s {auto,skip,force,only}] [-k [true|false]] + [-e] [--logging LOGGING] [--openml-run-tag OPENML_RUN_TAG] + framework [benchmark] [constraint] + +positional arguments: + framework The framework to evaluate as defined by default in resources/frameworks.yaml. + To use a labelled framework (i.e. a framework defined in resources/frameworks-{label}.yaml), + use the syntax {framework}:{label}. + benchmark The benchmark type to run as defined by default in resources/benchmarks/{benchmark}.yaml, + a path to a benchmark description file, or an openml suite or task. + OpenML references should be formatted as 'openml/s/X' and 'openml/t/Y', + for studies and tasks respectively. Use 'test.openml/s/X' for the + OpenML test server. + (default: 'test') + constraint The constraint definition to use as defined by default in resources/constraints.yaml. + (default: 'test') + +optional arguments: + -h, --help show this help message and exit + -m {local,aws,docker,singularity}, --mode {local,aws,docker,singularity} + The mode that specifies how/where the benchmark tasks will be running. + (default: 'local') + -t [task_id ...], --task [task_id ...] + The specific task name (as defined in the benchmark file) to run. + When an OpenML reference is used as benchmark, the dataset name should be used instead. + If not provided, then all tasks from the benchmark will be run. + -f [fold_num ...], --fold [fold_num ...] + If task is provided, the specific fold(s) to run. + If fold is not provided, then all folds from the task definition will be run. + -i input_dir, --indir input_dir + Folder from where the datasets are loaded by default. 
+ (default: '/Users/pietergijsbers/.openml') + -o output_dir, --outdir output_dir + Folder where all the outputs should be written.(default: '/Users/pietergijsbers/repositories/forks/automlbenchmark-fork/results') + -u user_dir, --userdir user_dir + Folder where all the customizations are stored.(default: '~/.config/automlbenchmark') + -p parallel_jobs, --parallel parallel_jobs + The number of jobs (i.e. tasks or folds) that can run in parallel. + A hard limit is defined by property `job_scheduler.max_parallel_jobs` + in `resources/config.yaml`. + Override this limit in your custom `config.yaml` file if needed. + Supported only in aws mode or container mode (docker, singularity). + (default: 1) + -s {auto,skip,force,only}, --setup {auto,skip,force,only} + Framework/platform setup mode. Available values are: + • auto: setup is executed only if strictly necessary. + • skip: setup is skipped. + • force: setup is always executed before the benchmark. + • only: only setup is executed (no benchmark). + (default: 'auto') + -k [true|false], --keep-scores [true|false] + Set to true (default) to save/add scores in output directory. + -e, --exit-on-error If set, terminates on the first task that does not complete with a model. + --logging LOGGING Set the log levels for the 3 available loggers: + • console + • app: for the log file including only logs from amlb (.log extension). + • root: for the log file including logs from libraries (.full.log extension). + Accepted values for each logger are: notset, debug, info, warning, error, fatal, critical. + Examples: + --logging=info (applies the same level to all loggers) + --logging=root:debug (keeps defaults for non-specified loggers) + --logging=console:warning,app:info + (default: 'console:info,app:debug,root:info') + --openml-run-tag OPENML_RUN_TAG + Tag that will be saved in metadata and OpenML runs created during upload, must match '([a-zA-Z0-9_\-\.])+'. +``` + + +## Profiling the application + +Currently, the application provides a global flag `--profiling` to activate profiling +for some specific methods that can be slow or memory intensive: + +```bash +python runbenchmark.py randomforest --profiling +``` + +Not all methods and functions are not profiled, but if you need to profile more, +you just need to decorate the function with the `@profile()` decorator (from `amlb.utils`). +Profiling reports on memory usage and function durations: + +```{ .text title="Example of profiling logs" } +[PROFILING] `amlb.datasets.openml.OpenmlLoader.load` executed in 7.456s. +[PROFILING] `amlb.datasets.openml.OpenmlDatasplit.data` returned object size: 45.756 MB. +[PROFILING] `amlb.datasets.openml.OpenmlDatasplit.data` memory change; process: +241.09 MB/379.51 MB, resident: +241.09 MB/418.00 MB, virtual: +230.01 MB/4918.16 MB. +[PROFILING] `amlb.data.Datasplit.X_enc` executed in 6.570s. +[PROFILING] `amlb.data.Datasplit.release` executed in 0.007s. +[PROFILING] `amlb.data.Datasplit.release` memory change; process: -45.73 MB/238.80 MB, resident: +0.00 MB/414.60 MB, virtual: +0.00 MB/4914.25 MB. +``` \ No newline at end of file diff --git a/docs/using/result_analysis.md b/docs/using/result_analysis.md new file mode 100644 index 000000000..24d8d9a06 --- /dev/null +++ b/docs/using/result_analysis.md @@ -0,0 +1,217 @@ +# Results + +The AutoML benchmark produces many result files, such as logs, performance records, +and meta-data of the experiments. Some of these files can also be automatically parsed +and visualized by notebooks we provide. 
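+
+If you prefer to explore the results programmatically rather than through the notebooks, the main summary file is plain CSV and loads directly into pandas. Below is a minimal sketch, assuming the default `results/results.csv` location and the columns described in the following sections.
+
+```python
+import pandas as pd
+
+# Load the aggregated results file produced by runbenchmark.py (default output_dir).
+results = pd.read_csv("results/results.csv")
+
+# Average the optimized metric per framework and task over the folds that completed.
+summary = (
+    results.dropna(subset=["result"])
+           .groupby(["framework", "task"])["result"]
+           .mean()
+           .unstack("task")
+)
+print(summary)
+```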
+ +## Output File Structure + +Except the logs, all the files generated by the application are in easy to process +`csv` or `json` format, and they are all generated in a subfolder of the `output_dir` +unique for each benchmark run. + +For example: +```text +results/randomforest.test.test.local.20201204T192714 +|-- predictions +| |-- cholesterol +| | |-- 0 +| | | |-- metadata.json +| | | `-- predictions.csv +| | `-- 1 +| | |-- metadata.json +| | `-- predictions.csv +| |-- iris +| | |-- 0 +| | | |-- metadata.json +| | | `-- predictions.csv +| | `-- 1 +| | |-- metadata.json +| | `-- predictions.csv +| `-- kc2 +| |-- 0 +| | |-- metadata.json +| | `-- predictions.csv +| `-- 1 +| |-- metadata.json +| `-- predictions.csv +`-- scores + |-- RandomForest.benchmark_test.csv + `-- results.csv +``` + +### `results.csv` + +Here is a sample `results.csv` file from a test run against the `RandomForest` framework: + + +=== "Produced CSV" + + ```csv + id,task,framework,constraint,fold,result,metric,mode,version,params,tag,utc,duration,models,seed,info,acc,auc,balacc,logloss,mae,r2,rmse + openml.org/t/3913,kc2,RandomForest,test,0,0.865801,auc,local,0.23.2,{'n_estimators': 2000},,2020-12-04T19:27:46,3.2,2000,2633845682,,0.792453,0.865801,0.634199,0.350891,,, + openml.org/t/3913,kc2,RandomForest,test,1,0.86039,auc,local,0.23.2,{'n_estimators': 2000},,2020-12-04T19:27:52,3.0,2000,2633845683,,0.90566,0.86039,0.772727,0.406952,,, + openml.org/t/59,iris,RandomForest,test,0,0.126485,logloss,local,0.23.2,{'n_estimators': 2000},,2020-12-04T19:27:56,2.9,2000,2633845682,,0.933333,,0.933333,0.126485,,, + openml.org/t/59,iris,RandomForest,test,1,0.0271781,logloss,local,0.23.2,{'n_estimators': 2000},,2020-12-04T19:28:01,3.0,2000,2633845683,,1.0,,1.0,0.0271781,,, + openml.org/t/2295,cholesterol,RandomForest,test,0,44.3352,rmse,local,0.23.2,{'n_estimators': 2000},,2020-12-04T19:28:05,3.0,2000,2633845682,,,,,,35.6783,-0.014619,44.3352 + openml.org/t/2295,cholesterol,RandomForest,test,1,55.3163,rmse,local,0.23.2,{'n_estimators': 2000},,2020-12-04T19:28:10,3.1,2000,2633845683,,,,,,43.1808,-0.0610752,55.3163 + ``` + +=== "Readable Table" + + ```text + id task framework constraint fold result metric mode version params utc duration models seed acc auc balacc logloss mae r2 rmse + 0 openml.org/t/3913 kc2 RandomForest test 0 0.865801 auc local 0.23.2 {'n_estimators': 2000} 2020-12-04T19:27:46 3.2 2000 2633845682 0.792453 0.865801 0.634199 0.350891 NaN NaN NaN + 1 openml.org/t/3913 kc2 RandomForest test 1 0.860390 auc local 0.23.2 {'n_estimators': 2000} 2020-12-04T19:27:52 3.0 2000 2633845683 0.905660 0.860390 0.772727 0.406952 NaN NaN NaN + 2 openml.org/t/59 iris RandomForest test 0 0.126485 logloss local 0.23.2 {'n_estimators': 2000} 2020-12-04T19:27:56 2.9 2000 2633845682 0.933333 NaN 0.933333 0.126485 NaN NaN NaN + 3 openml.org/t/59 iris RandomForest test 1 0.027178 logloss local 0.23.2 {'n_estimators': 2000} 2020-12-04T19:28:01 3.0 2000 2633845683 1.000000 NaN 1.000000 0.027178 NaN NaN NaN + 4 openml.org/t/2295 cholesterol RandomForest test 0 44.335200 rmse local 0.23.2 {'n_estimators': 2000} 2020-12-04T19:28:05 3.0 2000 2633845682 NaN NaN NaN NaN 35.6783 -0.014619 44.3352 + 5 openml.org/t/2295 cholesterol RandomForest test 1 55.316300 rmse local 0.23.2 {'n_estimators': 2000} 2020-12-04T19:28:10 3.1 2000 2633845683 NaN NaN NaN NaN 43.1808 -0.061075 55.3163 + ``` + +Here is a short description of each column: + + - `id`: a identifier for the dataset used in this result. 
For convenience, we use the link to the OpenML task by default. + - `task`: the task name as defined in the benchmark definition. + - `framework`: the framework name as defined in the framework definition. + - `fold`: the dataset fold being used for this job. Usually, we're using 10 folds, so the fold varies from 0 to 9. + - `result`: the result score, this is the score for the metric that the framework was trying to optimize. For example, for binary classification, the default metrics defined in `resources/config.yaml` are `binary: ['auc', 'acc']`; this means that the frameworks should try to optimize `auc` and the final `auc` score will become the `result` value, the other metrics (here `acc`) are then computed for information. + - `mode`: one of `local`, `docker`, `aws`, `aws+docker`: tells where/how the job was executed. + - `version`: the version of the framework being benchmarked. + - `params`: if any, a JSON representation of the params defined in the framework definition. This allows to see clearly if some tuning was done for example. + - `tag`: the branch tag of the `automlbenchmark` app that was running the job. + - `utc`: the UTC timestamp at the job completion. + - `duration`: the training duration: the framework integration is supposed to provide this information to ensure that it takes only into account the time taken by the framework itself. When benchmarking large data, the application can use a significant amount of time to prepare the data: this additional time doesn't appear in this `duration` column. + - `models`: for some frameworks, it is possible to know how many models in total were trained by the AutoML framework. + - `seed`: the seed or random state passed to the framework. With some frameworks, it is enough to obtain reproducible results. Note that the seed can be specified at the command line using `-Xseed=` arg (for example `python randomforest -Xseed=1452956522`): when there are multiple folds, the seed is then incremented by the fold number. + - `info`: additional info in text format, this usually contains error messages if the job failed. + - `acc`, `auc`, `logloss` metrics: all the metrics that were computed based on the generated predictions. For each job/row, one of them matches the `result` column, the others are purely informative. Those additional metric columns are simply added in alphabetical order. + +### Predictions Directory + +For each evaluation, the framework integration must generate a predictions file that +will be used by the application to compute the scores. This predictions file is saved +under the `predictions` subfolder as shown [above](#output-file-structure) and +follows the naming convention: `{framework}_{task}_{fold}.csv`. + +The `csv` file contains a header row and contains the following columns, in order: + - For classification tasks only, there is first one column per class, sorted alphabetically. + Each column contains the probability of the sample belonging to that class, as predicted by the AutoML framework. + If a framework does not provide probabilities, it will be 1 for the predicted class and 0 otherwise. + - `predictions`: contains the predictions of the test predictor data by the model trained by the framework, + - `truth`: the true values of the test target data (`test.y`). 
+ +Here are examples of the first few samples for `KC2` (binary classification), +`iris` (multiclass classification), and `cholesterol` (regression): + +=== "KC2 (csv)" + + ```csv + no,yes,predictions,truth + 0.965857617846013,0.034142382153998944,no,no + 0.965857617846013,0.034142382153998944,no,no + 0.5845,0.4155,no,no + 0.6795,0.3205,no,no + 0.965857617846013,0.034142382153998944,no,no + ``` +=== "KC2 (table)" + + | no | yes | predictions | truth | + |-------------------|----------------------|-------------|-------| + | 0.965857617846013 | 0.034142382153998944 | no | no | + | 0.965857617846013 | 0.034142382153998944 | no | no | + | 0.5845 | 0.4155 | no | no | + | 0.6795 | 0.3205 | no | no | + | 0.965857617846013 | 0.034142382153998944 | no | no | + +===! "iris (csv)" + + ```csv + Iris-setosa,Iris-versicolor,Iris-virginica,predictions,truth + 1.0,0.0,0.0,Iris-setosa,Iris-setosa + 0.9715,0.028,0.0005,Iris-setosa,Iris-setosa + 1.0,0.0,0.0,Iris-setosa,Iris-setosa + 1.0,0.0,0.0,Iris-setosa,Iris-setosa + 1.0,0.0,0.0,Iris-setosa,Iris-setosa + 0.0,1.0,0.0,Iris-versicolor,Iris-versicolor + ``` + + +=== "iris (table)" + + | Iris-setosa | Iris-versicolor | Iris-virginica | predictions | truth | + |-------------|-----------------|----------------|-----------------|-----------------| + | 1.0 | 0.0 | 0.0 | Iris-setosa | Iris-setosa | + | 0.9715 | 0.028 | 0.0005 | Iris-setosa | Iris-setosa | + | 1.0 | 0.0 | 0.0 | Iris-setosa | Iris-setosa | + | 1.0 | 0.0 | 0.0 | Iris-setosa | Iris-setosa | + | 1.0 | 0.0 | 0.0 | Iris-setosa | Iris-setosa | + | 0.0 | 1.0 | 0.0 | Iris-versicolor | Iris-versicolor | + + +===! "cholesterol (csv)" + + ```csv + predictions,truth + 241.204,207.0 + 248.9575,249.0 + 302.278,268.0 + 225.9215,234.0 + 226.6995,201.0 + ``` + +=== "cholesterol (table)" + + | predictions | truth | + |-------------|-------| + | 241.204 | 207.0 | + | 248.9575 | 249.0 | + | 302.278 | 268.0 | + | 225.9215 | 234.0 | + | 226.6995 | 201.0 | + + +### Extract more information + +For some frameworks, it is also possible to extract more detailed information, +in the form of `artifacts` that are saved after the training. +Examples of those artifacts are logs generated by the framework, models or descriptions +of the models trained by the framework, predictions for each of the model trained by the +AutoML framework. By default, those artifacts are not saved, and not all frameworks +provide the same artifacts. This is why the artifacts to be stored have to be specified +in the framework definition (_before_ running the experiments!). By convention, +this can be achieved by specifying the `params._save_artifacts` parameter. For example: + +=== "autosklearn" + + Save model descriptions under the `models` subfolder: + ```yaml + autosklearn_debug: + extends: autosklearn + params: + _save_artifacts: ['models'] + ``` + +=== "H2O" + + Save the leaderboard and models under the `models` subfolder, + and the H2O logs under `logs` subfolder: + ```yaml + H2OAutoML_debug: + extends: H2OAutoML + params: + _save_artifacts: ['leaderboard', 'logs', 'models'] + ``` + +=== "TPOT" + + Save the description of models for the Pareto frontin the `models` subfolder: + ```yaml + TPOT_debug: + extends: TPOT + params: + _save_artifacts: ['models'] + ``` + +The framework integrations themselves determine where the artifacts are saved, +this is typically not configurable from the framework definition. 
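+
+To illustrate the pattern, here is a sketch of how an integration's `exec.py` might honour such a `_save_artifacts` parameter. It assumes the `config.framework_params` attribute and `config.output_dir` used by existing integrations; the exact helpers and saved content differ per integration, so treat this as a template rather than the canonical implementation.
+
+```python
+import logging
+import os
+
+log = logging.getLogger(__name__)
+
+
+def save_artifacts(automl_estimator, config):
+    # `_`-prefixed params are not forwarded to the framework; they only drive custom logic here.
+    artifacts = config.framework_params.get('_save_artifacts', [])
+    try:
+        if 'models' in artifacts:
+            models_dir = os.path.join(config.output_dir, 'models')
+            os.makedirs(models_dir, exist_ok=True)
+            # Placeholder: dump whatever model description the framework exposes.
+            with open(os.path.join(models_dir, 'models.txt'), 'w') as f:
+                f.write(str(automl_estimator))
+    except Exception:
+        log.warning("Error when saving artifacts.", exc_info=True)
+```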
diff --git a/docs/using/upload_to_openml.md b/docs/using/upload_to_openml.md new file mode 100644 index 000000000..ce50145da --- /dev/null +++ b/docs/using/upload_to_openml.md @@ -0,0 +1,26 @@ + +### Uploading results to OpenML +The `upload_results.py` script can be used to upload results to OpenML with the following usage: +```text +>python upload_results.py --help +usage: Script to upload results from the benchmark to OpenML. [-h] [-i INPUT_DIRECTORY] [-a APIKEY] [-m MODE] [-x] [-v] [-t TASK] + +optional arguments: + -h, --help show this help message and exit + -i INPUT_DIRECTORY, --input-directory INPUT_DIRECTORY + Directory that stores results from the runbenchmark.py invocation. By default use the most recent folder in the results folder as + specified in the configuration. + -a APIKEY, --api-key APIKEY + OpenML API key to use for uploading results. + -m MODE, --mode MODE Run mode (default=check). + • check: only report whether results can be uploaded. + • upload: upload all complete results. + -x, --fail-fast Stop as soon as a task fails to upload due to an error during uploading. + -v, --verbose Output progress to console. + -t TASK, --task TASK Only upload results for this specific task. +``` + +Note that the default behavior does not upload data but only verifies data is complete. +We strongly encourage you to only upload your data after verifying all expected results are complete. +The OpenML Python package is used for uploading results, so to ensure your API credentials are configured, please refer to their [configuration documentation](https://openml.github.io/openml-python/master/usage.html#installation-set-up). +Results obtained on tasks on the test server (e.g. through the `--test-server` option of `runbenchmark.py`) are uploaded to the test server and don't require additional authentication. diff --git a/docs/website/automl_overview.html b/docs/website/automl_overview.html new file mode 100644 index 000000000..e72f07c04 --- /dev/null +++ b/docs/website/automl_overview.html @@ -0,0 +1,17 @@ + + + AMLB + + + + +

+ The old AutoML framework overview page no longer exists; you will be + redirected to the new page. If you are not redirected within 3 seconds, + you can find it at + https://openml.github.io/automlbenchmark/frameworks.html. +

+ + diff --git a/docs/website/benchmark_datasets.html b/docs/website/benchmark_datasets.html new file mode 100644 index 000000000..f9b28623f --- /dev/null +++ b/docs/website/benchmark_datasets.html @@ -0,0 +1,18 @@ + + + AMLB + + + + +

+ You probably found this link from our 2019 paper. + Unfortunately, we updated our site but the new website does not yet contain a page with a description of our dataset selection strategy.
+ You will be redirected to the markdown file from which the old page was generated, so you can still view the old text.
+ If you are not redirected within 5 seconds, you can find it at + https://github.com/openml/automlbenchmark/blob/2fe3bd41768ce28387f827791bd57ef1a5a84783/docs/benchmark_datasets.md. +

+ + diff --git a/docs/website/frameworks.html b/docs/website/frameworks.html new file mode 100644 index 000000000..08315b7d3 --- /dev/null +++ b/docs/website/frameworks.html @@ -0,0 +1,957 @@ + + + + AMLB: Frameworks + + + + + + + + + + +
+
+

AutoML Frameworks

+ There is more to an AutoML system than just its performance. This page + contains more information about the integrated AutoML frameworks, + including links to their papers, repositories + , and + documentation 📖. Summaries are taken directly from the respective + documentation pages. Want to integrate your own framework? + Adding your own framework + is relatively simple. +
+
+
+
+ +

AutoGluon

+ +
+
+ AutoGluon enables easy-to-use and easy-to-extend AutoML with a focus + on automated stack ensembling, deep learning, and real-world + applications spanning image, text, and tabular data. +
+ +
+
+

+ AutoGluon-Tabular: Robust and Accurate AutoML for Structured + Data +

+
+ Nick Erickson, Jonas Mueller, Alexander Shirkov, Hang Zhang, + Pedro Larroy, Mu Li, Alexander Smola +
+
+ We introduce AutoGluon-Tabular, an open-source AutoML framework + that requires only a single line of Python to train highly + accurate machine learning models on an unprocessed tabular + dataset such as a CSV file. Unlike existing AutoML frameworks + that primarily focus on model/hyperparameter selection, + AutoGluon-Tabular succeeds by ensembling multiple models and + stacking them in multiple layers. Experiments reveal that our + multi-layer combination of many models offers better use of + allocated training time than seeking out the best. A second + contribution is an extensive evaluation of public and commercial + AutoML platforms including TPOT, H2O, AutoWEKA, auto-sklearn, + AutoGluon, and Google AutoML Tables. Tests on a suite of 50 + classification and regression tasks from Kaggle and the OpenML + AutoML Benchmark reveal that AutoGluon is faster, more robust, + and much more accurate. We find that AutoGluon often even + outperforms the best-in-hindsight combination of all of its + competitors. In two popular Kaggle competitions, AutoGluon beat + 99% of the participating data scientists after merely 4h of + training on the raw data. +
+ +
+
+ +
+
+
+ +

Auto-sklearn

+ +
+
+ Auto-sklearn is an automated machine learning toolkit and a drop-in + replacement for a scikit-learn estimator. Auto-sklearn frees a + machine learning user from algorithm selection and hyperparameter + tuning. It leverages recent advantages in Bayesian optimization, + meta-learning and ensemble construction. +
+ +
+
+

+ Auto-Sklearn 2.0: Hands-free AutoML via Meta-Learning +

+
+ Matthias Feurer, Katharina Eggensperger, Stefan Falkner, Marius + Lindauer, Frank Hutter +
+
+ Automated Machine Learning (AutoML) supports practitioners and + researchers with the tedious task of designing machine learning + pipelines and has recently achieved substantial success. In this + paper we introduce new AutoML approaches motivated by our + winning submission to the second ChaLearn AutoML challenge. We + develop PoSH Auto-sklearn, which enables AutoML systems to work + well on large datasets under rigid time limits using a new, + simple and meta-feature-free meta-learning technique and employs + a successful bandit strategy for budget allocation. However, + PoSH Auto-sklearn introduces even more ways of running AutoML + and might make it harder for users to set it up correctly. + Therefore, we also go one step further and study the design + space of AutoML itself, proposing a solution towards truly + hands-free AutoML. Together, these changes give rise to the next + generation of our AutoML system, Auto-sklearn 2.0 . We verify + the improvements by these additions in a large experimental + study on 39 AutoML benchmark datasets and conclude the paper by + comparing to other popular AutoML frameworks and Auto-sklearn + 1.0 , reducing the relative error by up to a factor of 4.5, and + yielding a performance in 10 minutes that is substantially + better than what Auto-sklearn 1.0 achieves within an hour. +
+ +
+
+

+ Efficient and Robust Automated Machine Learning +

+
+ Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost + Springenberg, Manuel Blum, Frank Hutter +
+
+ The success of machine learning in a broad range of applications + has led to an ever-growing demand for machine learning systems + that can be used off the shelf by non-experts. To be effective + in practice, such systems need to automatically choose a good + algorithm and feature preprocessing steps for a new dataset at + hand, and also set their respective hyperparameters. Recent work + has started to tackle this automated machine learning (AutoML) + problem with the help of efficient Bayesian optimization + methods. In this work we introduce a robust new AutoML system + based on scikit-learn (using 15 classifiers, 14 feature + preprocessing methods, and 4 data preprocessing methods, giving + rise to a structured hypothesis space with 110 hyperparameters). + This system, which we dub auto-sklearn, improves on existing + AutoML methods by automatically taking into account past + performance on similar datasets, and by constructing ensembles + from the models evaluated during the optimization. Our system + won the first phase of the ongoing ChaLearn AutoML challenge, + and our comprehensive analysis on over 100 diverse datasets + shows that it substantially outperforms the previous state of + the art in AutoML. We also demonstrate the performance gains due + to each of our contributions and derive insights into the + effectiveness of the individual components of auto-sklearn. +
+ +
+
+ +
+
+
+ +

FLAML

+ +
+
+ FLAML is a lightweight Python library that finds accurate machine + learning models automatically, efficiently and economically. It + frees users from selecting learners and hyperparameters for each + learner. +
+ +
+
+

+ FLAML: A Fast and Lightweight AutoML Library +

+
+ Chi Wang, Qingyun Wu, Markus Weimer, Erkang Zhu +
+
+ We study the problem of using low computational cost to automate + the choices of learners and hyperparameters for an ad-hoc + training dataset and error metric, by conducting trials of + different configurations on the given training data. We + investigate the joint impact of multiple factors on both trial + cost and model error, and propose several design guidelines. + Following them, we build a fast and lightweight library FLAML + which optimizes for low computational resource in finding + accurate models. FLAML integrates several simple but effective + search strategies into an adaptive system. It significantly + outperforms top-ranked AutoML libraries on a large open source + AutoML benchmark under equal, or sometimes orders of magnitude + smaller budget constraints. +
+ +
+
+ +
+
+
+ +

GAMA

+ +
+
+ GAMA is developed for AutoML research and features a flexible AutoML + pipeline, which makes it easy to develop and evaluate new AutoML + components. GAMA's benchmarking configuration features evolutionary + optimization and ensemble construction. +
+ +
+
+

+ GAMA: A General Automated Machine Learning Assistant +

+
+ Pieter Gijsbers, Joaquin Vanschoren +
+ + + +
+ The General Automated Machine learning Assistant (GAMA) is a + modular AutoML system developed to empower users to track and + control how AutoML algorithms search for optimal machine + learning pipelines, and facilitate AutoML research itself. In + contrast to current, often black-box systems, GAMA allows users + to plug in different AutoML and post-processing techniques, logs + and visualizes the search process, and supports easy + benchmarking. It currently features three AutoML search + algorithms, two model post-processing steps, and is designed to + allow for more components to be added. +
+ +
+
+ +
+
+
+ +

H2O AutoML

+ +
+
+ H2O's AutoML can be used for automating the machine learning + workflow, which includes automatic training and tuning of many + models within a user-specified time-limit. H2O offers a number of + model explainability methods that apply to AutoML objects (groups of + models), as well as individual models (e.g. leader model). + Explanations can be generated automatically with a single function + call, providing a simple interface to exploring and explaining the + AutoML models. +
+ +
+
+

+ H2O AutoML: Scalable Automatic Machine Learning +

+
Erin LeDell and Sébastien Poirier
+
+ H2O is an open source, distributed machine learning platform + designed to scale to very large datasets, with APIs in R, + Python, Java and Scala. We present H2O AutoML, a highly + scalable, fully-automated, supervised learning algorithm which + automates the process of training a large selection of + candidate models and stacked ensembles within a single function. + The result of the AutoML run is a “leaderboard”: a ranked list + of models, all of which can be easily exported for use in a + production environment. Models in the leaderboard can be + ranked by numerous model performance metrics or other model + attributes such as training time or average per-row prediction + speed. The H2O AutoML algorithm relies on the efficient training + of H2O machine learning algorithms to produce a large number + of models in a short amount of time. H2O AutoML uses a + combination of fast random search and stacked ensembles to + achieve results competitive with, and often better than, other + frameworks which rely on more complex model tuning techniques + such as Bayesian optimization or genetic algorithms. H2O AutoML + trains a variety of algorithms (e.g. GBMs, Random Forests, + Deep Neural Networks, GLMs), yielding a healthy amount of + diversity across candidate models, which can be exploited by + stacked ensembles to produce a powerful final model. The + effectiveness of this technique is reflected in the OpenML + AutoML Benchmark, which compares the performance of several of + the most well known, open source AutoML systems across a number + of datasets. +
+ +
+
+ +
+
+
+ +

LightAutoML

+ +
+
+ LightAutoML is an open-source Python library aimed at automated machine + learning. It is designed to be lightweight and efficient for various + tasks with tabular and text data. +
+ +
+
Paper to be added.
+
+ +
+ Alexander Ryzhkov, Anton Vakhrushev, Dmitry Simakov, Vasilii + Bunakov, Rinchin Damdinov, Alexander Kirilin, Pavel Shvets +
+ +
+
+ +
+
+
+ +

mljar-supervised

+ +
+
+ The mljar-supervised is an Automated Machine Learning Python package + that works with tabular data. It is designed to save time for a data + scientist. It abstracts the common way to preprocess the data, + construct the machine learning models, and perform hyper-parameters + tuning to find the best model 🏆. It is no black-box as you can see + exactly how the ML pipeline is constructed (with a detailed Markdown + report for each ML model). +
+ +
+
No paper available.
+
+ +
+
+
+ +

TPOT

+ +
+
+ TPOT is a Python Automated Machine Learning tool that optimizes + machine learning pipelines using genetic programming. It has a focus + on optimizing models for biomedical data. +
+ +
+
+

+ Automating biomedical data science through tree-based pipeline + optimization +

+
+ Randal S. Olson, Ryan J. Urbanowicz, Peter C. Andrews, Nicole A. + Lavender, La Creis Kidd, and Jason H. Moore +
+
+ Automated machine learning (AutoML) systems are helpful data + science assistants designed to scan data for novel features, + select appropriate supervised learning models and optimize their + parameters. For this purpose, Tree-based Pipeline Optimization + Tool (TPOT) was developed using strongly typed genetic + programming (GP) to recommend an optimized analysis pipeline for + the data scientist’s prediction problem. However, like other + AutoML systems, TPOT may reach computational resource limits + when working on big data such as whole-genome expression data. +
+ +
+
+ +
+
+
+ + + diff --git a/docs/website/img/logos/GAMA-icon.png b/docs/website/img/logos/GAMA-icon.png new file mode 100644 index 000000000..c9626aa28 Binary files /dev/null and b/docs/website/img/logos/GAMA-icon.png differ diff --git a/docs/website/img/logos/GAMA.png b/docs/website/img/logos/GAMA.png new file mode 100644 index 000000000..0ba100f0c Binary files /dev/null and b/docs/website/img/logos/GAMA.png differ diff --git a/docs/website/img/logos/GitHub-Mark-32px.png b/docs/website/img/logos/GitHub-Mark-32px.png new file mode 100644 index 000000000..8b25551a9 Binary files /dev/null and b/docs/website/img/logos/GitHub-Mark-32px.png differ diff --git a/docs/website/img/logos/GitHub-Mark-64px.png b/docs/website/img/logos/GitHub-Mark-64px.png new file mode 100644 index 000000000..182a1a3f7 Binary files /dev/null and b/docs/website/img/logos/GitHub-Mark-64px.png differ diff --git a/docs/website/img/logos/GitHub-Mark-Light-64px.png b/docs/website/img/logos/GitHub-Mark-Light-64px.png new file mode 100644 index 000000000..73db1f61f Binary files /dev/null and b/docs/website/img/logos/GitHub-Mark-Light-64px.png differ diff --git a/docs/website/img/logos/LightAutoML_logo_small.png b/docs/website/img/logos/LightAutoML_logo_small.png new file mode 100644 index 000000000..8d268e390 Binary files /dev/null and b/docs/website/img/logos/LightAutoML_logo_small.png differ diff --git a/docs/website/img/logos/auto-sklearn.png b/docs/website/img/logos/auto-sklearn.png new file mode 100644 index 000000000..65141af67 Binary files /dev/null and b/docs/website/img/logos/auto-sklearn.png differ diff --git a/docs/website/img/logos/autogluon.png b/docs/website/img/logos/autogluon.png new file mode 100644 index 000000000..8afef59ab Binary files /dev/null and b/docs/website/img/logos/autogluon.png differ diff --git a/docs/website/img/logos/flaml.svg b/docs/website/img/logos/flaml.svg new file mode 100644 index 000000000..5ae22b683 --- /dev/null +++ b/docs/website/img/logos/flaml.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/website/img/logos/h2o-automl-logo.jpeg b/docs/website/img/logos/h2o-automl-logo.jpeg new file mode 100644 index 000000000..68ab95b32 Binary files /dev/null and b/docs/website/img/logos/h2o-automl-logo.jpeg differ diff --git a/docs/website/img/logos/mljar.png b/docs/website/img/logos/mljar.png new file mode 100644 index 000000000..a61427bef Binary files /dev/null and b/docs/website/img/logos/mljar.png differ diff --git a/docs/website/img/logos/tpot.jpeg b/docs/website/img/logos/tpot.jpeg new file mode 100644 index 000000000..fd4d56add Binary files /dev/null and b/docs/website/img/logos/tpot.jpeg differ diff --git a/docs/website/img/shiny.png b/docs/website/img/shiny.png new file mode 100644 index 000000000..9942dfeec Binary files /dev/null and b/docs/website/img/shiny.png differ diff --git a/docs/website/index.html b/docs/website/index.html new file mode 100644 index 000000000..fdfc40b7b --- /dev/null +++ b/docs/website/index.html @@ -0,0 +1,424 @@ + + + + AMLB: an AutoML Benchmark + + + + + + + + + + + +
+
+
AMLB
+
An AutoML Benchmark
+
+ Comparing different AutoML frameworks is notoriously challenging. AMLB + is an open and extensible benchmark that follows best practices and + avoids common mistakes when comparing AutoML frameworks. +
+
+
+
+
+

Easy to Use

+
+ You can run an entire benchmark with a single command! The AutoML + benchmark tool automates the installation of the AutoML framework, + the experimental setup, and the execution of the experiment. +
+
+ > python runbenchmark.py autosklearn openml/s/269 1h8c +
+
+ + + + + Installation guide + +
+
+

Visualize Results

+
+ The results can be visualized with our + interactive visualization tool + or one of our + notebooks. This includes stripplots, critical difference diagrams, + Bradley-Terry trees, and more! +
+ +
+ + + + + Results + +
+
+

Easy to Extend

+
+ Adding a framework + and + adding a dataset + is easy. These extensions can be kept completely private, or + shared with the community. For datasets, it is even possible to + work with + OpenML + tasks and suites directly! +
+ + + + + + + + +
+ + + + + Extending the benchmark + +
+
+
+
+
+
+

📄 Paper

+
+ A preprint of our most recent paper is available on + ArXiv. It includes an in-depth discussion of the different design + decisions and its limitations, as well as a multi-faceted analysis + of results from a large-scale comparison across 9 frameworks on more + than 100 tasks. + +
+
+
+

🧑‍💻 Code

+
+ The entire benchmark tool is open source and developed on + Github. The Github discussion board and issue trackers are the main way + for us to interact with the community. +
+
+
+ +
+
+

AutoML Frameworks

+
+
+ Many AutoML frameworks are already integrated with the AutoML + benchmark tool and + adding more is easy. + We have more information about the different frameworks on our + framework overview page. The icons below + link directly to their respective Github repositories. +
+
+ + + + + + + + +
+
+
+
+
+

Community

+
+ We welcome any contributions to the AutoML benchmark. Our goal is to + provide the best benchmark tools for AutoML research and we can't do + that on our own. Contributions are appreciated in many forms, + including feedback on the benchmark design, feature requests, bug + reports, code and documentation contributions, and more. Why not stop + by on our + welcome board + and let us know what got you interested in the benchmark? +
+
+
+ + + diff --git a/docs/website/papers.html b/docs/website/papers.html new file mode 100644 index 000000000..579aee363 --- /dev/null +++ b/docs/website/papers.html @@ -0,0 +1,289 @@ + + + + AMLB: Papers + + + + + + + + + + + +
+

Papers

+
+

AMLB: an AutoML Benchmark

+
+ Pieter Gijsbers, Marcos L. P. Bueno, Stefan Coors, Erin LeDell, + Sébastien Poirier, Janek Thomas, Bernd Bischl and Joaquin Vanschoren +
+ + + +
+ Comparing different AutoML frameworks is notoriously challenging and + often done incorrectly. We introduce an open and extensible benchmark + that follows best practices and avoids common mistakes when comparing + AutoML frameworks. We conduct a thorough comparison of 9 well-known + AutoML frameworks across 71 classification and 33 regression tasks. + The differences between the AutoML frameworks are explored with a + multi-faceted analysis, evaluating model accuracy, its trade-offs with + inference time, and framework failures. We also use Bradley-Terry + trees to discover subsets of tasks where the relative AutoML framework + rankings differ. The benchmark comes with an open-source tool that + integrates with many AutoML frameworks and automates the empirical + evaluation process end-to-end: from framework installation and + resource allocation to in-depth evaluation. The benchmark uses public + data sets, can be easily extended with other AutoML frameworks and + tasks, and has a website with up-to-date results. +
+ +
+ +
+ @misc{https://doi.org/10.48550/arxiv.2207.12560,
+   doi = {10.48550/ARXIV.2207.12560},
+   url = {https://arxiv.org/abs/2207.12560},
+   author = {Gijsbers, Pieter and Bueno, Marcos L. P. and Coors, + Stefan and LeDell, Erin and Poirier, S\'{e}bastien and Thomas, Janek + and Bischl, Bernd and Vanschoren, Joaquin},
+   keywords = {Machine Learning (cs.LG), Machine Learning + (stat.ML), FOS: Computer and information sciences, FOS: Computer and + information sciences},
+   title = {AMLB: an AutoML Benchmark},
+   publisher = {arXiv},
+   year = {2022},
+   copyright = {Creative Commons Attribution 4.0 International} +
+ } +
+
+
+
+

An Open Source AutoML Benchmark

+
+ Pieter Gijsbers, Erin LeDell, Janek Thomas, Sébastien Poirier, Bernd + Bischl, Joaquin Vanschoren +
+ + + +
+ In recent years, an active field of research has developed around + automated machine learning (AutoML). Unfortunately, comparing + different AutoML systems is hard and often done incorrectly. We + introduce an open, ongoing, and extensible benchmark framework which + follows best practices and avoids common mistakes. The framework is + open-source, uses public datasets and has a website with up-to-date + results. We use the framework to conduct a thorough comparison of 4 + AutoML systems across 39 datasets and analyze the results. +
+ +
+ +
+ @article{amlb2019,
+   title={An Open Source AutoML Benchmark},
+   author={Gijsbers, P. and LeDell, E. and Poirier, S. and + Thomas, J. and Bischl, B. and Vanschoren, J.},
+   journal={arXiv preprint arXiv:1907.00909 [cs.LG]},
+   url={https://arxiv.org/abs/1907.00909},
+   note={Accepted at AutoML Workshop at ICML 2019},
+   year={2019}
} +
+
+
+
+ + + diff --git a/docs/website/results.html b/docs/website/results.html new file mode 100644 index 000000000..dc4c84cc8 --- /dev/null +++ b/docs/website/results.html @@ -0,0 +1,262 @@ + + + + AMLB: Results + + + + + + + + + + + +
+
+

Results

+
+
+ ⚠️ Our paper outlines + important limitations for the interpretation of + results. These limitations include: +
+ +
+
    +
  • + We use AutoML framework versions from + September 2021; many frameworks have since seen + major updates. +
  • +
  • + We use the "benchmark" modes of the frameworks, which generally + only optimize for performance. Most AutoML frameworks + have multiple modes to support different use cases. +
  • +
  • + Results cannot be used to draw conclusions about which + algorithm is best, as all frameworks differ in multiple ways. +
  • +
  • + Performance statistics are often independent of many + qualitative differences, such as ease of use or + interpretability. +
  • +
+ Please read Section 5.3 in our paper for a more elaborate discussion + of these and other limitations. +
+ +
+
+
+
+ All results are available as + raw files + 📂, and we also open-source the tools we used to generate the figures in + our paper. The best way to explore the results is through our + interactive + Shiny app. It loads the latest results by default. It is also possible to use + our + notebooks + which contain additional visualizations. +
+ +
+
+ + + diff --git a/docs/website/style.css b/docs/website/style.css new file mode 100644 index 000000000..117611739 --- /dev/null +++ b/docs/website/style.css @@ -0,0 +1,527 @@ +* { + margin: 0; + padding: 0; + box-sizing: border-box; + font-size: 18px; +} +.grey { + color: #495057; +} + +/* main: #1971c2 */ + +body { + font-family: "lato", sans-serif; + color: #343a40; +} + +section { + margin-bottom: 64px; +} + +a { + color: #1971c2; +} + +.page-content { + padding: 0 clamp(15%, 100%, calc((100vw - 1000px) / 2)); /* should maybe set width in px instead */ +} + +h1 { + font-size: 72px; + padding-bottom: 24px; +} + +h2 { + font-size: 48px; + padding-bottom: 24px; +} + +h3 { + font-size: 32px; + padding-bottom: 16px; +} + +footer { + color: #e7f5ff; + font-size: 14px; + background-color: #1971c2; + text-align: center; + padding: 8px; +} + +footer > a { + color: #e7f5ff; + font-size: 14px; +} + +.terminal { + background-color: #343a40; + color: #f8f9fa; + font-family: "Inconsolata", sans-serif; + padding: 8px; + margin: 16px 5%; +} + +/* NAVBAR */ +.construction-banner { + background-color: #1c7ed6; + color: white; + text-align: center; +} + +.navigation-bar { + background-color: #1971c2; + color: white; + text-transform: uppercase; + + display: flex; + gap: 24px; + justify-content: flex-end; + align-items: center; + padding: 16px; + height: 60px; +} + +nav a:first-child { + flex-grow: 1; +} + +.nav-icon { + height: 24px; + fill: white; +} + +.nav-icon-with-text { + height: 18px; +} + +.nav-icon:hover, +.nav-icon:active { + height: 28px; +} + +.nav-link:link, +.nav-link:visited { + text-decoration: none; + color: white; +} +.nav-link:hover, +.nav-link:active { + color: white; + font-weight: bold; +} + +/* TITLE */ + +.title, +.subtitle { + color: #343a40; + text-align: center; +} + +.title { + font-size: 74px; + padding-top: 48px; + font-weight: bold; +} + +.subtitle { + font-size: 62px; + font-weight: bold; + padding-bottom: 24px; +} + +.summary { + font-size: 24px; + color: #495057; + padding: 0 max((100% - 900px) / 2, 0px); /* does not seem to work with just the math expression */ +} + +/* CARDS */ + +.three-cols { + display: grid; + grid-template-columns: repeat(3, 1fr); + column-gap: 5%; +} + +.card { + box-shadow: 0 5px 10px rgba(0, 0, 0, 0.1); + border-radius: 5px; + padding: 16px; + + display: flex; + flex-direction: column; +} + +.card img { + padding: 16px 2%; +} + +.card > svg { + display: block; + margin: 16px auto; +} + +.flex-grow { + flex-grow: 1; +} + +.card > .card-nav { + color: #1971c2; + text-decoration: none; + + display: flex; + align-items: center; + justify-content: flex-start; + gap: 8px; +} + +.card > .card-nav > svg { + height: 24px; + fill: #1971c2; +} + +h3 { + font-size: 24px; + padding-bottom: 10px; +} + +/* Frameworks */ +/* .icon-grid { + padding: 24px; + display: grid; + grid-auto-rows: 60px; +} */ +.icon-flex { + margin-top: 24px; + display: flex; + flex-wrap: wrap; + justify-content: center; + align-items: center; + gap: 40px; +} + +.icon-flex a img { + height: 90px; +} +/* Paper */ + +.two-col { + display: grid; + grid-template-columns: 1fr 1fr; + column-gap: 2%; + + background-color: #1971c2; + color: #e7f5ff; + border-radius: 5px; +} + +.two-col a { + color: white; +} + +.two-col h3 { + color: white; +} + +.flat-card { + padding: 16px; +} + +/* FRAMEWORKS PAGE */ +.page-title { + margin-top: 72px; +} + +.framework-card-list { + background-color: #e7f5ff; + padding: 32px 5%; + border-radius: 10px; +} + +.framework-card-list > .accordion { + margin-bottom: 
32px; +} +/* +.framework-card { + background-color: white; + margin-bottom: 64; + padding: 32; + border-radius: 10px; +} */ +.framework-card:last-of-type { + margin-bottom: 0; +} + +.framework-header { + display: flex; + align-items: center; + gap: 16px; + flex-wrap: wrap; +} + +.framework-header > h3 { + padding-bottom: 0px; + font-size: 32px; +} + +.framework-links { + display: flex; + gap: 16px; + justify-content: flex-end; + + flex-grow: 1; +} + +.framework-links > a { + text-decoration: none; + font-size: 24px; +} + +/* RESULTS */ + +.limitations-list { + padding: 0px 32px 16px 32px; +} + +.limitations-list li { + padding: 8px 0px; +} + +.results { + display: flex; + gap: 40px; +} + +/* Accordions */ +.accordion { +} + +.acard { + box-shadow: 0 5px 10px rgba(0, 0, 0, 0.2); + border-radius: 10px; + background-color: white; +} + +.accordion > div { + padding: 8px 32px; +} + +.accordion > div:first-child { + padding-top: 16px; +} + +.accordion > label { + width: 100%; + display: flex; + justify-content: center; +} + +.accordion-input { + display: none; +} + +.accordion-icon { + height: 24px; +} + +.accordion-input:checked ~ .accordion-content { + display: block; +} + +.accordion-input:checked ~ label .accordion-chevron-down { + display: none; +} + +.accordion-input:checked ~ label .accordion-chevron-up { + display: block; +} + +.accordion-input ~ .accordion-content { + display: none; +} + +.accordion-input ~ label .accordion-chevron-up { + display: none; +} + +.accordion-input ~ label .accordion-chevron-down { + display: block; +} + +.accordion > label { + background: linear-gradient(0deg, rgba(0, 0, 0, 0.02), white); + border-radius: 10px; + padding-bottom: 8px; +} + +.accordion > label:hover { + cursor: pointer; +} + +/* Papers */ +.paper { + margin-bottom: 48px; +} + +.paper-title { + padding-bottom: 0px; + margin-bottom: 0px; +} + +.paper-links { + display: flex; + align-items: center; + gap: 12px; + + margin-bottom: 16px; +} + +.paper-links > a { + color: white; + background-color: #1c7ed6; + padding: 4px 12px; + border-radius: 4px; + + text-decoration: none; + text-transform: uppercase; +} + +.paper-year { + color: #343a40; + display: inline; + font-weight: bold; +} + +.paper-venue { + color: #495057; + display: inline; +} + +.paper-authors { + color: #4263eb; + padding-bottom: 8px; +} + +.paper-abstract { + color: #495057; + padding-bottom: 16px; +} + +.hover-expand { + display: flex; + gap: 20px; + + color: #343a40; + background-color: white; + border: 2px solid #495057; + border-radius: 4px; + padding: 2px 8px; +} + +.hover-expand > div { + display: none; +} + +.hover-expand:hover > div { + display: block; +} + +.hidden-content { + display: none; +} + +.hidden-content-checkbox { + display: none; +} + +.hidden-content-checkbox:checked ~ .hidden-content { + display: block; +} + +.bibtex { + font-family: "Inconsolata", sans-serif; + color: #343a40; + background-color: #e7f5ff; + border-radius: 10px; + padding: 16px; +} + +.flat-button { + cursor: pointer; +} + +.flat-button > label { + cursor: pointer; +} + +.mobile { + display: none; +} + +.nav-mobile { + justify-content: space-between; + display: none; +} + +@media (max-width: 900px) { + .desktop { + display: none; + } + + .mobile { + display: block; + padding: 25px; + } + + .shiny { + height: auto; + width: 100%; + } + + .results { + flex-wrap: wrap; + } + + .framework-header > h3 { + order: -1; + flex-basis: 100%; + } + + h1 { + font-size: 48px; + padding-bottom: 18px; + } + + .page-title { + margin-top: 32px; + } + + 
.page-content { + padding: 0 32px; + } + + .three-cols { + grid-template-columns: 1fr; + row-gap: 32px; + } + + .two-col { + grid-template-columns: 1fr; + row-gap: 32px; + } + + .nav-mobile { + display: flex; + } +} + +.mobile * { + color: white; +} + +.mobile > .summary > a { + font-size: 24px; +} diff --git a/frameworks/AutoGluon/README.md b/frameworks/AutoGluon/README.md index 51286533e..1b5c2dc65 100644 --- a/frameworks/AutoGluon/README.md +++ b/frameworks/AutoGluon/README.md @@ -1,16 +1,5 @@ # AutoGluon -To run v0.5.2: ```python3 ../automlbenchmark/runbenchmark.py autogluon ...``` +To run v0.8.2: ```python3 ../automlbenchmark/runbenchmark.py autogluon ...``` -To run mainline: ```python3 ../automlbenchmark/runbenchmark.py autogluonts:latest ...``` - - -# AutoGluonTS - -AutoGluonTS stands for autogluon.timeseries. This framework handles time series problems. - -## Run Steps - -To run v0.5.2: ```python3 ../automlbenchmark/runbenchmark.py autogluonts timeseries ...``` - -To run mainline: ```python3 ../automlbenchmark/runbenchmark.py autogluonts:latest timeseries ...``` +To run mainline: ```python3 ../automlbenchmark/runbenchmark.py autogluon:latest ...``` diff --git a/frameworks/AutoGluon/__init__.py b/frameworks/AutoGluon/__init__.py index 9d3d980a3..2d5734e33 100644 --- a/frameworks/AutoGluon/__init__.py +++ b/frameworks/AutoGluon/__init__.py @@ -10,11 +10,11 @@ def setup(*args, **kwargs): def run(dataset: Dataset, config: TaskConfig): - if dataset.type is not DatasetType.timeseries: + if dataset.type == DatasetType.timeseries: + return run_autogluon_timeseries(dataset, config) + else: return run_autogluon_tabular(dataset, config) - else: - return run_autogluon_timeseries(dataset, config) def run_autogluon_tabular(dataset: Dataset, config: TaskConfig): from frameworks.shared.caller import run_in_venv @@ -36,26 +36,18 @@ def run_autogluon_tabular(dataset: Dataset, config: TaskConfig): def run_autogluon_timeseries(dataset: Dataset, config: TaskConfig): from frameworks.shared.caller import run_in_venv dataset = deepcopy(dataset) - if not hasattr(dataset, 'timestamp_column'): - dataset.timestamp_column = None - if not hasattr(dataset, 'id_column'): - dataset.id_column = None - if not hasattr(dataset, 'forecast_range_in_steps'): - raise AttributeError("Unspecified `forecast_range_in_steps`.") data = dict( - # train=dict(path=dataset.train.data_path('parquet')), - # test=dict(path=dataset.test.data_path('parquet')), - train=dict(path=dataset.train.path), - test=dict(path=dataset.test.path), - target=dict( - name=dataset.target.name, - classes=dataset.target.values - ), - problem_type=dataset.type.name, # AutoGluon problem_type is using same names as amlb.data.DatasetType - timestamp_column=dataset.timestamp_column, + train_path=dataset.train.path, + test_path=dataset.test.path, + target=dataset.target.name, id_column=dataset.id_column, - forecast_range_in_steps=dataset.forecast_range_in_steps + timestamp_column=dataset.timestamp_column, + forecast_horizon_in_steps=dataset.forecast_horizon_in_steps, + freq=dataset.freq, + seasonality=dataset.seasonality, + repeated_abs_seasonal_error=dataset.repeated_abs_seasonal_error, + repeated_item_id=dataset.repeated_item_id, ) return run_in_venv(__file__, "exec_ts.py", diff --git a/frameworks/AutoGluon/exec_ts.py b/frameworks/AutoGluon/exec_ts.py index ab7c4110f..32fd34072 100644 --- a/frameworks/AutoGluon/exec_ts.py +++ b/frameworks/AutoGluon/exec_ts.py @@ -1,21 +1,20 @@ import logging +import numpy as np import os +import pandas as pd import shutil 
-import warnings import sys import tempfile -import numpy as np +import warnings warnings.simplefilter("ignore") if sys.platform == 'darwin': os.environ['OMP_NUM_THREADS'] = '1' -import pandas as pd - from autogluon.core.utils.savers import save_pd, save_pkl -from autogluon.tabular import TabularDataset from autogluon.timeseries import TimeSeriesPredictor, TimeSeriesDataFrame from autogluon.timeseries.version import __version__ +from joblib.externals.loky import get_reusable_executor from frameworks.shared.callee import call_run, result, output_subdir from frameworks.shared.utils import Timer, zip_path @@ -25,111 +24,69 @@ def run(dataset, config): log.info(f"\n**** AutoGluon TimeSeries [v{__version__}] ****\n") + prediction_length = dataset.forecast_horizon_in_steps - timestamp_column = dataset.timestamp_column - id_column = dataset.id_column - prediction_length = dataset.forecast_range_in_steps - - eval_metric = get_eval_metric(config) - label = dataset.target.name - time_limit = config.max_runtime_seconds - - training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')} - - train_data, test_data = load_data(train_path=dataset.train.path, - test_path=dataset.test.path, - timestamp_column=timestamp_column, - id_column=id_column) - test_data_past = test_data.copy().slice_by_timestep(slice(None, -prediction_length)) + train_data = TimeSeriesDataFrame.from_path( + dataset.train_path, + id_column=dataset.id_column, + timestamp_column=dataset.timestamp_column, + ) predictor_path = tempfile.mkdtemp() + os.sep with Timer() as training: predictor = TimeSeriesPredictor( - target=label, + target=dataset.target, path=predictor_path, prediction_length=prediction_length, - eval_metric=eval_metric, + eval_metric=get_eval_metric(config), + eval_metric_seasonal_period=dataset.seasonality, + quantile_levels=config.quantile_levels, ) predictor.fit( train_data=train_data, - time_limit=time_limit, - **training_params, + time_limit=config.max_runtime_seconds, + **{k: v for k, v in config.framework_params.items() if not k.startswith('_')}, ) with Timer() as predict: - predictions = predictor.predict(test_data_past) - log.info(predictions) + predictions = pd.DataFrame(predictor.predict(train_data)) - predictions_only = predictions['mean'].values - test_data_future = test_data.copy().slice_by_timestep(slice(-prediction_length, None)) - truth_only = test_data_future[label].values + # Add columns necessary for the metric computation + quantile forecast to `optional_columns` + test_data_future = pd.read_csv(dataset.test_path, parse_dates=[dataset.timestamp_column]) + optional_columns = dict( + repeated_item_id=np.load(dataset.repeated_item_id), + repeated_abs_seasonal_error=np.load(dataset.repeated_abs_seasonal_error), + ) + for q in config.quantile_levels: + optional_columns[str(q)] = predictions[str(q)].values + + predictions_only = get_point_forecast(predictions, config.metric) + truth_only = test_data_future[dataset.target].values - log.info(predictions_only) - log.info(truth_only) + # Sanity check - make sure predictions are ordered correctly + future_index = pd.MultiIndex.from_frame(test_data_future[[dataset.id_column, dataset.timestamp_column]]) + assert predictions.index.equals(future_index), "Predictions and test data index do not match" - leaderboard = predictor.leaderboard(test_data, silent=True) + test_data_full = pd.concat([train_data, test_data_future.set_index([dataset.id_column, dataset.timestamp_column])]) + leaderboard = predictor.leaderboard(test_data_full, 
silent=True) with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 1000): log.info(leaderboard) - num_models_trained = len(leaderboard) - save_artifacts(predictor=predictor, leaderboard=leaderboard, config=config) shutil.rmtree(predictor.path, ignore_errors=True) - quantiles = predictions.drop(columns=['mean']).reset_index(drop=True) - period_length = 1 # TODO: This period length could be adapted to the Dataset, but then we need to pass this information as well. As of now this works. - - # we aim to calculate the mean period error from the past for each sequence: 1/N sum_{i=1}^N |x(t_i) - x(t_i - T)| - # 1. retrieve item_ids for each sequence/item - #dataset..X /. y - item_ids, inverse_item_ids = np.unique(test_data.reset_index()["item_id"].squeeze().to_numpy(), return_index=False, return_inverse=True) - # 2. capture sequences in a list - y_past = [test_data[label].squeeze().to_numpy()[inverse_item_ids == i][:-prediction_length] for i in range(len(item_ids))] - # 3. calculate period error per sequence - y_past_period_error = [np.abs(y_past_item[period_length:] - y_past_item[:-period_length]).mean() for y_past_item in y_past] - # 4. repeat period error for each sequence, to save one for each element - y_past_period_error_rep = np.repeat(y_past_period_error, prediction_length) - - optional_columns = quantiles - optional_columns = optional_columns.assign(y_past_period_error=y_past_period_error_rep) + # Kill child processes spawned by Joblib to avoid spam in the AMLB log + get_reusable_executor().shutdown(wait=True) return result(output_file=config.output_predictions_file, predictions=predictions_only, truth=truth_only, - probabilities=None, - probabilities_labels=None, target_is_encoded=False, - models_count=num_models_trained, + models_count=len(leaderboard), training_duration=training.duration, predict_duration=predict.duration, - optional_columns=optional_columns) - -def load_data(train_path, test_path, timestamp_column, id_column): - - train_df = pd.read_csv( - train_path, - parse_dates=[timestamp_column], - ) - - train_data = TimeSeriesDataFrame.from_data_frame( - train_df, - id_column=id_column, - timestamp_column=timestamp_column, - ) - - test_df = pd.read_csv( - test_path, - parse_dates=[timestamp_column], - ) - - test_data = TimeSeriesDataFrame.from_data_frame( - test_df, - id_column=id_column, - timestamp_column=timestamp_column, - ) - - return train_data, test_data + optional_columns=pd.DataFrame(optional_columns)) def get_eval_metric(config): @@ -148,6 +105,16 @@ def get_eval_metric(config): return eval_metric +def get_point_forecast(predictions, metric): + # Return median for metrics optimized by median, if possible + if metric.lower() in ["rmse", "mse"] or "0.5" not in predictions.columns: + log.info("Using mean as point forecast") + return predictions["mean"].values + else: + log.info("Using median as point forecast") + return predictions["0.5"].values + + def save_artifacts(predictor, leaderboard, config): artifacts = config.framework_params.get('_save_artifacts', ['leaderboard']) try: diff --git a/frameworks/FEDOT/__init__.py b/frameworks/FEDOT/__init__.py new file mode 100644 index 000000000..86e68de98 --- /dev/null +++ b/frameworks/FEDOT/__init__.py @@ -0,0 +1,25 @@ +from amlb.benchmark import TaskConfig +from amlb.data import Dataset +from amlb.utils import call_script_in_same_dir + + +def setup(*args, **kwargs): + call_script_in_same_dir(__file__, "setup.sh", *args, **kwargs) + + +def run(dataset: Dataset, config: TaskConfig): + 
from frameworks.shared.caller import run_in_venv + + data = dict( + train=dict( + X=dataset.train.X, + y=dataset.train.y + ), + test=dict( + X=dataset.test.X, + y=dataset.test.y + ) + ) + + return run_in_venv(__file__, "exec.py", + input_data=data, dataset=dataset, config=config) diff --git a/frameworks/FEDOT/exec.py b/frameworks/FEDOT/exec.py new file mode 100644 index 000000000..b57448949 --- /dev/null +++ b/frameworks/FEDOT/exec.py @@ -0,0 +1,99 @@ +import logging +import os +from pathlib import Path + +from fedot.api.main import Fedot + +from frameworks.shared.callee import call_run, result, output_subdir +from frameworks.shared.utils import Timer + +log = logging.getLogger(__name__) + + +def run(dataset, config): + log.info("\n**** FEDOT ****\n") + + is_classification = config.type == 'classification' + # Mapping of benchmark metrics to FEDOT metrics + metrics_mapping = dict( + acc='acc', + auc='roc_auc', + f1='f1', + logloss='logloss', + mae='mae', + mse='mse', + msle='msle', + r2='r2', + rmse='rmse' + ) + scoring_metric = metrics_mapping.get(config.metric, None) + + if scoring_metric is None: + log.warning("Performance metric %s not supported.", config.metric) + + training_params = {"preset": "best_quality", "n_jobs": config.cores} + training_params |= {k: v for k, v in config.framework_params.items() if not k.startswith('_')} + n_jobs = training_params["n_jobs"] + + log.info('Running FEDOT with a maximum time of %ss on %s cores, optimizing %s.', + config.max_runtime_seconds, n_jobs, scoring_metric) + runtime_min = config.max_runtime_seconds / 60 + + fedot = Fedot(problem=config.type, timeout=runtime_min, metric=scoring_metric, seed=config.seed, + max_pipeline_fit_time=runtime_min / 10, **training_params) + + with Timer() as training: + fedot.fit(features=dataset.train.X, target=dataset.train.y) + + log.info('Predicting on the test set.') + with Timer() as predict: + predictions = fedot.predict(features=dataset.test.X) + probabilities = None + if is_classification: + probabilities = fedot.predict_proba(features=dataset.test.X, probs_for_all_classes=True) + + save_artifacts(fedot, config) + + return result(output_file=config.output_predictions_file, + predictions=predictions, + truth=dataset.test.y, + probabilities=probabilities, + target_is_encoded=False, + models_count=fedot.current_pipeline.length, + training_duration=training.duration, + predict_duration=predict.duration) + + +def save_artifacts(automl, config): + + artifacts = config.framework_params.get('_save_artifacts', []) + if 'models' in artifacts: + try: + models_dir = output_subdir('models', config) + models_file = os.path.join(models_dir, 'model.json') + automl.current_pipeline.save(models_file) + except Exception as e: + log.info(f"Error when saving 'models': {e}.", exc_info=True) + + if 'info' in artifacts: + try: + info_dir = output_subdir("info", config) + if automl.history: + automl.history.save(os.path.join(info_dir, 'history.json')) + else: + log.info(f"There is no optimization history info to save.") + except Exception as e: + log.info(f"Error when saving info about optimisation history: {e}.", exc_info=True) + + if 'leaderboard' in artifacts: + try: + leaderboard_dir = output_subdir("leaderboard", config) + if automl.history: + lb = automl.history.get_leaderboard() + Path(os.path.join(leaderboard_dir, "leaderboard.csv")).write_text(lb) + except Exception as e: + log.info(f"Error when saving 'leaderboard': {e}.", exc_info=True) + + +if __name__ == '__main__': + call_run(run) diff --git 
a/frameworks/FEDOT/setup.sh b/frameworks/FEDOT/setup.sh new file mode 100644 index 000000000..a89781583 --- /dev/null +++ b/frameworks/FEDOT/setup.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash +HERE=$(dirname "$0") +VERSION=${1:-"stable"} +REPO=${2:-"https://github.com/aimclub/FEDOT.git"} +PKG=${3:-"fedot"} +if [[ "$VERSION" == "latest" ]]; then + VERSION="master" +fi + +# creating local venv +. ${HERE}/../shared/setup.sh ${HERE} true + +RAWREPO=$(echo ${REPO} | sed "s/github\.com/raw\.githubusercontent\.com/") +if [[ "$VERSION" == "stable" ]]; then + PIP install --no-cache-dir -U ${PKG} + echo GET_VERSION_STABLE + VERSION=$(PY -c "${GET_VERSION_STABLE}") +elif [[ "$VERSION" =~ ^[0-9] ]]; then + PIP install --no-cache-dir -U ${PKG}==${VERSION} +else + TARGET_DIR="${HERE}/lib/${PKG}" + rm -Rf ${TARGET_DIR} + + if [[ "$VERSION" =~ ^# ]]; then + COMMIT="${VERSION:1}" + else + # find the latest commit to the VERSION branch + COMMIT=$(git ls-remote "${REPO}" | grep "refs/heads/${VERSION}" | cut -f 1) + DEPTH="--depth 1 --branch ${VERSION}" + fi + + git clone --recurse-submodules --shallow-submodules ${DEPTH} ${REPO} ${TARGET_DIR} + cd ${TARGET_DIR} + git checkout "${COMMIT}" + git submodule update --init --recursive + cd ${HERE} + PIP install -U -e ${TARGET_DIR} +fi + +installed="${HERE}/.setup/installed" +PY -c "from fedot import __version__; print(__version__)" >> "$installed" +if [[ -n $COMMIT ]]; then + truncate -s-1 "$installed" + echo "#${COMMIT}" >> "$installed" +fi diff --git a/frameworks/NaiveAutoML/__init__.py b/frameworks/NaiveAutoML/__init__.py index 32a48deed..889396d4c 100644 --- a/frameworks/NaiveAutoML/__init__.py +++ b/frameworks/NaiveAutoML/__init__.py @@ -1,6 +1,6 @@ from amlb.benchmark import TaskConfig from amlb.data import Dataset -from amlb.utils import call_script_in_same_dir, unsparsify +from amlb.utils import call_script_in_same_dir def setup(*args, **kwargs): @@ -14,11 +14,11 @@ def run(dataset: Dataset, config: TaskConfig): target=dataset.target.name, train=dict( X=dataset.train.X, - y=unsparsify(dataset.train.y_enc), + y=dataset.train.y_enc, ), test=dict( X=dataset.test.X, - y=unsparsify(dataset.test.y_enc), + y=dataset.test.y_enc, ), ) if config.measure_inference_time: diff --git a/frameworks/NaiveAutoML/exec.py b/frameworks/NaiveAutoML/exec.py index 4f0c00050..aa9072156 100644 --- a/frameworks/NaiveAutoML/exec.py +++ b/frameworks/NaiveAutoML/exec.py @@ -45,11 +45,9 @@ def run(dataset, config): if scoring_metric is None: raise ValueError(f"Performance metric {config.metric} not supported.") - is_classification = (config.type == 'classification') kwargs = dict( scoring=scoring_metric, num_cpus=config.cores, - task_type=config.type, ) # NAML wasn't really designed to run for long time constraints, so we # make it easy to run NAML with its default configuration for time/iterations. 
@@ -65,13 +63,14 @@ def run(dataset, config): log.info("`_use_default_time_and_iterations` is set, ignoring time constraint.") kwargs |= {k: v for k, v in config.framework_params.items() if not k.startswith("_")} - log.info(f"Initializing NaiveAutoml(**{kwargs})") automl = NaiveAutoML(**kwargs) with Timer() as training: automl.fit(dataset.train.X, dataset.train.y) log.info(f"Finished fit in {training.duration}s.") + is_classification = (config.type == 'classification') + def infer(data: Union[str, pd.DataFrame]): test_data = pd.read_parquet(data) if isinstance(data, str) else data predict_fn = automl.predict_proba if is_classification else automl.predict diff --git a/frameworks/autosklearn/exec.py b/frameworks/autosklearn/exec.py index e2e7023dc..dda275f2a 100644 --- a/frameworks/autosklearn/exec.py +++ b/frameworks/autosklearn/exec.py @@ -8,7 +8,6 @@ from typing import Union import pandas as pd -import pandas.api.types from numpy.random import default_rng os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir() @@ -69,10 +68,7 @@ def run(dataset, config): ) log.info("Environment: %s", os.environ) - def is_sparse(data: pd.DataFrame) -> bool: - return any(pd.api.types.is_sparse(data[column]) for column in data) - - use_pandas = (askl_version >= version.parse("0.15")) and not is_sparse(dataset.train.X) + use_pandas = askl_version >= version.parse("0.15") X_train = dataset.train.X if use_pandas else dataset.train.X_enc y_train = dataset.train.y if use_pandas else dataset.train.y_enc predictors_type = dataset.predictors_type diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 000000000..5f2dd0f6c --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,51 @@ +site_name: AutoML Benchmark +theme: + name: material + features: + - navigation.expand + - navigation.indexes + - content.tabs.link + - content.code.annotate + icon: + logo: material/home + admonition: + windows: fontawesome/brands/windows +extra: + homepage: WEBSITE/index.html + +nav: + - index.md + - getting_started.md + - Using the Benchmark: + - Parameters: using/parameters.md + - Configuration: using/configuration.md + - using/aws.md + - using/result_analysis.md + - Extending the Benchmark: + - extending/index.md + - extending/benchmark.md + - extending/constraint.md + - Frameworks: extending/framework.md + - FAQ: faq.md + +extra_css: + - stylesheets/extra.css + +markdown_extensions: + - def_list + - admonition + - toc + - attr_list + - pymdownx.details + - pymdownx.superfences + - pymdownx.snippets + - pymdownx.inlinehilite + - pymdownx.highlight: + anchor_linenums: true + line_spans: __span + pygments_lang_class: true + - pymdownx.tabbed: + alternate_style: true + - pymdownx.emoji: + emoji_index: !!python/name:materialx.emoji.twemoji + emoji_generator: !!python/name:materialx.emoji.to_svg \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..78d101b38 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,37 @@ +[tool.mypy] +files=[ + "amlb/**/*.py" +] +python_version = "3.9" +# Required because the normal usage pattern of namespaces raises [attr-defined] errors. +# I can't a way to disable [attr-defined] errors for `Namespace` only. 
+disable_error_code = "attr-defined" + +[[tool.mypy.overrides]] +ignore_errors = false +module = "amlb.utils.*" + + +[[tool.mypy.overrides]] +ignore_errors = true +module = "amlb.benchmarks.*" + + +[[tool.mypy.overrides]] +ignore_errors = true +module = "amlb.datasets.*" + + +[[tool.mypy.overrides]] +ignore_errors = true +module = "amlb.frameworks.*" + + +[[tool.mypy.overrides]] +ignore_errors = true +module = "amlb.runners.*" + + +[[tool.mypy.overrides]] +ignore_errors = true +module = "amlb.*" \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt index 4ff2a0f94..ab42f00ce 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,3 +1,7 @@ pytest pytest-mock -pip-tools \ No newline at end of file +pip-tools + +types-psutil +pandas-stubs +mypy diff --git a/resources/benchmarks/timeseries.yaml b/resources/benchmarks/timeseries.yaml index 26af06497..586cf738a 100644 --- a/resources/benchmarks/timeseries.yaml +++ b/resources/benchmarks/timeseries.yaml @@ -1,13 +1,16 @@ --- -- name: covid +- name: m4_hourly dataset: - train: https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/train.csv - test: https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/test.csv - target: ConfirmedCases + path: https://autogluon.s3.amazonaws.com/datasets/timeseries/m4_hourly/test.csv type: timeseries - forecast_range_in_steps: 19 - id_column: name - timestamp_column: Date + freq: H + forecast_horizon_in_steps: 48 + seasonality: 24 + target: target + id_column: item_id + timestamp_column: timestamp + metric: [mase, smape, mape, rmse, mql, wql, sql] + quantile_levels: [0.05, 0.5, 0.95] - folds: 1 + folds: 2 diff --git a/resources/config.yaml b/resources/config.yaml index faae3657e..a3b7809b8 100644 --- a/resources/config.yaml +++ b/resources/config.yaml @@ -54,13 +54,15 @@ benchmarks: # configuration namespace for the benchmarks def binary: ['auc', 'logloss', 'acc', 'balacc'] # available metrics: auc (AUC), acc (Accuracy), balacc (Balanced Accuracy), pr_auc (Precision Recall AUC), logloss (Log Loss), f1, f2, f05 (F-beta scores with beta=1, 2, or 0.5), max_pce, mean_pce (Max/Mean Per-Class Error). multiclass: ['logloss', 'acc', 'balacc'] # available metrics: same as for binary, except auc, replaced by auc_ovo (AUC One-vs-One), auc_ovr (AUC One-vs-Rest). AUC metrics and F-beta metrics are computed with weighted average. regression: ['rmse', 'r2', 'mae'] # available metrics: mae (Mean Absolute Error), mse (Mean Squared Error), msle (Mean Squared Logarithmic Error), rmse (Root Mean Square Error), rmsle (Root Mean Square Logarithmic Error), r2 (R^2). - timeseries: ['mase', 'mape', 'smape', 'rmse', 'mse', 'nrmse', 'wape', 'ncrps'] + timeseries: ['mase', 'mape', 'smape', 'wape', 'rmse', 'mse', 'mql', 'wql', 'sql'] # available metrics: mase (Mean Absolute Scaled Error), mape (Mean Absolute Percentage Error), smape (Symmetric Mean Absolute Percentage Error), wape (Weighted Absolute Percentage Error), rmse (Root Mean Square Error), mse (Mean Square Error), mql (Mean Quantile Loss), wql (Weighted Quantile Loss), sql (Scaled Quantile Loss). + defaults: # the default constraints, usually overridden by a constraint. folds: 10 # the amount of fold-runs executed for each dataset. max_runtime_seconds: 3600 # default time allocated to the framework to train a model. cores: -1 # default amount of cores used for each automl task. If <= 0, will try to use all cores. max_mem_size_mb: -1 # default amount of memory assigned to each automl task. 
If <= 0, then the amount of memory is computed from os available memory. min_vol_size_mb: -1 # default minimum amount of free space required on the volume. If <= 0, skips verification. + quantile_levels: [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] # default quantile_levels for timeseries problem type job_scheduler: # configuration namespace exit_on_job_failure: # if true, the entire run will be aborted on the first job failure (mainly used for testing) : set by caller (runbenchmark.py) diff --git a/resources/frameworks.yaml b/resources/frameworks.yaml index dbaa0d1ac..da2881ce0 100644 --- a/resources/frameworks.yaml +++ b/resources/frameworks.yaml @@ -203,17 +203,15 @@ TPOT: # population_size: 25 # verbosity: 2 -#################################### -### TimeSeries AutoML frameworks ### -#################################### - -AutoGluonTS: - extends: AutoGluon - version: "stable" +FEDOT: + version: 'master' description: | - AutoGluon-TimeSeries - setup_env: - MODULE: timeseries + FEDOT is a AutoML tool that optimizes composite machine learning pipelines using evolutionary optimisation. + project: https://github.com/aimclub/FEDOT + refs: + - https://doi.org/10.1016/j.future.2021.08.022 +# params: +# _save_artifacts: ['leaderboard', 'models', 'info'] ####################################### ### Non AutoML reference frameworks ### diff --git a/resources/frameworks_2023Q2.yaml b/resources/frameworks_2023Q2.yaml index eab8170cc..1ac098b6f 100644 --- a/resources/frameworks_2023Q2.yaml +++ b/resources/frameworks_2023Q2.yaml @@ -97,8 +97,7 @@ mlr3automl: project: https://github.com/a-hanf/mlr3automl NaiveAutoML: - repo: https://github.com/pgijsbers/naiveautoml - version: '#1af07c22b64510df7700798f4c0f32c3ca76ab93' + version: '0.0.27' TPOT: version: '0.12.0' diff --git a/resources/frameworks_latest.yaml b/resources/frameworks_latest.yaml index 7a5ca78a5..44f924232 100644 --- a/resources/frameworks_latest.yaml +++ b/resources/frameworks_latest.yaml @@ -86,17 +86,8 @@ oboe: TPOT: version: 'latest' -#################################### -### TimeSeries AutoML frameworks ### -#################################### - -AutoGluonTS: - extends: AutoGluon - version: "latest" - description: | - AutoGluon-TimeSeries - setup_env: - MODULE: timeseries +FEDOT: + version: 'latest' ####################################### ### Non AutoML reference frameworks ### diff --git a/resources/frameworks_stable.yaml b/resources/frameworks_stable.yaml index 3de7da369..d6b5a1ce0 100644 --- a/resources/frameworks_stable.yaml +++ b/resources/frameworks_stable.yaml @@ -91,8 +91,8 @@ oboe: TPOT: version: 'stable' - - +FEDOT: + version: 'stable' ####################################### ### Non AutoML reference frameworks ### diff --git a/tests/unit/amlb/datasets/file/resources/m4_hourly_subset.csv b/tests/unit/amlb/datasets/file/resources/m4_hourly_subset.csv new file mode 100644 index 000000000..7ee20e07d --- /dev/null +++ b/tests/unit/amlb/datasets/file/resources/m4_hourly_subset.csv @@ -0,0 +1,301 @@ +item_id,timestamp,target +T1,2015-07-01 12:00:00,605.0 +T1,2015-07-01 13:00:00,586.0 +T1,2015-07-01 14:00:00,586.0 +T1,2015-07-01 15:00:00,559.0 +T1,2015-07-01 16:00:00,511.0 +T1,2015-07-01 17:00:00,443.0 +T1,2015-07-01 18:00:00,422.0 +T1,2015-07-01 19:00:00,395.0 +T1,2015-07-01 20:00:00,382.0 +T1,2015-07-01 21:00:00,370.0 +T1,2015-07-01 22:00:00,383.0 +T1,2015-07-01 23:00:00,397.0 +T1,2015-07-02 00:00:00,420.0 +T1,2015-07-02 01:00:00,455.0 +T1,2015-07-02 02:00:00,493.0 +T1,2015-07-02 03:00:00,554.0 +T1,2015-07-02 04:00:00,610.0 
+T1,2015-07-02 05:00:00,666.0 +T1,2015-07-02 06:00:00,715.0 +T1,2015-07-02 07:00:00,755.0 +T1,2015-07-02 08:00:00,778.0 +T1,2015-07-02 09:00:00,794.0 +T1,2015-07-02 10:00:00,806.0 +T1,2015-07-02 11:00:00,808.0 +T1,2015-07-02 12:00:00,776.0 +T1,2015-07-02 13:00:00,723.0 +T1,2015-07-02 14:00:00,709.0 +T1,2015-07-02 15:00:00,660.0 +T1,2015-07-02 16:00:00,585.0 +T1,2015-07-02 17:00:00,527.0 +T1,2015-07-02 18:00:00,462.0 +T1,2015-07-02 19:00:00,437.0 +T1,2015-07-02 20:00:00,413.0 +T1,2015-07-02 21:00:00,407.0 +T1,2015-07-02 22:00:00,404.0 +T1,2015-07-02 23:00:00,420.0 +T1,2015-07-03 00:00:00,441.0 +T1,2015-07-03 01:00:00,471.0 +T1,2015-07-03 02:00:00,526.0 +T1,2015-07-03 03:00:00,571.0 +T1,2015-07-03 04:00:00,612.0 +T1,2015-07-03 05:00:00,635.0 +T1,2015-07-03 06:00:00,613.0 +T1,2015-07-03 07:00:00,608.0 +T1,2015-07-03 08:00:00,614.0 +T1,2015-07-03 09:00:00,637.0 +T1,2015-07-03 10:00:00,669.0 +T1,2015-07-03 11:00:00,683.0 +T1,2015-07-03 12:00:00,687.0 +T1,2015-07-03 13:00:00,660.0 +T1,2015-07-03 14:00:00,661.0 +T1,2015-07-03 15:00:00,632.0 +T1,2015-07-03 16:00:00,573.0 +T1,2015-07-03 17:00:00,521.0 +T1,2015-07-03 18:00:00,481.0 +T1,2015-07-03 19:00:00,452.0 +T1,2015-07-03 20:00:00,447.0 +T1,2015-07-03 21:00:00,425.0 +T1,2015-07-03 22:00:00,427.0 +T1,2015-07-03 23:00:00,441.0 +T1,2015-07-04 00:00:00,438.0 +T1,2015-07-04 01:00:00,472.0 +T1,2015-07-04 02:00:00,528.0 +T1,2015-07-04 03:00:00,596.0 +T1,2015-07-04 04:00:00,661.0 +T1,2015-07-04 05:00:00,708.0 +T1,2015-07-04 06:00:00,754.0 +T1,2015-07-04 07:00:00,781.0 +T1,2015-07-04 08:00:00,808.0 +T1,2015-07-04 09:00:00,819.0 +T1,2015-07-04 10:00:00,820.0 +T1,2015-07-04 11:00:00,801.0 +T1,2015-07-04 12:00:00,770.0 +T1,2015-07-04 13:00:00,717.0 +T1,2015-07-04 14:00:00,697.0 +T1,2015-07-04 15:00:00,655.0 +T1,2015-07-04 16:00:00,607.0 +T1,2015-07-04 17:00:00,552.0 +T1,2015-07-04 18:00:00,512.0 +T1,2015-07-04 19:00:00,475.0 +T1,2015-07-04 20:00:00,452.0 +T1,2015-07-04 21:00:00,436.0 +T1,2015-07-04 22:00:00,429.0 +T1,2015-07-04 23:00:00,433.0 +T1,2015-07-05 00:00:00,430.0 +T1,2015-07-05 01:00:00,472.0 +T1,2015-07-05 02:00:00,536.0 +T1,2015-07-05 03:00:00,611.0 +T1,2015-07-05 04:00:00,662.0 +T1,2015-07-05 05:00:00,705.0 +T1,2015-07-05 06:00:00,707.0 +T1,2015-07-05 07:00:00,718.0 +T1,2015-07-05 08:00:00,733.0 +T1,2015-07-05 09:00:00,741.0 +T1,2015-07-05 10:00:00,737.0 +T1,2015-07-05 11:00:00,710.0 +T1,2015-07-05 12:00:00,647.0 +T1,2015-07-05 13:00:00,593.0 +T1,2015-07-05 14:00:00,564.0 +T1,2015-07-05 15:00:00,528.0 +T2,2015-07-01 12:00:00,3124.0 +T2,2015-07-01 13:00:00,2990.0 +T2,2015-07-01 14:00:00,2862.0 +T2,2015-07-01 15:00:00,2809.0 +T2,2015-07-01 16:00:00,2544.0 +T2,2015-07-01 17:00:00,2201.0 +T2,2015-07-01 18:00:00,1996.0 +T2,2015-07-01 19:00:00,1861.0 +T2,2015-07-01 20:00:00,1735.0 +T2,2015-07-01 21:00:00,1713.0 +T2,2015-07-01 22:00:00,1724.0 +T2,2015-07-01 23:00:00,1798.0 +T2,2015-07-02 00:00:00,1891.0 +T2,2015-07-02 01:00:00,2037.0 +T2,2015-07-02 02:00:00,2102.0 +T2,2015-07-02 03:00:00,2163.0 +T2,2015-07-02 04:00:00,2269.0 +T2,2015-07-02 05:00:00,2404.0 +T2,2015-07-02 06:00:00,2515.0 +T2,2015-07-02 07:00:00,2621.0 +T2,2015-07-02 08:00:00,2745.0 +T2,2015-07-02 09:00:00,2816.0 +T2,2015-07-02 10:00:00,2938.0 +T2,2015-07-02 11:00:00,3022.0 +T2,2015-07-02 12:00:00,2976.0 +T2,2015-07-02 13:00:00,2892.0 +T2,2015-07-02 14:00:00,2784.0 +T2,2015-07-02 15:00:00,2725.0 +T2,2015-07-02 16:00:00,2530.0 +T2,2015-07-02 17:00:00,2211.0 +T2,2015-07-02 18:00:00,1995.0 +T2,2015-07-02 19:00:00,1833.0 +T2,2015-07-02 20:00:00,1768.0 +T2,2015-07-02 21:00:00,1712.0 
+T2,2015-07-02 22:00:00,1707.0 +T2,2015-07-02 23:00:00,1762.0 +T2,2015-07-03 00:00:00,1880.0 +T2,2015-07-03 01:00:00,1995.0 +T2,2015-07-03 02:00:00,2134.0 +T2,2015-07-03 03:00:00,2227.0 +T2,2015-07-03 04:00:00,2376.0 +T2,2015-07-03 05:00:00,2477.0 +T2,2015-07-03 06:00:00,2597.0 +T2,2015-07-03 07:00:00,2691.0 +T2,2015-07-03 08:00:00,2751.0 +T2,2015-07-03 09:00:00,2782.0 +T2,2015-07-03 10:00:00,2810.0 +T2,2015-07-03 11:00:00,2781.0 +T2,2015-07-03 12:00:00,2693.0 +T2,2015-07-03 13:00:00,2567.0 +T2,2015-07-03 14:00:00,2490.0 +T2,2015-07-03 15:00:00,2448.0 +T2,2015-07-03 16:00:00,2277.0 +T2,2015-07-03 17:00:00,1997.0 +T2,2015-07-03 18:00:00,1785.0 +T2,2015-07-03 19:00:00,1689.0 +T2,2015-07-03 20:00:00,1562.0 +T2,2015-07-03 21:00:00,1560.0 +T2,2015-07-03 22:00:00,1505.0 +T2,2015-07-03 23:00:00,1538.0 +T2,2015-07-04 00:00:00,1641.0 +T2,2015-07-04 01:00:00,1735.0 +T2,2015-07-04 02:00:00,1950.0 +T2,2015-07-04 03:00:00,2138.0 +T2,2015-07-04 04:00:00,2303.0 +T2,2015-07-04 05:00:00,2432.0 +T2,2015-07-04 06:00:00,2528.0 +T2,2015-07-04 07:00:00,2656.0 +T2,2015-07-04 08:00:00,2740.0 +T2,2015-07-04 09:00:00,2803.0 +T2,2015-07-04 10:00:00,2855.0 +T2,2015-07-04 11:00:00,2880.0 +T2,2015-07-04 12:00:00,2778.0 +T2,2015-07-04 13:00:00,2637.0 +T2,2015-07-04 14:00:00,2479.0 +T2,2015-07-04 15:00:00,2381.0 +T2,2015-07-04 16:00:00,2228.0 +T2,2015-07-04 17:00:00,2037.0 +T2,2015-07-04 18:00:00,1758.0 +T2,2015-07-04 19:00:00,1648.0 +T2,2015-07-04 20:00:00,1560.0 +T2,2015-07-04 21:00:00,1508.0 +T2,2015-07-04 22:00:00,1486.0 +T2,2015-07-04 23:00:00,1486.0 +T2,2015-07-05 00:00:00,1515.0 +T2,2015-07-05 01:00:00,1623.0 +T2,2015-07-05 02:00:00,1919.0 +T2,2015-07-05 03:00:00,2172.0 +T2,2015-07-05 04:00:00,2416.0 +T2,2015-07-05 05:00:00,2605.0 +T2,2015-07-05 06:00:00,2755.0 +T2,2015-07-05 07:00:00,2822.0 +T2,2015-07-05 08:00:00,2917.0 +T2,2015-07-05 09:00:00,2997.0 +T2,2015-07-05 10:00:00,3060.0 +T2,2015-07-05 11:00:00,3046.0 +T2,2015-07-05 12:00:00,2942.0 +T2,2015-07-05 13:00:00,2758.0 +T2,2015-07-05 14:00:00,2487.0 +T2,2015-07-05 15:00:00,2349.0 +T3,2015-07-01 12:00:00,1828.0 +T3,2015-07-01 13:00:00,1806.0 +T3,2015-07-01 14:00:00,1897.0 +T3,2015-07-01 15:00:00,1750.0 +T3,2015-07-01 16:00:00,1679.0 +T3,2015-07-01 17:00:00,1620.0 +T3,2015-07-01 18:00:00,1463.0 +T3,2015-07-01 19:00:00,1342.0 +T3,2015-07-01 20:00:00,1192.0 +T3,2015-07-01 21:00:00,1108.0 +T3,2015-07-01 22:00:00,1058.0 +T3,2015-07-01 23:00:00,1024.0 +T3,2015-07-02 00:00:00,1031.0 +T3,2015-07-02 01:00:00,1091.0 +T3,2015-07-02 02:00:00,1208.0 +T3,2015-07-02 03:00:00,1337.0 +T3,2015-07-02 04:00:00,1435.0 +T3,2015-07-02 05:00:00,1515.0 +T3,2015-07-02 06:00:00,1593.0 +T3,2015-07-02 07:00:00,1667.0 +T3,2015-07-02 08:00:00,1753.0 +T3,2015-07-02 09:00:00,1768.0 +T3,2015-07-02 10:00:00,1823.0 +T3,2015-07-02 11:00:00,1813.0 +T3,2015-07-02 12:00:00,1842.0 +T3,2015-07-02 13:00:00,1838.0 +T3,2015-07-02 14:00:00,1800.0 +T3,2015-07-02 15:00:00,1761.0 +T3,2015-07-02 16:00:00,1670.0 +T3,2015-07-02 17:00:00,1609.0 +T3,2015-07-02 18:00:00,1467.0 +T3,2015-07-02 19:00:00,1309.0 +T3,2015-07-02 20:00:00,1189.0 +T3,2015-07-02 21:00:00,1102.0 +T3,2015-07-02 22:00:00,1054.0 +T3,2015-07-02 23:00:00,1017.0 +T3,2015-07-03 00:00:00,1014.0 +T3,2015-07-03 01:00:00,1063.0 +T3,2015-07-03 02:00:00,1187.0 +T3,2015-07-03 03:00:00,1314.0 +T3,2015-07-03 04:00:00,1424.0 +T3,2015-07-03 05:00:00,1497.0 +T3,2015-07-03 06:00:00,1586.0 +T3,2015-07-03 07:00:00,1659.0 +T3,2015-07-03 08:00:00,1722.0 +T3,2015-07-03 09:00:00,1781.0 +T3,2015-07-03 10:00:00,1805.0 +T3,2015-07-03 11:00:00,1831.0 +T3,2015-07-03 
12:00:00,1851.0 +T3,2015-07-03 13:00:00,1831.0 +T3,2015-07-03 14:00:00,1809.0 +T3,2015-07-03 15:00:00,1755.0 +T3,2015-07-03 16:00:00,1685.0 +T3,2015-07-03 17:00:00,1618.0 +T3,2015-07-03 18:00:00,1487.0 +T3,2015-07-03 19:00:00,1311.0 +T3,2015-07-03 20:00:00,1180.0 +T3,2015-07-03 21:00:00,1087.0 +T3,2015-07-03 22:00:00,1033.0 +T3,2015-07-03 23:00:00,1002.0 +T3,2015-07-04 00:00:00,991.0 +T3,2015-07-04 01:00:00,1005.0 +T3,2015-07-04 02:00:00,1071.0 +T3,2015-07-04 03:00:00,1191.0 +T3,2015-07-04 04:00:00,1307.0 +T3,2015-07-04 05:00:00,1407.0 +T3,2015-07-04 06:00:00,1495.0 +T3,2015-07-04 07:00:00,1576.0 +T3,2015-07-04 08:00:00,1635.0 +T3,2015-07-04 09:00:00,1688.0 +T3,2015-07-04 10:00:00,1711.0 +T3,2015-07-04 11:00:00,1741.0 +T3,2015-07-04 12:00:00,1768.0 +T3,2015-07-04 13:00:00,1765.0 +T3,2015-07-04 14:00:00,1738.0 +T3,2015-07-04 15:00:00,1684.0 +T3,2015-07-04 16:00:00,1605.0 +T3,2015-07-04 17:00:00,1553.0 +T3,2015-07-04 18:00:00,1433.0 +T3,2015-07-04 19:00:00,1297.0 +T3,2015-07-04 20:00:00,1177.0 +T3,2015-07-04 21:00:00,1082.0 +T3,2015-07-04 22:00:00,1028.0 +T3,2015-07-04 23:00:00,987.0 +T3,2015-07-05 00:00:00,970.0 +T3,2015-07-05 01:00:00,959.0 +T3,2015-07-05 02:00:00,993.0 +T3,2015-07-05 03:00:00,1083.0 +T3,2015-07-05 04:00:00,1215.0 +T3,2015-07-05 05:00:00,1310.0 +T3,2015-07-05 06:00:00,1415.0 +T3,2015-07-05 07:00:00,1479.0 +T3,2015-07-05 08:00:00,1525.0 +T3,2015-07-05 09:00:00,1599.0 +T3,2015-07-05 10:00:00,1623.0 +T3,2015-07-05 11:00:00,1652.0 +T3,2015-07-05 12:00:00,1671.0 +T3,2015-07-05 13:00:00,1664.0 +T3,2015-07-05 14:00:00,1637.0 +T3,2015-07-05 15:00:00,1574.0 diff --git a/tests/unit/amlb/datasets/file/resources/m4_hourly_subset_nondefault_cols.csv b/tests/unit/amlb/datasets/file/resources/m4_hourly_subset_nondefault_cols.csv new file mode 100644 index 000000000..bc2b6c24f --- /dev/null +++ b/tests/unit/amlb/datasets/file/resources/m4_hourly_subset_nondefault_cols.csv @@ -0,0 +1,301 @@ +CustomId,CustomTimestamp,CustomTarget +T1,2015-07-01 12:00:00,605.0 +T1,2015-07-01 13:00:00,586.0 +T1,2015-07-01 14:00:00,586.0 +T1,2015-07-01 15:00:00,559.0 +T1,2015-07-01 16:00:00,511.0 +T1,2015-07-01 17:00:00,443.0 +T1,2015-07-01 18:00:00,422.0 +T1,2015-07-01 19:00:00,395.0 +T1,2015-07-01 20:00:00,382.0 +T1,2015-07-01 21:00:00,370.0 +T1,2015-07-01 22:00:00,383.0 +T1,2015-07-01 23:00:00,397.0 +T1,2015-07-02 00:00:00,420.0 +T1,2015-07-02 01:00:00,455.0 +T1,2015-07-02 02:00:00,493.0 +T1,2015-07-02 03:00:00,554.0 +T1,2015-07-02 04:00:00,610.0 +T1,2015-07-02 05:00:00,666.0 +T1,2015-07-02 06:00:00,715.0 +T1,2015-07-02 07:00:00,755.0 +T1,2015-07-02 08:00:00,778.0 +T1,2015-07-02 09:00:00,794.0 +T1,2015-07-02 10:00:00,806.0 +T1,2015-07-02 11:00:00,808.0 +T1,2015-07-02 12:00:00,776.0 +T1,2015-07-02 13:00:00,723.0 +T1,2015-07-02 14:00:00,709.0 +T1,2015-07-02 15:00:00,660.0 +T1,2015-07-02 16:00:00,585.0 +T1,2015-07-02 17:00:00,527.0 +T1,2015-07-02 18:00:00,462.0 +T1,2015-07-02 19:00:00,437.0 +T1,2015-07-02 20:00:00,413.0 +T1,2015-07-02 21:00:00,407.0 +T1,2015-07-02 22:00:00,404.0 +T1,2015-07-02 23:00:00,420.0 +T1,2015-07-03 00:00:00,441.0 +T1,2015-07-03 01:00:00,471.0 +T1,2015-07-03 02:00:00,526.0 +T1,2015-07-03 03:00:00,571.0 +T1,2015-07-03 04:00:00,612.0 +T1,2015-07-03 05:00:00,635.0 +T1,2015-07-03 06:00:00,613.0 +T1,2015-07-03 07:00:00,608.0 +T1,2015-07-03 08:00:00,614.0 +T1,2015-07-03 09:00:00,637.0 +T1,2015-07-03 10:00:00,669.0 +T1,2015-07-03 11:00:00,683.0 +T1,2015-07-03 12:00:00,687.0 +T1,2015-07-03 13:00:00,660.0 +T1,2015-07-03 14:00:00,661.0 +T1,2015-07-03 15:00:00,632.0 +T1,2015-07-03 16:00:00,573.0 
+T1,2015-07-03 17:00:00,521.0 +T1,2015-07-03 18:00:00,481.0 +T1,2015-07-03 19:00:00,452.0 +T1,2015-07-03 20:00:00,447.0 +T1,2015-07-03 21:00:00,425.0 +T1,2015-07-03 22:00:00,427.0 +T1,2015-07-03 23:00:00,441.0 +T1,2015-07-04 00:00:00,438.0 +T1,2015-07-04 01:00:00,472.0 +T1,2015-07-04 02:00:00,528.0 +T1,2015-07-04 03:00:00,596.0 +T1,2015-07-04 04:00:00,661.0 +T1,2015-07-04 05:00:00,708.0 +T1,2015-07-04 06:00:00,754.0 +T1,2015-07-04 07:00:00,781.0 +T1,2015-07-04 08:00:00,808.0 +T1,2015-07-04 09:00:00,819.0 +T1,2015-07-04 10:00:00,820.0 +T1,2015-07-04 11:00:00,801.0 +T1,2015-07-04 12:00:00,770.0 +T1,2015-07-04 13:00:00,717.0 +T1,2015-07-04 14:00:00,697.0 +T1,2015-07-04 15:00:00,655.0 +T1,2015-07-04 16:00:00,607.0 +T1,2015-07-04 17:00:00,552.0 +T1,2015-07-04 18:00:00,512.0 +T1,2015-07-04 19:00:00,475.0 +T1,2015-07-04 20:00:00,452.0 +T1,2015-07-04 21:00:00,436.0 +T1,2015-07-04 22:00:00,429.0 +T1,2015-07-04 23:00:00,433.0 +T1,2015-07-05 00:00:00,430.0 +T1,2015-07-05 01:00:00,472.0 +T1,2015-07-05 02:00:00,536.0 +T1,2015-07-05 03:00:00,611.0 +T1,2015-07-05 04:00:00,662.0 +T1,2015-07-05 05:00:00,705.0 +T1,2015-07-05 06:00:00,707.0 +T1,2015-07-05 07:00:00,718.0 +T1,2015-07-05 08:00:00,733.0 +T1,2015-07-05 09:00:00,741.0 +T1,2015-07-05 10:00:00,737.0 +T1,2015-07-05 11:00:00,710.0 +T1,2015-07-05 12:00:00,647.0 +T1,2015-07-05 13:00:00,593.0 +T1,2015-07-05 14:00:00,564.0 +T1,2015-07-05 15:00:00,528.0 +T2,2015-07-01 12:00:00,3124.0 +T2,2015-07-01 13:00:00,2990.0 +T2,2015-07-01 14:00:00,2862.0 +T2,2015-07-01 15:00:00,2809.0 +T2,2015-07-01 16:00:00,2544.0 +T2,2015-07-01 17:00:00,2201.0 +T2,2015-07-01 18:00:00,1996.0 +T2,2015-07-01 19:00:00,1861.0 +T2,2015-07-01 20:00:00,1735.0 +T2,2015-07-01 21:00:00,1713.0 +T2,2015-07-01 22:00:00,1724.0 +T2,2015-07-01 23:00:00,1798.0 +T2,2015-07-02 00:00:00,1891.0 +T2,2015-07-02 01:00:00,2037.0 +T2,2015-07-02 02:00:00,2102.0 +T2,2015-07-02 03:00:00,2163.0 +T2,2015-07-02 04:00:00,2269.0 +T2,2015-07-02 05:00:00,2404.0 +T2,2015-07-02 06:00:00,2515.0 +T2,2015-07-02 07:00:00,2621.0 +T2,2015-07-02 08:00:00,2745.0 +T2,2015-07-02 09:00:00,2816.0 +T2,2015-07-02 10:00:00,2938.0 +T2,2015-07-02 11:00:00,3022.0 +T2,2015-07-02 12:00:00,2976.0 +T2,2015-07-02 13:00:00,2892.0 +T2,2015-07-02 14:00:00,2784.0 +T2,2015-07-02 15:00:00,2725.0 +T2,2015-07-02 16:00:00,2530.0 +T2,2015-07-02 17:00:00,2211.0 +T2,2015-07-02 18:00:00,1995.0 +T2,2015-07-02 19:00:00,1833.0 +T2,2015-07-02 20:00:00,1768.0 +T2,2015-07-02 21:00:00,1712.0 +T2,2015-07-02 22:00:00,1707.0 +T2,2015-07-02 23:00:00,1762.0 +T2,2015-07-03 00:00:00,1880.0 +T2,2015-07-03 01:00:00,1995.0 +T2,2015-07-03 02:00:00,2134.0 +T2,2015-07-03 03:00:00,2227.0 +T2,2015-07-03 04:00:00,2376.0 +T2,2015-07-03 05:00:00,2477.0 +T2,2015-07-03 06:00:00,2597.0 +T2,2015-07-03 07:00:00,2691.0 +T2,2015-07-03 08:00:00,2751.0 +T2,2015-07-03 09:00:00,2782.0 +T2,2015-07-03 10:00:00,2810.0 +T2,2015-07-03 11:00:00,2781.0 +T2,2015-07-03 12:00:00,2693.0 +T2,2015-07-03 13:00:00,2567.0 +T2,2015-07-03 14:00:00,2490.0 +T2,2015-07-03 15:00:00,2448.0 +T2,2015-07-03 16:00:00,2277.0 +T2,2015-07-03 17:00:00,1997.0 +T2,2015-07-03 18:00:00,1785.0 +T2,2015-07-03 19:00:00,1689.0 +T2,2015-07-03 20:00:00,1562.0 +T2,2015-07-03 21:00:00,1560.0 +T2,2015-07-03 22:00:00,1505.0 +T2,2015-07-03 23:00:00,1538.0 +T2,2015-07-04 00:00:00,1641.0 +T2,2015-07-04 01:00:00,1735.0 +T2,2015-07-04 02:00:00,1950.0 +T2,2015-07-04 03:00:00,2138.0 +T2,2015-07-04 04:00:00,2303.0 +T2,2015-07-04 05:00:00,2432.0 +T2,2015-07-04 06:00:00,2528.0 +T2,2015-07-04 07:00:00,2656.0 +T2,2015-07-04 08:00:00,2740.0 
+T2,2015-07-04 09:00:00,2803.0 +T2,2015-07-04 10:00:00,2855.0 +T2,2015-07-04 11:00:00,2880.0 +T2,2015-07-04 12:00:00,2778.0 +T2,2015-07-04 13:00:00,2637.0 +T2,2015-07-04 14:00:00,2479.0 +T2,2015-07-04 15:00:00,2381.0 +T2,2015-07-04 16:00:00,2228.0 +T2,2015-07-04 17:00:00,2037.0 +T2,2015-07-04 18:00:00,1758.0 +T2,2015-07-04 19:00:00,1648.0 +T2,2015-07-04 20:00:00,1560.0 +T2,2015-07-04 21:00:00,1508.0 +T2,2015-07-04 22:00:00,1486.0 +T2,2015-07-04 23:00:00,1486.0 +T2,2015-07-05 00:00:00,1515.0 +T2,2015-07-05 01:00:00,1623.0 +T2,2015-07-05 02:00:00,1919.0 +T2,2015-07-05 03:00:00,2172.0 +T2,2015-07-05 04:00:00,2416.0 +T2,2015-07-05 05:00:00,2605.0 +T2,2015-07-05 06:00:00,2755.0 +T2,2015-07-05 07:00:00,2822.0 +T2,2015-07-05 08:00:00,2917.0 +T2,2015-07-05 09:00:00,2997.0 +T2,2015-07-05 10:00:00,3060.0 +T2,2015-07-05 11:00:00,3046.0 +T2,2015-07-05 12:00:00,2942.0 +T2,2015-07-05 13:00:00,2758.0 +T2,2015-07-05 14:00:00,2487.0 +T2,2015-07-05 15:00:00,2349.0 +T3,2015-07-01 12:00:00,1828.0 +T3,2015-07-01 13:00:00,1806.0 +T3,2015-07-01 14:00:00,1897.0 +T3,2015-07-01 15:00:00,1750.0 +T3,2015-07-01 16:00:00,1679.0 +T3,2015-07-01 17:00:00,1620.0 +T3,2015-07-01 18:00:00,1463.0 +T3,2015-07-01 19:00:00,1342.0 +T3,2015-07-01 20:00:00,1192.0 +T3,2015-07-01 21:00:00,1108.0 +T3,2015-07-01 22:00:00,1058.0 +T3,2015-07-01 23:00:00,1024.0 +T3,2015-07-02 00:00:00,1031.0 +T3,2015-07-02 01:00:00,1091.0 +T3,2015-07-02 02:00:00,1208.0 +T3,2015-07-02 03:00:00,1337.0 +T3,2015-07-02 04:00:00,1435.0 +T3,2015-07-02 05:00:00,1515.0 +T3,2015-07-02 06:00:00,1593.0 +T3,2015-07-02 07:00:00,1667.0 +T3,2015-07-02 08:00:00,1753.0 +T3,2015-07-02 09:00:00,1768.0 +T3,2015-07-02 10:00:00,1823.0 +T3,2015-07-02 11:00:00,1813.0 +T3,2015-07-02 12:00:00,1842.0 +T3,2015-07-02 13:00:00,1838.0 +T3,2015-07-02 14:00:00,1800.0 +T3,2015-07-02 15:00:00,1761.0 +T3,2015-07-02 16:00:00,1670.0 +T3,2015-07-02 17:00:00,1609.0 +T3,2015-07-02 18:00:00,1467.0 +T3,2015-07-02 19:00:00,1309.0 +T3,2015-07-02 20:00:00,1189.0 +T3,2015-07-02 21:00:00,1102.0 +T3,2015-07-02 22:00:00,1054.0 +T3,2015-07-02 23:00:00,1017.0 +T3,2015-07-03 00:00:00,1014.0 +T3,2015-07-03 01:00:00,1063.0 +T3,2015-07-03 02:00:00,1187.0 +T3,2015-07-03 03:00:00,1314.0 +T3,2015-07-03 04:00:00,1424.0 +T3,2015-07-03 05:00:00,1497.0 +T3,2015-07-03 06:00:00,1586.0 +T3,2015-07-03 07:00:00,1659.0 +T3,2015-07-03 08:00:00,1722.0 +T3,2015-07-03 09:00:00,1781.0 +T3,2015-07-03 10:00:00,1805.0 +T3,2015-07-03 11:00:00,1831.0 +T3,2015-07-03 12:00:00,1851.0 +T3,2015-07-03 13:00:00,1831.0 +T3,2015-07-03 14:00:00,1809.0 +T3,2015-07-03 15:00:00,1755.0 +T3,2015-07-03 16:00:00,1685.0 +T3,2015-07-03 17:00:00,1618.0 +T3,2015-07-03 18:00:00,1487.0 +T3,2015-07-03 19:00:00,1311.0 +T3,2015-07-03 20:00:00,1180.0 +T3,2015-07-03 21:00:00,1087.0 +T3,2015-07-03 22:00:00,1033.0 +T3,2015-07-03 23:00:00,1002.0 +T3,2015-07-04 00:00:00,991.0 +T3,2015-07-04 01:00:00,1005.0 +T3,2015-07-04 02:00:00,1071.0 +T3,2015-07-04 03:00:00,1191.0 +T3,2015-07-04 04:00:00,1307.0 +T3,2015-07-04 05:00:00,1407.0 +T3,2015-07-04 06:00:00,1495.0 +T3,2015-07-04 07:00:00,1576.0 +T3,2015-07-04 08:00:00,1635.0 +T3,2015-07-04 09:00:00,1688.0 +T3,2015-07-04 10:00:00,1711.0 +T3,2015-07-04 11:00:00,1741.0 +T3,2015-07-04 12:00:00,1768.0 +T3,2015-07-04 13:00:00,1765.0 +T3,2015-07-04 14:00:00,1738.0 +T3,2015-07-04 15:00:00,1684.0 +T3,2015-07-04 16:00:00,1605.0 +T3,2015-07-04 17:00:00,1553.0 +T3,2015-07-04 18:00:00,1433.0 +T3,2015-07-04 19:00:00,1297.0 +T3,2015-07-04 20:00:00,1177.0 +T3,2015-07-04 21:00:00,1082.0 +T3,2015-07-04 22:00:00,1028.0 +T3,2015-07-04 
23:00:00,987.0 +T3,2015-07-05 00:00:00,970.0 +T3,2015-07-05 01:00:00,959.0 +T3,2015-07-05 02:00:00,993.0 +T3,2015-07-05 03:00:00,1083.0 +T3,2015-07-05 04:00:00,1215.0 +T3,2015-07-05 05:00:00,1310.0 +T3,2015-07-05 06:00:00,1415.0 +T3,2015-07-05 07:00:00,1479.0 +T3,2015-07-05 08:00:00,1525.0 +T3,2015-07-05 09:00:00,1599.0 +T3,2015-07-05 10:00:00,1623.0 +T3,2015-07-05 11:00:00,1652.0 +T3,2015-07-05 12:00:00,1671.0 +T3,2015-07-05 13:00:00,1664.0 +T3,2015-07-05 14:00:00,1637.0 +T3,2015-07-05 15:00:00,1574.0 diff --git a/tests/unit/amlb/datasets/file/test_file_dataloader.py b/tests/unit/amlb/datasets/file/test_file_dataloader.py index fa8151789..778cccdf7 100644 --- a/tests/unit/amlb/datasets/file/test_file_dataloader.py +++ b/tests/unit/amlb/datasets/file/test_file_dataloader.py @@ -4,6 +4,7 @@ import numpy as np import pandas as pd import pytest +import pandas.api.types as pat from amlb.resources import from_config from amlb.data import DatasetType @@ -241,14 +242,15 @@ def _assert_data_paths(dataset, definition): assert dataset.train.data_path(f) == path_from_split(s) -def _assert_X_y_types(data_split): +def _assert_X_y_types(data_split, check_encoded=True): assert isinstance(data_split.X, pd.DataFrame) assert isinstance(data_split.y, pd.DataFrame) - assert isinstance(data_split.X_enc, np.ndarray) - assert isinstance(data_split.y_enc, np.ndarray) + if check_encoded: + assert isinstance(data_split.X_enc, np.ndarray) + assert isinstance(data_split.y_enc, np.ndarray) -def _assert_data_consistency(dataset): +def _assert_data_consistency(dataset, check_encoded=True): assert len(dataset.train.X) == len(dataset.train.y) assert len(dataset.train.X.columns) == len(dataset.predictors) assert len(dataset.train.y.columns) == 1 @@ -257,11 +259,107 @@ def _assert_data_consistency(dataset): assert not any([p.is_target for p in dataset.predictors]) - assert dataset.train.X_enc.shape == dataset.train.X.shape assert dataset.test.X.dtypes.equals(dataset.train.X.dtypes) assert dataset.test.y.dtypes.equals(dataset.train.y.dtypes) - assert np.issubdtype(dataset.train.X_enc.dtype, np.floating) - assert np.issubdtype(dataset.train.y_enc.dtype, np.floating) # not ideal given that it's also for classification targets, but well… + if check_encoded: + assert dataset.train.X_enc.shape == dataset.train.X.shape + assert np.issubdtype(dataset.train.X_enc.dtype, np.floating) + assert np.issubdtype(dataset.train.y_enc.dtype, np.floating) # not ideal given that it's also for classification targets, but well… + + +@pytest.mark.use_disk +def test_load_timeseries_task_csv(file_loader): + ds_def = ns( + path=os.path.join(res, "m4_hourly_subset.csv"), + forecast_horizon_in_steps=24, + seasonality=24, + freq="H", + target="target", + type="timeseries", + ) + ds = file_loader.load(ds_def) + assert ds.type is DatasetType.timeseries + print(ds.train.X.dtypes) + _assert_data_consistency(ds, check_encoded=False) + _assert_X_y_types(ds.train, check_encoded=False) + _assert_X_y_types(ds.test, check_encoded=False) + + assert isinstance(ds.train.data, pd.DataFrame) + assert isinstance(ds.test.data, pd.DataFrame) + assert len(ds.repeated_abs_seasonal_error) == len(ds.test.data) + assert len(ds.repeated_item_id) == len(ds.test.data) + + assert pat.is_categorical_dtype(ds._dtypes[ds.id_column]) + assert pat.is_datetime64_dtype(ds._dtypes[ds.timestamp_column]) + assert pat.is_float_dtype(ds._dtypes[ds.target.name]) + + # timeseries uses different task schema - set attributes for test to work + ds_def['train'] = ds.train.path + ds_def['test'] = 
ds.test.path + _assert_data_paths(ds, ds_def) + + +@pytest.mark.parametrize("missing_key", ["freq", "forecast_horizon_in_steps", "seasonality"]) +def test_when_timeseries_task_key_is_missing_then_exception_is_raised(file_loader, missing_key): + task_kwargs = dict( + path=os.path.join(res, "m4_hourly_subset.csv"), + forecast_horizon_in_steps=24, + seasonality=24, + freq="H", + target="target", + type="timeseries", + ) + task_kwargs.pop(missing_key) + ds_def = ns.from_dict(task_kwargs) + with pytest.raises(AssertionError, match=f"Task definition for timeseries must include `{missing_key}`"): + file_loader.load(ds_def) + + +@pytest.mark.parametrize("missing_key", ["id_column", "timestamp_column"]) +def test_given_nondefault_column_names_when_key_is_missing_then_exception_is_raised(file_loader, missing_key): + task_kwargs = dict( + path=os.path.join(res, "m4_hourly_subset_nondefault_cols.csv"), + forecast_horizon_in_steps=24, + seasonality=24, + freq="H", + type="timeseries", + target="CustomTarget", + id_column="CustomId", + timestamp_column="CustomTimestamp", + ) + task_kwargs.pop(missing_key) + ds_def = ns.from_dict(task_kwargs) + with pytest.raises(ValueError, match=missing_key): + file_loader.load(ds_def) + + +def test_given_nondefault_column_names_then_timeseries_dataset_can_be_loaded(file_loader): + task_kwargs = dict( + path=os.path.join(res, "m4_hourly_subset_nondefault_cols.csv"), + forecast_horizon_in_steps=24, + seasonality=24, + freq="H", + type="timeseries", + target="CustomTarget", + id_column="CustomId", + timestamp_column="CustomTimestamp", + ) + ds_def = ns.from_dict(task_kwargs) + ds = file_loader.load(ds_def) + _assert_data_consistency(ds, check_encoded=False) + + +@pytest.mark.parametrize("forecast_horizon, fold", [(50, 2), (100, 0), (10, 9)]) +def test_if_timeseries_dataset_too_short_for_requested_fold_then_exception_is_raised(file_loader, forecast_horizon, fold): + ds_def = ns( + path=os.path.join(res, "m4_hourly_subset.csv"), + forecast_horizon_in_steps=forecast_horizon, + seasonality=24, + freq="H", + type="timeseries", + ) + with pytest.raises(ValueError, match="All time series in the dataset must have length"): + file_loader.load(ds_def, fold=fold)
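For context on the new timeseries metrics listed in resources/config.yaml (mql, wql, sql) and the default quantile_levels, the sketch below illustrates one common formulation of mql (mean quantile loss) and wql (weighted quantile loss) built on the pinball loss. It is only an illustrative sketch, not the benchmark's implementation: some libraries additionally scale the pinball loss by a factor of 2, and sql further normalizes by a seasonal-naive error term (compare repeated_abs_seasonal_error in the tests above), which is omitted here. The toy forecasts in the example are made up.

import numpy as np


def pinball_loss(y_true: np.ndarray, y_pred: np.ndarray, q: float) -> np.ndarray:
    # Per-timestep pinball (quantile) loss at level q:
    # q * (y - f) when the forecast under-predicts, (1 - q) * (f - y) otherwise.
    diff = y_true - y_pred
    return np.maximum(q * diff, (q - 1) * diff)


def mql(y_true: np.ndarray, quantile_forecasts: dict) -> float:
    # Mean quantile loss: pinball loss averaged over all levels and timesteps.
    losses = [pinball_loss(y_true, pred, q) for q, pred in quantile_forecasts.items()]
    return float(np.mean(losses))


def wql(y_true: np.ndarray, quantile_forecasts: dict) -> float:
    # Weighted quantile loss: per-level pinball loss normalized by sum(|y_true|),
    # then averaged over the quantile levels.
    per_level = [
        pinball_loss(y_true, pred, q).sum() / np.abs(y_true).sum()
        for q, pred in quantile_forecasts.items()
    ]
    return float(np.mean(per_level))


if __name__ == "__main__":
    # Toy example over a 48-step horizon using the default quantile_levels
    # from resources/config.yaml; the flat quantile forecasts are hypothetical.
    rng = np.random.default_rng(0)
    y = rng.uniform(100.0, 200.0, size=48)
    levels = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    forecasts = {q: np.full_like(y, y.mean() + (q - 0.5) * 50.0) for q in levels}
    print(f"mql={mql(y, forecasts):.3f}  wql={wql(y, forecasts):.3f}")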