diff --git a/.github/workflows/build_deploy_docs.yml b/.github/workflows/build_deploy_docs.yml new file mode 100644 index 000000000..ddaedfa59 --- /dev/null +++ b/.github/workflows/build_deploy_docs.yml @@ -0,0 +1,66 @@ +# Simple workflow for deploying static content to GitHub Pages generated by Github +# except for added job steps "Copy Static Files" through "Build MkDocs Pages". +name: Deploy static content to Pages + +on: + # Runs on pushes targeting the default branch + push: + branches: ["master"] + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write + +# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. +# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. +concurrency: + group: "pages" + cancel-in-progress: false + +jobs: + # Single deploy job since we're just deploying + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Setup Pages + uses: actions/configure-pages@v3 + - name: Copy Static Files + run: | + cp -R docs/website site/ + - name: Replace GITHUB token + # Use different sed delimiter to avoid clashing with forward slash in URL + run: | + find docs/ -type f -exec sed -i "s@GITHUB@${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}\/blob\/master@g" {} \; + - name: Replace WEBSITE token + # Use different sed delimiter to avoid clashing with forward slash in URL + run: | + WEBSITE_URL="https://${GITHUB_REPOSITORY_OWNER}.github.io/automlbenchmark" + find docs/ -type f -exec sed -i "s@WEBSITE@${WEBSITE_URL}@g" {} \; + sed -i "s@WEBSITE@${WEBSITE_URL}@g" mkdocs.yml + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + - name: Install MkDocs + run: | + python -m pip install mkdocs-material + - name: Build MkDocs Pages + run: | + mkdocs build --site-dir site/docs + - name: Upload artifact + uses: actions/upload-pages-artifact@v2 + with: + # Upload entire repository + path: './site/' + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v2 diff --git a/amlb/benchmark.py b/amlb/benchmark.py index 572dc91a3..b8c531208 100644 --- a/amlb/benchmark.py +++ b/amlb/benchmark.py @@ -379,7 +379,7 @@ def _is_task_enabled(task_def): class TaskConfig: - def __init__(self, name, openml_task_id, test_server, fold, metrics, seed, + def __init__(self, name, openml_task_id, test_server, fold, metrics, quantile_levels, seed, max_runtime_seconds, cores, max_mem_size_mb, min_vol_size_mb, input_dir, output_dir, tag, command, git_info, measure_inference_time: bool = False): self.framework = None @@ -404,6 +404,7 @@ def __init__(self, name, openml_task_id, test_server, fold, metrics, seed, self.git_info = git_info self.measure_inference_time = measure_inference_time self.ext = ns() # used if frameworks require extra config points + self.quantile_levels = list(sorted(quantile_levels)) def __setattr__(self, name, value): if name == 'metrics': @@ -477,9 +478,10 @@ def __init__(self, benchmark: Benchmark, task_def, fold): self.fold = fold self.task_config = TaskConfig( name=task_def.name, - openml_task_id=task_def.openml_task_id, + openml_task_id=task_def["openml_task_id"], fold=fold, metrics=task_def.metric, + quantile_levels=task_def.quantile_levels, 
seed=rget().seed(fold), max_runtime_seconds=task_def.max_runtime_seconds, cores=task_def.cores, diff --git a/amlb/datasets/file.py b/amlb/datasets/file.py index ee92d3b1a..f39e3ca7b 100644 --- a/amlb/datasets/file.py +++ b/amlb/datasets/file.py @@ -1,5 +1,6 @@ from abc import abstractmethod import logging +import math import os import re import tempfile @@ -33,17 +34,17 @@ def __init__(self, cache_dir=None): def load(self, dataset, fold=0): dataset = dataset if isinstance(dataset, ns) else ns(path=dataset) log.debug("Loading dataset %s", dataset) + target = dataset['target'] + type_ = dataset['type'] + features = dataset['features'] + + if type_ and DatasetType[type_] == DatasetType.timeseries: + return TimeSeriesDataset(path=dataset['path'], fold=fold, target=target, features=features, cache_dir=self._cache_dir, config=dataset) + paths = self._extract_train_test_paths(dataset.path if 'path' in dataset else dataset, fold=fold, name=dataset['name'] if 'name' in dataset else None) assert fold < len(paths['train']), f"No training dataset available for fold {fold} among dataset files {paths['train']}" - # seed = rget().seed(fold) - # if len(paths['test']) == 0: - # log.warning("No test file in the dataset, the train set will automatically be split 90%/10% using the given seed.") - # else: assert fold < len(paths['test']), f"No test dataset available for fold {fold} among dataset files {paths['test']}" - target = dataset['target'] - type_ = dataset['type'] - features = dataset['features'] ext = os.path.splitext(paths['train'][fold])[1].lower() train_path = paths['train'][fold] test_path = paths['test'][fold] if len(paths['test']) > 0 else None @@ -139,40 +140,6 @@ def __repr__(self): return repr_def(self) - def extend_dataset_with_timeseries_config(self, dataset, dataset_config): - dataset = deepcopy(dataset) - dataset_config = deepcopy(dataset_config) - if dataset_config['id_column'] is None: - log.warning("Warning: For timeseries task setting undefined `id_column` to `item_id`.") - dataset_config['id_column'] = "item_id" - if dataset_config['forecast_range_in_steps'] is None: - log.warning("Warning: For timeseries task setting undefined `forecast_range_in_steps` to `1`.") - dataset_config['forecast_range_in_steps'] = "1" - - dataset.timestamp_column=dataset_config['timestamp_column'] - dataset.id_column=dataset_config['id_column'] - dataset.forecast_range_in_steps=int(dataset_config['forecast_range_in_steps']) - - train_seqs_lengths = dataset.train.X.groupby(dataset.id_column).count() - test_seqs_lengths = dataset.test.X.groupby(dataset.id_column).count() - forecast_range_in_steps_mean_diff_train_test = int((test_seqs_lengths - train_seqs_lengths).mean()) - forecast_range_in_steps_max_min_train_test = int(min(int(test_seqs_lengths.min()), int(train_seqs_lengths.min()))) - 1 - if not dataset.forecast_range_in_steps == forecast_range_in_steps_mean_diff_train_test: - msg = f"Warning: Forecast range {dataset.forecast_range_in_steps}, does not equal mean difference between test and train sequence lengths {forecast_range_in_steps_mean_diff_train_test}." - log.warning(msg) - if not (test_seqs_lengths - train_seqs_lengths).var().item() == 0.: - msg = f"Error: Not all sequences of train and test set have same sequence length difference." 
- raise ValueError(msg) - if dataset.forecast_range_in_steps > forecast_range_in_steps_mean_diff_train_test: - msg = f"Error: Forecast range {dataset.forecast_range_in_steps} longer than difference between test and train sequence lengths {forecast_range_in_steps_mean_diff_train_test}." - raise ValueError(msg) - if dataset.forecast_range_in_steps > forecast_range_in_steps_max_min_train_test: - msg = f"Error: Forecast range {dataset.forecast_range_in_steps} longer than minimum sequence length + 1, {forecast_range_in_steps_max_min_train_test}." - raise ValueError(msg) - return dataset - - - class FileDataset(Dataset): def __init__(self, train: Datasplit, test: Datasplit, @@ -350,10 +317,88 @@ def __init__(self, train_path, test_path, # todo: handle auto-split (if test_path is None): requires loading the training set, split, save super().__init__(None, None, target=target, features=features, type=type) - self._train = CsvDatasplit(self, train_path, timestamp_column=timestamp_column) - self._test = CsvDatasplit(self, test_path, timestamp_column=timestamp_column) + self._train = CsvDatasplit(self, train_path) + self._test = CsvDatasplit(self, test_path) + self._dtypes = None + + +class TimeSeriesDataset(FileDataset): + def __init__(self, path, fold, target, features, cache_dir, config): + super().__init__(None, None, target=target, features=features, type="timeseries") + if config['forecast_horizon_in_steps'] is None: + raise AssertionError("Task definition for timeseries must include `forecast_horizon_in_steps`") + if config['freq'] is None: + raise AssertionError("Task definition for timeseries must include `freq`") + if config['seasonality'] is None: + raise AssertionError("Task definition for timeseries must include `seasonality`") + + full_data = read_csv(path) + if config['id_column'] is None: + log.warning("Warning: For timeseries task, setting undefined `id_column` to `item_id`") + config['id_column'] = 'item_id' + if config['id_column'] not in full_data.columns: + raise ValueError(f'The id_column with name {config["id_column"]} is missing from the dataset') + if config['timestamp_column'] is None: + log.warning("Warning: For timeseries task, setting undefined `timestamp_column` to `timestamp`") + config['timestamp_column'] = 'timestamp' + if config['timestamp_column'] not in full_data.columns: + raise ValueError(f'The timestamp_column with name {config["timestamp_column"]} is missing from the dataset') + + self.forecast_horizon_in_steps = int(config['forecast_horizon_in_steps']) + self.freq = pd.tseries.frequencies.to_offset(config['freq']).freqstr + self.seasonality = int(config['seasonality']) + self.id_column = config['id_column'] + self.timestamp_column = config['timestamp_column'] + + full_data[self.timestamp_column] = pd.to_datetime(full_data[self.timestamp_column]) + if config['name'] is not None: + file_name = config['name'] + else: + file_name = os.path.splitext(os.path.basename(path))[0] + save_dir = os.path.join(cache_dir, file_name, str(fold)) + train_path, test_path = self.save_train_and_test_splits(full_data, fold=fold, save_dir=save_dir) + + self._train = CsvDatasplit(self, train_path, timestamp_column=self.timestamp_column) + self._test = CsvDatasplit(self, test_path, timestamp_column=self.timestamp_column) self._dtypes = None + # Store repeated item_id & in-sample seasonal error for each time step in the forecast horizon - needed later for metrics like MASE. + # We need to store this information here because Result object has no access to past time series values. 
+ self.repeated_item_id = self.test.data[self.id_column].astype("category").cat.codes.to_numpy() + self.repeated_abs_seasonal_error = self.compute_seasonal_error() + + def save_train_and_test_splits(self, full_data, fold, save_dir): + full_data = full_data.sort_values(by=[self.id_column, self.timestamp_column]) + shortest_ts_length = full_data.groupby(self.id_column).size().min() + min_expected_ts_length = (fold + 1) * self.forecast_horizon_in_steps + 1 + if shortest_ts_length < min_expected_ts_length: + raise ValueError( + f'All time series in the dataset must have length > `(fold + 1) * forecast_horizon_in_steps` ' + f'(at least {min_expected_ts_length + 1}), but shortest time series has length {shortest_ts_length}' + ) + # Remove the last `steps_to_remove` steps from each time series to obtain the correct fold + if fold > 0: + steps_to_remove = (fold + 1) * self.forecast_horizon_in_steps + full_data = full_data.groupby(self.id_column, as_index=False).nth(slice(None, -steps_to_remove)) + train_data = full_data.groupby(self.id_column, as_index=False).nth(slice(None, -self.forecast_horizon_in_steps)) + test_data = full_data.groupby(self.id_column, as_index=False).nth(slice(-self.forecast_horizon_in_steps, None)) + + if not os.path.exists(save_dir): + os.makedirs(save_dir) + train_path = os.path.join(save_dir, "train.csv") + test_path = os.path.join(save_dir, "test.csv") + + train_data.to_csv(train_path, index=False) + test_data.to_csv(test_path, index=False) + return train_path, test_path + + def compute_seasonal_error(self): + train_data_with_index = self.train.data.set_index(self.id_column) + seasonal_diffs = train_data_with_index[self.target.name].groupby(level=self.id_column).diff(self.seasonality).abs() + abs_seasonal_error = seasonal_diffs.groupby(level=self.id_column).mean().fillna(1.0).values + # Repeat seasonal error for each time step in the forecast horizon + return np.repeat(abs_seasonal_error, self.forecast_horizon_in_steps) + class CsvDatasplit(FileDatasplit): @@ -396,8 +441,7 @@ def load_metadata(self): else 'string' if pat.is_string_dtype(dt) else 'datetime' if pat.is_datetime64_dtype(dt) else 'object') - features = [Feature(i, col, to_feature_type(dtypes[i])) - for i, col in enumerate(self._ds.columns)] + features = [Feature(i, col, to_feature_type(dtypes[i])) for i, col in enumerate(self._ds.columns)] for f in features: col = self._ds.iloc[:, f.index] diff --git a/amlb/datasets/openml.py b/amlb/datasets/openml.py index 678d1854d..e924774be 100644 --- a/amlb/datasets/openml.py +++ b/amlb/datasets/openml.py @@ -25,6 +25,12 @@ from ..utils import as_list, lazy_property, path_from_split, profile, split_path, unsparsify +# https://github.com/openml/automlbenchmark/pull/574#issuecomment-1646179921 +try: + set_openml_cache = oml.config.set_cache_directory +except AttributeError: + set_openml_cache = oml.config.set_root_cache_directory + log = logging.getLogger(__name__) # hack (only adding a ? to the regexp pattern) to ensure that '?' values remain quoted when we save dataplits in arff format. @@ -39,7 +45,7 @@ class OpenmlLoader: def __init__(self, api_key, cache_dir=None): oml.config.apikey = api_key if cache_dir: - oml.config.set_cache_directory(cache_dir) + set_openml_cache(cache_dir) if oml.config.retry_policy != "robot": log.debug("Setting openml retry_policy from '%s' to 'robot'." 
% oml.config.retry_policy) diff --git a/amlb/datautils.py b/amlb/datautils.py index a002a236a..d8a24d2ef 100644 --- a/amlb/datautils.py +++ b/amlb/datautils.py @@ -37,19 +37,21 @@ def read_csv(path, nrows=None, header=True, index=False, as_data_frame=True, dty :param header: if the columns header should be read. :param as_data_frame: if the result should be returned as a data frame (default) or a numpy array. :param dtype: data type for columns. - :param timestamp_column: column name for timestamp, to ensure dates are correctly parsed by pandas. + :param timestamp_column: name of the column that should be parsed as date. :return: a DataFrame """ - if dtype is not None and timestamp_column is not None and timestamp_column in dtype: - dtype = dtype.copy() # to avoid outer context manipulation - del dtype[timestamp_column] - + if timestamp_column is None: + parse_dates = None + else: + if dtype is not None: + dtype.pop(timestamp_column, None) + parse_dates = [timestamp_column] df = pd.read_csv(path, nrows=nrows, header=0 if header else None, index_col=0 if index else None, dtype=dtype, - parse_dates=[timestamp_column] if timestamp_column is not None else None) + parse_dates=parse_dates) return df if as_data_frame else df.values diff --git a/amlb/defaults.py b/amlb/defaults.py index 6d0bf35c5..3031be71b 100644 --- a/amlb/defaults.py +++ b/amlb/defaults.py @@ -1,9 +1,15 @@ import pathlib -from openml.config import cache_directory +import openml from amlb.utils import Namespace as ns +# https://github.com/openml/automlbenchmark/pull/574#issuecomment-1646179921 +try: + cache_directory = openml.config.cache_directory +except AttributeError: + cache_directory = openml.config.get_cache_directory() + default_dirs = ns( input_dir=cache_directory, output_dir=str(pathlib.Path(__file__).parent.parent / "results"), diff --git a/amlb/resources.py b/amlb/resources.py index 808a7f954..f3667b891 100644 --- a/amlb/resources.py +++ b/amlb/resources.py @@ -210,7 +210,7 @@ def _validate_task(self, task, lenient=False): if not lenient and len(missing) > 0: raise ValueError("{missing} mandatory properties as missing in task definition {taskdef}.".format(missing=missing, taskdef=task)) - for conf in ['max_runtime_seconds', 'cores', 'folds', 'max_mem_size_mb', 'min_vol_size_mb']: + for conf in ['max_runtime_seconds', 'cores', 'folds', 'max_mem_size_mb', 'min_vol_size_mb', 'quantile_levels']: if task[conf] is None: task[conf] = self.config.benchmarks.defaults[conf] log.debug("Config `{config}` not set for task {name}, using default `{value}`.".format(config=conf, name=task.name, value=task[conf])) @@ -310,4 +310,3 @@ def output_dirs(root, session=None, subdirs=None, create=False): TransformRule(from_key='aws.query_frequency_seconds', to_key='aws.query_interval_seconds'), TransformRule(from_key='aws.ec2.monitoring.cpu.query_frequency_seconds', to_key='aws.ec2.monitoring.cpu.query_interval_seconds'), ] - diff --git a/amlb/results.py b/amlb/results.py index 6e1a5bc60..cdaa56725 100644 --- a/amlb/results.py +++ b/amlb/results.py @@ -2,7 +2,6 @@ **results** module provides the logic to format, save and read predictions generated by the *automl frameworks* (cf. ``TaskResult``), as well as logic to compute, format, save, read and merge scores obtained from those predictions (cf. ``Result`` and ``Scoreboard``). 
""" -from functools import partial import collections import io import logging @@ -16,6 +15,7 @@ from numpy import nan, sort import pandas as pd import scipy as sci +import scipy.sparse from .data import Dataset, DatasetType, Feature from .datautils import accuracy_score, auc, average_precision_score, balanced_accuracy_score, confusion_matrix, fbeta_score, log_loss, \ @@ -244,7 +244,7 @@ def load_predictions(predictions_file): if rconfig().test_mode: TaskResult.validate_predictions(df) - if 'y_past_period_error' in df.columns: + if 'repeated_item_id' in df.columns: return TimeSeriesResult(df) else: if df.shape[1] > 2: @@ -293,6 +293,8 @@ def save_predictions(dataset: Dataset, output_file: str, predictions = predictions.squeeze() if isinstance(predictions, S): predictions = predictions.values + if scipy.sparse.issparse(truth) and truth.shape[1] == 1: + truth = pd.DataFrame(truth.todense()) if isinstance(truth, DF): truth = truth.squeeze() if isinstance(truth, S): @@ -750,6 +752,108 @@ def ncrps(self): weighted_losses = quantile_losses.sum(0) / denom # shape [num_quantiles] return weighted_losses.mean() +class TimeSeriesResult(RegressionResult): + def __init__(self, predictions_df, info=None): + super().__init__(predictions_df, info) + required_columns = {'truth', 'predictions', 'repeated_item_id', 'repeated_abs_seasonal_error'} + if required_columns - set(self.df.columns): + raise ValueError(f'Missing columns for calculating time series metrics: {required_columns - set(self.df.columns)}.') + + quantile_columns = [column for column in self.df.columns if column.startswith('0.')] + unrecognized_columns = [column for column in self.df.columns if column not in required_columns and column not in quantile_columns] + if len(unrecognized_columns) > 0: + raise ValueError(f'Predictions contain unrecognized columns: {unrecognized_columns}.') + + self.type = DatasetType.timeseries + self.truth = self.df['truth'].values.astype(float) + self.item_ids = self.df['repeated_item_id'].values + self.abs_seasonal_error = self.df['repeated_abs_seasonal_error'].values.astype(float) + # predictions = point forecast, quantile_predictions = quantile forecast + self.predictions = self.df['predictions'].values.astype(float) + self.quantile_predictions = self.df[quantile_columns].values.astype(float) + self.quantile_levels = np.array(quantile_columns, dtype=float) + + if (~np.isfinite(self.predictions)).any() or (~np.isfinite(self.quantile_predictions)).any(): + raise ValueError('Predictions contain NaN or Inf values') + + _, unique_item_ids_counts = np.unique(self.item_ids, return_counts=True) + if len(set(unique_item_ids_counts)) != 1: + raise ValueError(f'Error: Predicted sequences have different lengths {unique_item_ids_counts}.') + + def _itemwise_mean(self, values): + """Compute mean for each time series.""" + return pd.Series(values).groupby(self.item_ids, sort=False).mean().values + + def _safemean(self, values): + """Compute mean, while ignoring nan, +inf, -inf values.""" + return np.mean(values[np.isfinite(values)]) + + @metric(higher_is_better=False) + def smape(self): + """Symmetric Mean Absolute Percentage Error""" + num = np.abs(self.truth - self.predictions) + denom = (np.abs(self.truth) + np.abs(self.predictions)) / 2 + return self._safemean(num / denom) + + @metric(higher_is_better=False) + def mape(self): + """Mean Absolute Percentage Error""" + num = np.abs(self.truth - self.predictions) + denom = np.abs(self.truth) + return self._safemean(num / denom) + + @metric(higher_is_better=False) + def 
wape(self): + """Weighted Average Percentage Error""" + return np.sum(np.abs(self.truth - self.predictions)) / np.sum(np.abs(self.truth)) + + @metric(higher_is_better=False) + def mase(self): + """Mean Absolute Scaled Error + + Error for each item is normalized by the in-sample error of the naive forecaster. + This makes scores comparable across different items. + """ + error = np.abs(self.truth - self.predictions) + error_per_item = self._itemwise_mean(error / self.abs_seasonal_error) + return self._safemean(error_per_item) + + def _quantile_loss_per_step(self): + # Array of shape [len(self.predictions), len(self.quantile_levels)] + return 2 * np.abs( + (self.quantile_predictions - self.truth[:, None]) + * ((self.quantile_predictions >= self.truth[:, None]) - self.quantile_levels) + ) + + @metric(higher_is_better=False) + def mql(self): + """Quantile Loss, also known as Pinball Loss, averaged across all quantile levels & time steps. + + Equivalent to the Weighted Interval Score if the quantile_levels are symmetric around 0.5 + + Approximates the Continuous Ranked Probability Score + """ + return np.mean(self._quantile_loss_per_step()) + + @metric(higher_is_better=False) + def wql(self): + """Weighted Quantile Loss. + + Defined as total quantile loss normalized by the total abs value of target time series. + """ + return self._quantile_loss_per_step().mean(axis=1).sum() / np.sum(np.abs(self.truth)) + + @metric(higher_is_better=False) + def sql(self): + """Scaled Quantile Loss, also known as Scaled Pinball Loss. + + Similar to MASE, the quantile loss for each item is normalized by the in-sample error of the naive forecaster. + This makes scores comparable across different items. + """ + pl_per_item = self._itemwise_mean(self._quantile_loss_per_step().mean(axis=1) / self.abs_seasonal_error) + return self._safemean(pl_per_item) + + _encode_predictions_and_truth_ = False save_predictions = TaskResult.save_predictions diff --git a/amlb/runners/aws.py b/amlb/runners/aws.py index 6aae9cff1..221fa7fb8 100644 --- a/amlb/runners/aws.py +++ b/amlb/runners/aws.py @@ -1314,4 +1314,3 @@ def _download_resources(self): def _upload_results(self): pass - diff --git a/amlb/utils/config.py b/amlb/utils/config.py index 59bf7db30..5bf80412e 100644 --- a/amlb/utils/config.py +++ b/amlb/utils/config.py @@ -1,4 +1,4 @@ -from collections import namedtuple +from __future__ import annotations from copy import deepcopy from dataclasses import dataclass from importlib.util import find_spec @@ -59,19 +59,15 @@ def config_load(path, verbose=False): return loader(file, as_namespace=True) -# TransformRule = namedtuple('TransformRule', -# ['from_key', 'to_key', 'fn', 'keep_from'], -# defaults=[None, identity, False], -# module=__name__) @dataclass class TransformRule: from_key: Union[str, List[str]] - to_key: str = None + to_key: str | None = None # if not provided, used for transformations on same key fn: Callable = identity keep_from: bool = False -def transform_config(config: Namespace, transform_rules: [TransformRule], inplace=True) -> Namespace: +def transform_config(config: Namespace, transform_rules: list[TransformRule], inplace=True) -> Namespace: """ Allows to modify a configuration namespace (for example if the configuration format is modified) by applying a list of transformation rules. 
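
> Reviewer note (not part of the patch): the pinball-loss formula implemented by `TimeSeriesResult._quantile_loss_per_step` in the `amlb/results.py` hunk above is easy to sanity-check on toy data. The sketch below is a standalone illustration with made-up arrays; it simply mirrors the same expressions used for the `mql` and `wql` metrics.

```python
# Toy re-computation of the quantile (pinball) loss used by TimeSeriesResult.
# Arrays are invented for illustration only; they are not amlb data structures.
import numpy as np

truth = np.array([10.0, 12.0, 9.0])            # actual values, one per forecast step
quantile_levels = np.array([0.1, 0.5, 0.9])    # as configured via `quantile_levels`
quantile_predictions = np.array([              # shape [num_steps, num_quantiles]
    [8.0, 10.5, 13.0],
    [9.0, 11.5, 14.0],
    [7.0,  9.5, 12.0],
])

# Same formula as in the diff: 2 * |(q_hat - y) * (1{q_hat >= y} - tau)|
per_step_loss = 2 * np.abs(
    (quantile_predictions - truth[:, None])
    * ((quantile_predictions >= truth[:, None]) - quantile_levels)
)

mql = per_step_loss.mean()                                      # mean quantile loss
wql = per_step_loss.mean(axis=1).sum() / np.abs(truth).sum()    # weighted quantile loss
print(f"mql={mql:.4f}, wql={wql:.4f}")
```
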
diff --git a/amlb/utils/process.py b/amlb/utils/process.py index d50d8615d..8849da05a 100644 --- a/amlb/utils/process.py +++ b/amlb/utils/process.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import gc from concurrent.futures import ThreadPoolExecutor from contextlib import contextmanager @@ -18,7 +20,7 @@ import threading import _thread import traceback -from typing import Dict, List, Union, Tuple +from typing import Dict, List, Union, Tuple, cast import psutil @@ -118,7 +120,7 @@ def live_output_windows(process: subprocess.Popen, **_) -> Tuple[str, str]: queue=queue.Queue(), lines=[], ), - ) + ) # type: ignore # no reasonable type annotation, should refactor def forward_output(stream, queue_): if isinstance(stream, io.TextIOWrapper): @@ -135,12 +137,14 @@ def forward_output(stream, queue_): for output in outputs.values(): while True: try: - line = output["queue"].get(timeout=0.5) - output["lines"].append(line) + line = cast(queue.Queue, output["queue"]).get(timeout=0.5) + cast(list[str], output["lines"]).append(line) print(line.rstrip()) except queue.Empty: break - return ''.join(outputs["out"]["lines"]), ''.join(outputs["err"]["lines"]) + stdout = ''.join(cast(list[str], outputs["out"]["lines"])) + stderr = ''.join(cast(list[str], outputs["err"]["lines"])) + return stdout, stderr def live_output_unix(process, input=None, timeout=None, activity_timeout=None, mode='line', **_): @@ -448,7 +452,7 @@ class InterruptTimeout(Timeout): def __init__(self, timeout_secs, message=None, log_level=logging.WARNING, interrupt='thread', sig=signal.SIGINT, id=None, - interruptions: Union[Dict, List[Dict]] = None, wait_retry_secs=1, + interruptions: Union[Dict, List[Dict]] | None = None, wait_retry_secs=1, before_interrupt=None): def interruption(): inter_iter = iter(self._interruptions) diff --git a/amlb/utils/serialization.py b/amlb/utils/serialization.py index dffa4c53c..65f0c817d 100644 --- a/amlb/utils/serialization.py +++ b/amlb/utils/serialization.py @@ -23,7 +23,7 @@ def _import_data_libraries(): except ImportError: pd = None try: - import scipy.sparse as sp + import scipy.sparse as sp # type: ignore # https://github.com/scipy/scipy/issues/17158 except ImportError: sp = None return np, pd, sp diff --git a/amlb/utils/time.py b/amlb/utils/time.py index 416688bcc..cbb409f89 100644 --- a/amlb/utils/time.py +++ b/amlb/utils/time.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import datetime as dt import logging import math @@ -41,8 +43,8 @@ def datetime_iso(datetime=None, date=True, time=True, micros=False, date_sep='-' return datetime.strftime(strf) -def countdown(timeout_secs, on_timeout: Callable = None, message: str = None, interval=1, log_level=logging.INFO, - interrupt_event: threading.Event = None, interrupt_cond: Callable = None): +def countdown(timeout_secs, on_timeout: Callable | None = None, message: str = "", interval=1, log_level=logging.INFO, + interrupt_event: threading.Event | None = None, interrupt_cond: Callable | None = None): timeout_epoch = time.time() + timeout_secs remaining = timeout_secs interrupt = interrupt_event or threading.Event() diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index 491eeb627..000000000 --- a/docs/README.md +++ /dev/null @@ -1,399 +0,0 @@ -# OpenML AutoML Benchmark - -The OpenML AutoML Benchmark provides a framework for evaluating and comparing open-source AutoML systems. 
The system is *extensible* because you can [add your own](https://github.com/openml/automlbenchmark/blob/master/docs/extending.md) AutoML frameworks and datasets. For a thorough explanation of the benchmark, and evaluation of results, you can read our [paper](https://openml.github.io/automlbenchmark/paper.html) which was accepted at the [2019 ICML AutoML Workshop](https://sites.google.com/view/automl2019icml/). - -_**NOTE:**_ _This benchmarking framework currently features binary and multiclass classification; extending to regression is a work in progress. Please file an issue with any concerns/questions._ - - * [Installation](#installation) - * [Pre-requisites](#pre-requisites) - * [Setup](#setup) - * [Quickstart](#quickstart) - * [Running benchmarks](#running-benchmarks) - * [In Docker image](#in-docker-image) - * [In local environment](#in-local-environment) - * [On AWS](#on-aws) - * [Output](#output) - * [Advanced configuration](#advanced-configuration) - * [Issues](#issues) - * [Frequently Asked Questions](#frequently-asked-questions) - -Automatic Machine Learning (AutoML) systems automatically build machine learning pipelines or neural architectures in a data-driven, objective, and automatic way. They automate a lot of drudge work in designing machine learning systems, so that better systems can be developed, faster. However, AutoML research is also slowed down by two factors: - -* We currently lack standardized, easily-accessible benchmarking suites of tasks (datasets) that are curated to reflect important problem domains, practical to use, and sufficiently challenging to support a rigorous analysis of performance results. - -* Subtle differences in the problem definition, such as the design of the hyperparameter search space or the way time budgets are defined, can drastically alter a task’s difficulty. This issue makes it difficult to reproduce published research and compare results from different papers. - -This toolkit aims to address these problems by setting up standardized environments for in-depth experimentation with a wide range of AutoML systems. - -Documentation: - -### Features: -* Curated suites of [benchmarking datasets](https://openml.github.io/automlbenchmark/benchmark_datasets.html) from [OpenML](https://www.openml.org/s/218/data). -* Includes code to benchmark a number of [popular AutoML systems](https://openml.github.io/automlbenchmark/automl_overview.html) on regression and classification tasks. -* [New AutoML systems can be added](./HOWTO.md#add-an-automl-framework) -* Experiments can be run in Docker or Singularity containers -* Execute experiments locally or on AWS (see below) - - -## Installation -### Pre-requisites -To run the benchmarks, you will need: -* Python 3.9+. -* PIP3: ensure you have a recent version. If necessary, upgrade your pip using `python -m pip install -U pip`. -* The Python libraries listed in [requirements.txt](../requirements.txt): it is strongly recommended to first create a [Python virtual environment](https://docs.python.org/3/library/venv.html#venv-def) (cf. also [Pyenv](https://github.com/pyenv/pyenv): quick install using `curl https://pyenv.run | bash` or `brew install pyenv`) and work in it if you don't want to mess up your global Python environment. -* [Docker](https://docs.docker.com/install/), if you plan to run the benchmarks in a container. 
- -### Setup -Clone the repo (in development environment, you should of course remove the `--depth 1` argument): -```bash -git clone https://github.com/openml/automlbenchmark.git --branch stable --depth 1 -cd automlbenchmark -``` -Optional: create a Python3 virtual environment. - -- _**NOTE**: we don't recommend creating your virtual environment with `virtualenv` library here as the application may create additional virtual environments for some frameworks to run in isolation._ -_Those virtual environments are created internally using `python -m venv` and we encountered issues with `pip` when `venv` is used on top of a `virtualenv` environment._ -_Therefore, we rather suggest one of the method below:_ - -using venv on Linux/macOS: -```bash -python3 -m venv ./venv -source venv/bin/activate -# remember to call `deactivate` once you're done using the application -``` -using venv on Windows: -```bash -python3 -m venv ./venv -venv\Scripts\activate -# remember to call `venv\Scripts\deactivate` once you're done using the application -``` - -or using pyenv: -```bash -pyenv install {python_version: 3.9.16} -pyenv virtualenv ve-automl -pyenv local ve-automl -``` -Then pip install the dependencies: - -```bash -python -m pip install -r requirements.txt -``` - -- _**NOTE**: in case of issues when installing Python requirements, you may want to try the following:_ - - _on some platforms, we need to ensure that requirements are installed sequentially:_ `xargs -L 1 python -m pip install < requirements.txt`. - - _enforce the `python -m pip` version above in your virtualenv:_ `python -m pip install --upgrade pip==19.3.1`. - -## Quickstart -To run a benchmark call the `runbenchmark.py` script with at least the following arguments: - -1. The AutoML framework that should be evaluated, see [frameworks.yaml](../resources/frameworks.yaml) for supported frameworks. If you want to add a framework see [HOWTO](./HOWTO.md#add-an-automl-framework). -2. The benchmark suite to run should be one implemented in [benchmarks folder](../resources/benchmarks), or an OpenML study or task (formatted as `openml/s/X` or `openml/t/Y` respectively). -3. (Optional) The constraints applied to the benchmark as defined by default in [constraints.yaml](../resources/constraints.yaml). Default constraint is `test` (2 folds for 10 min each). -4. (Optional) If the benchmark should be run `local` (default, tested on Linux and macOS only), in a `docker` container or on `aws` using multiple ec2 instances. - -Examples: -```bash -python3 runbenchmark.py -python3 runbenchmark.py constantpredictor -python3 runbenchmark.py tpot test -python3 runbenchmark.py autosklearn openml/t/59 -m docker -python3 runbenchmark.py h2oautoml validation 1h4c -m aws -python3 runbenchmark.py autogluon:latest validation -python3 runbenchmark.py tpot:2020Q2 -``` - -For the complete list of supported arguments, run: -```bash -python3 runbenchmark.py --help -``` - -```text -usage: runbenchmark.py [-h] [-m {local,aws,docker,singularity}] - [-t [task_id [task_id ...]]] - [-f [fold_num ...]] [-i input_dir] - [-o output_dir] [-u user_dir] [-p parallel_jobs] - [-s {auto,skip,force,only}] [-k [true|false]] - [-e] [--logging LOGGING] - [--openml-run-tag OPENML_RUN_TAG] - framework [benchmark] [constraint] - -positional arguments: - framework The framework to evaluate as defined by default in resources/frameworks.yaml. - To use a labelled framework (i.e. a framework defined in resources/frameworks-{label}.yaml), - use the syntax {framework}:{label}. 
- benchmark The benchmark type to run as defined by default in resources/benchmarks/{benchmark}.yaml, - a path to a benchmark description file, or an openml suite or task. - OpenML references should be formatted as 'openml/s/X' and 'openml/t/Y', - for studies and tasks respectively. Use 'test.openml/s/X' for the - OpenML test server. - (default: 'test') - constraint The constraint definition to use as defined by default in resources/constraints.yaml. - (default: 'test') - -optional arguments: - -h, --help show this help message and exit - -m {local,aws,docker,singularity}, --mode {local,aws,docker,singularity} - The mode that specifies how/where the benchmark tasks will be running. - (default: 'local') - -t [task_id ...], --task [task_id ...] - The specific task name (as defined in the benchmark file) to run. - When an OpenML reference is used as benchmark, the dataset name should be used instead. - If not provided, then all tasks from the benchmark will be run. - -f [fold_num ...], --fold [fold_num ...] - If task is provided, the specific fold(s) to run. - If fold is not provided, then all folds from the task definition will be run. - -i input_dir, --indir input_dir - Folder from where the datasets are loaded by default. - (default: '~/.openml') - -o output_dir, --outdir output_dir - Folder where all the outputs should be written.(default: './results') - -u user_dir, --userdir user_dir - Folder where all the customizations are stored.(default: '~/.config/automlbenchmark') - -p parallel_jobs, --parallel parallel_jobs - The number of jobs (i.e. tasks or folds) that can run in parallel. - A hard limit is defined by property `job_scheduler.max_parallel_jobs` - in `resources/config.yaml`. - Override this limit in your custom `config.yaml` file if needed. - Supported only in aws mode or container mode (docker, singularity). - (default: 1) - -s {auto,skip,force,only}, --setup {auto,skip,force,only} - Framework/platform setup mode. Available values are: - • auto: setup is executed only if strictly necessary. - • skip: setup is skipped. - • force: setup is always executed before the benchmark. - • only: only setup is executed (no benchmark). - (default: 'auto') - -k [true|false], --keep-scores [true|false] - Set to true (default) to save/add scores in output directory. - -e, --exit-on-error If set, terminates on the first task that does not complete with a model. - --logging LOGGING Set the log levels for the 3 available loggers: - • console - • app: for the log file including only logs from amlb (.log extension). - • root: for the log file including logs from libraries (.full.log extension). - Accepted values for each logger are: notset, debug, info, warning, error, fatal, critical. - Examples: - --logging=info (applies the same level to all loggers) - --logging=root:debug (keeps defaults for non-specified loggers) - --logging=console:warning,app:info - (default: 'console:info,app:debug,root:info') - --openml-run-tag OPENML_RUN_TAG - Tag that will be saved in metadata and OpenML runs created during upload, must match '([a-zA-Z0-9_\-\.])+'. -``` - -The script will produce output that records task metadata and the result. -The result is the score on the test set, where the score is a specific model performance metric (e.g. "AUC") defined by the benchmark. 
-```text - task framework fold result mode version utc acc auc logloss -0 iris H2OAutoML 0 1.000000 local 3.22.0.5 2019-01-21T15:19:07 1.000000 NaN 0.023511 -1 iris H2OAutoML 1 1.000000 local 3.22.0.5 2019-01-21T15:20:12 1.000000 NaN 0.091685 -2 kc2 H2OAutoML 0 0.811321 local 3.22.0.5 2019-01-21T15:21:11 0.811321 0.859307 NaN -3 kc2 H2OAutoML 1 0.886792 local 3.22.0.5 2019-01-21T15:22:12 0.886792 0.888528 NaN -``` - -## Running benchmarks -The `automlbenchmark` app currently allows running benchmarks in various environments: -* in a docker container (running locally or on multiple AWS instances). -* completely locally, if the framework is supported on the local system. -* on AWS, possibly distributing the tasks to multiple EC2 instances, each of them running the benchmark either locally or in a docker container. - -### In Docker image -The [Docker] image is automatically built before running the benchmark if it doesn't already exist locally or in a public repository (by default in ). -Especially, without docker image, the application will need to download and install all the dependencies when building the image, so this may take some time. - -The generated image is usually named `automlbenchmark/{framework}:{tag}`, but this is customizable per framework: cf. `resources/frameworks.yaml` and [HOWTO](HOWTO.md#framework-definition) for details. - -For example, this will build a Docker image for the `RandomForest` framework and then immediately start a container to run the `validation` benchmark, using all folds, allocating 1h and 4 cores for each task: -```bash -python3 runbenchmark.py RandomForest validation 1h4c -m docker -``` - -If the corresponding image already exists locally and you want it to be rebuilt before running the benchmark, then the setup needs to be forced: -```bash -python3 runbenchmark.py {framework} {benchmark} {constraint} -m docker -s force -``` - -The image can also be built without running any benchmark: -```bash -python3 runbenchmark.py {framework} -m docker -s only -``` - -In rare cases, mainly for development, you may want to specify the docker image: -```bash -python3 runbenchmark.py {framework} {benchmark} {constraint} -m docker -Xdocker.image={image} -``` - -### In local environment -If docker allows portability, it is still possible to run the benchmarks locally without container on some environments (currently Linux, and macOS for most frameworks). - -A minimal example would be to run the test benchmarks with a random forest: -```bash -python3 runbenchmark.py RandomForest test -``` - -The majority of frameworks though require a `setup` step before being able to run a benchmark. Please note that this step may take some time depending on the framework. -This setup is executed by default on first run of the framework, but in this case, it is not guaranteed that the benchmark run following immediately will manage to complete successfully (for most frameworks though, it does). - -In case of error, just run the benchmark one more time. - -If it still fails, you may need to rerun the setup step manually: -```bash -python3 runbenchmark.py {framework} -s only -``` -You can then run the benchmarks as many times as you wish. - -When testing a framework or a new dataset, you may want to run only a single task and a specific fold, for example: -```bash -python3 runbenchmark.py TPOT validation -t bioresponse -f 0 -``` - -### On AWS -To run a benchmark on AWS you additionally need to have a configured AWS account. 
-The application is using the [boto3] Python package to exchange files through S3 and create EC2 instances. - - If this is your first time setting up your AWS account on the machine that will run the `automlbenchmark` app, you can use the [AWS CLI](http://aws.amazon.com/cli/) tool and run: - ```bash - aws configure - ``` -You will need your AWS Access Key ID, AWS Secret Access Key, and pick a default [EC2 region](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-regions-availability-zones.html#concepts-available-regions). - -- _**NOTE:** Currently the AMI is only configured for the following regions so you'll have to set your default region as one of these_: - - us-east-1 - - us-west-1 - - eu-west-1 - - eu-central-1 - -On first use, it is recommended to simply copy the `config.yaml` from [examples/aws] to your user `~/.config/automlbenchmark` folder (or merge it if you already have a `config.yaml` in this user folder) and follow the instructions in that file. - -To run a test to see if the benchmark framework is working on AWS, do the following: -```bash -python3 runbenchmark.py constantpredictor test -m aws -``` -This will create and start an EC2 instance for each benchmark job and run the 4 jobs (2 OpenML tasks * 2 folds) from the `test` benchmark sequentially, each job running for 1mn in this case (excluding setup time for the EC2 instances). - -For longer benchmarks, you'll probably want to run multiple jobs in parallel and distribute the work to several EC2 instances, for example: -```bash -python3 runbenchmark.py AUTOWEKA validation 1h4c -m aws -p 4 -``` -will keep 4 EC2 instances running, monitor them in a dedicated thread, and finally collect all outputs from s3. - -- _**NOTE**: each EC2 instance is provided with a time limit at startup to ensure that in any case, the instance is stopped even if there is an issue when running the benchmark task. In this case the instance is stopped, not terminated, and we can therefore inspect the machine manually (ideally after resetting its UserData field to avoid re-triggering the benchmark on the next startup)._ - -The console output is still showing the instances starting, outputs the progress and then the results for each dataset/fold combination: -```text -Running `H2OAutoML_nightly` on `validation` benchmarks in `aws` mode -Loading frameworks definitions from ['/Users/me/repos/automlbenchmark/resources/frameworks.yaml']. -Loading benchmark definitions from /Users/me/repos/automlbenchmark/resources/benchmarks/validationt.yaml. -Uploading `/Users/me/repos/automlbenchmark/resources/benchmarks/validation.yaml` to `ec2/input/validation.yaml` on s3 bucket automl-benchmark. -... -Starting new EC2 instance with params: H2OAutoML_nightly /s3bucket/input/validation.yaml -t micro-mass -f 0 -Started EC2 instance i-0cd081efc97c3bf6f -[2019-01-22T11:51:32] checking job aws_validation_micro-mass_0_H2OAutoML_nightly on instance i-0cd081efc97c3bf6f: pending -Starting new EC2 instance with params: H2OAutoML_nightly /s3bucket/input/validation.yaml -t micro-mass -f 1 -Started EC2 instance i-0251c1655e286897c -... 
-[2019-01-22T12:00:32] checking job aws_validation_micro-mass_1_H2OAutoML_nightly on instance i-0251c1655e286897c: running -[2019-01-22T12:00:33] checking job aws_validation_micro-mass_0_H2OAutoML_nightly on instance i-0cd081efc97c3bf6f: running -[2019-01-22T12:00:48] checking job aws_validation_micro-mass_1_H2OAutoML_nightly on instance i-0251c1655e286897c: running -[2019-01-22T12:00:48] checking job aws_validation_micro-mass_0_H2OAutoML_nightly on instance i-0cd081efc97c3bf6f: running -... -[ 731.511738] cloud-init[1521]: Predictions saved to /s3bucket/output/predictions/h2oautoml_nightly_micro-mass_0.csv -[ 731.512132] cloud-init[1521]: H2O session _sid_96e7 closed. -[ 731.512506] cloud-init[1521]: Loading predictions from /s3bucket/output/predictions/h2oautoml_nightly_micro-mass_0.csv -[ 731.512890] cloud-init[1521]: Metric scores: {'framework': 'H2OAutoML_nightly', 'version': 'nightly', 'task': 'micro-mass', 'fold': 0, 'mode': 'local', 'utc': '2019-01-22T12:00:02', 'logloss': 0.6498889633819804, 'acc': 0.8793103448275862, 'result': 0.6498889633819804} -[ 731.513275] cloud-init[1521]: Job local_micro-mass_0_H2OAutoML_nightly executed in 608.534 seconds -[ 731.513662] cloud-init[1521]: All jobs executed in 608.534 seconds -[ 731.514089] cloud-init[1521]: Scores saved to /s3bucket/output/scores/H2OAutoML_nightly_task_micro-mass.csv -[ 731.514542] cloud-init[1521]: Loaded scores from /s3bucket/output/scores/results.csv -[ 731.515006] cloud-init[1521]: Scores saved to /s3bucket/output/scores/results.csv -[ 731.515357] cloud-init[1521]: Summing up scores for current run: -[ 731.515782] cloud-init[1521]: task framework ... acc logloss -[ 731.516228] cloud-init[1521]: 0 micro-mass H2OAutoML_nightly ... 0.87931 0.649889 -[ 731.516671] cloud-init[1521]: [1 rows x 9 columns] -... -EC2 instance i-0cd081efc97c3bf6f is stopped -Job aws_validation_micro-mass_0_H2OAutoML_nightly executed in 819.305 seconds -[2019-01-22T12:01:34] checking job aws_validation_micro-mass_1_H2OAutoML_nightly on instance i-0251c1655e286897c: running -[2019-01-22T12:01:49] checking job aws_validation_micro-mass_1_H2OAutoML_nightly on instance i-0251c1655e286897c: running -EC2 instance i-0251c1655e286897c is stopping -Job aws_validation_micro-mass_1_H2OAutoML_nightly executed in 818.463 seconds -... -Terminating EC2 instances i-0251c1655e286897c -Terminated EC2 instances i-0251c1655e286897c with response {'TerminatingInstances': [{'CurrentState': {'Code': 32, 'Name': 'shutting-down'}, 'InstanceId': 'i-0251c1655e286897c', 'PreviousState': {'Code': 64, 'Name': 'stopping'}}], 'ResponseMetadata': {'RequestId': 'd09eeb0c-7a58-4cde-8f8b-2308a371a801', 'HTTPStatusCode': 200, 'HTTPHeaders': {'content-type': 'text/xml;charset=UTF-8', 'transfer-encoding': 'chunked', 'vary': 'Accept-Encoding', 'date': 'Tue, 22 Jan 2019 12:01:53 GMT', 'server': 'AmazonEC2'}, 'RetryAttempts': 0}} -Instance i-0251c1655e286897c state: shutting-down -All jobs executed in 2376.891 seconds -Deleting uploaded resources `['ec2/input/validation.yaml', 'ec2/input/config.yaml', 'ec2/input/frameworks.yaml']` from s3 bucket automl-benchmark. -``` - -### Output -By default, a benchmark run creates the following subdirectories and files in the output directory (by default a subdirectory of `./results` with unique name identifying the benchmark run): -* `scores`: this subdirectory contains - * `results.csv`: a global scoreboard, keeping scores from all benchmark runs. 
- For safety reasons, this file is automatically backed up to `scores/backup/results.{currentdate}.csv` by the application before any modification. - * individual score files keeping scores for each framework+benchmark combination (not backed up). -* `predictions`, this subdirectory contains the last predictions in a standardized format made by each framework-dataset combination. - Those last predictions are systematically backed up with current data to `predictions/backup` subdirectory before a new prediction is written. -* `logs`: this subdirectory contains logs produced by the `automlbenchmark` app, including when it's been run in Docker container or on AWS. - - -### Uploading results to OpenML -The `upload_results.py` script can be used to upload results to OpenML with the following usage: -```text ->python upload_results.py --help -usage: Script to upload results from the benchmark to OpenML. [-h] [-i INPUT_DIRECTORY] [-a APIKEY] [-m MODE] [-x] [-v] [-t TASK] - -optional arguments: - -h, --help show this help message and exit - -i INPUT_DIRECTORY, --input-directory INPUT_DIRECTORY - Directory that stores results from the runbenchmark.py invocation. By default use the most recent folder in the results folder as - specified in the configuration. - -a APIKEY, --api-key APIKEY - OpenML API key to use for uploading results. - -m MODE, --mode MODE Run mode (default=check). - • check: only report whether results can be uploaded. - • upload: upload all complete results. - -x, --fail-fast Stop as soon as a task fails to upload due to an error during uploading. - -v, --verbose Output progress to console. - -t TASK, --task TASK Only upload results for this specific task. -``` - -Note that the default behavior does not upload data but only verifies data is complete. -We strongly encourage you to only upload your data after verifying all expected results are complete. -The OpenML Python package is used for uploading results, so to ensure your API credentials are configured, please refer to their [configuration documentation](https://openml.github.io/openml-python/master/usage.html#installation-set-up). -Results obtained on tasks on the test server (e.g. through the `--test-server` option of `runbenchmark.py`) are uploaded to the test server and don't require additional authentication. - -## Advanced configuration -If you need to create your own benchmark, add a framework, create a plugin for a proprietary framework, or simply want to use some advanced options (e.g. run some frameworks with non-default parameters), see the [HOWTO]. - -## Issues -If you face any issue, please first have a look at the [Troubleshooting guide] and check the [existing issues](https://github.com/openml/automlbenchmark/issues). -Any new issue should also be reported there. - - -[HOWTO]: ./HOWTO.md -[Troubleshooting guide]: ./HOWTO.md#troubleshooting-guide -[examples/aws]: ../examples/aws/config.yaml - -[Docker]: https://docs.docker.com/ -[boto3]: https://boto3.readthedocs.io/ - - -## Frequently Asked Questions - -**When will results be updated, also for the new/updated frameworks?** - -We don't perform a benchmark evaluation for each new package or update. -Due to budget constraints, we can only do a limited number of evaluations. -The next full evaluation will be performed before the end of the year 2020. -We hope to find funding to guarantee regular evaluations. - ---- -**(When) will you add framework X?** - -We are currently not focused on integrating additional AutoML systems. 
-However, we process any pull requests that add frameworks and will assist with the integration. -The best way to make sure framework X gets included is to start with the integration yourself or encourage the package authors to do so (for technical details see [HOWTO]). - -It is also possible to open a Github issue indicating the framework you would like added. -Please use a clear title (e.g. "Add framework: X") and provide some relevant information (e.g. a link to the documentation). -This helps us keep track of which frameworks people are interested in seeing included. diff --git a/docs/about.md b/docs/about.md deleted file mode 100644 index c63eaa53c..000000000 --- a/docs/about.md +++ /dev/null @@ -1,50 +0,0 @@ ---- -title: About -layout: category -sidebar_sort_order: 10 ---- - -## Goals - -We want to provide an ongoing benchmark with up-to-date results on realistic and current machine learning problems. -By making it open-source and open to contributions, we hope that all packages will be used as intended and evaluated fairly. -Fair results for each framework are enabled by allowing authors to contribute directly to the repository. -To ensure the benchmark accurately reflects the state of AutoML, evaluations will be rerun when frameworks get major updates, -and the selection of problems will be updated1. - -Currently, we limit the datasets to involve single-label classification problems on i.i.d. tabular data optimizing for one of two metrics. -We would like to extend the types of tasks to include e.g. regression, multi-label classification and temporal data, -but also to include problem-specific metrics (e.g. have a false negative incur a higher cost than a false positive for a disease diagnosis problem). - -## Open Science -Open science is important to us. -This is a transparent benchmark: no favorites, no cheating. -We require that all evaluated AutoML systems are open-source and all data to be freely available on [OpenML](https://www.openml.org/). -All the code required to run the benchmark is available on [Github](https://github.com/openml/automlbenchmark). - -## Limitations -It is important to note that the current benchmark has some limitations. - -First, we evaluate the AutoML systems by their default settings, only specifying the resources to be used (number of cores, wallclock time and memory). -We do not tune their search space or optimization hyperparameters, even though all packages allow at least some tuning. -There are of course valid reasons to tune these settings, such as only allowing a subset of models that are most interpretable. -However, in a general sense we feel that requiring tuning of AutoML frameworks defeats the purpose of AutoML, and thus opt not to do so. -That said, tuning the search space or hyperparameter values may drastically change the results. -Our hope is that authors of AutoML packages put more thought in picking good default settings, possibly dependent on the task at hand. -Over time, we hope this becomes a non-issue. - -We must stress that this benchmark does *not* tell us what optimization technique is best. -For each package, the search space from which to construct a model is very different. -These differences are caused by many design differences. -These are differences in their representation of machine learning pipelines (e.g. fixed-length vs. unlimited-length), -by the underlying machine learning packages (e.g. scikit-learn vs. WEKA), -and even the selection of included algorithms and allowed hyperparameter values. 
-Finally some packages use meta-learning for warm-starting, or post-processing techniques to improve results. - -There are also qualities of frameworks which are not evaluated. -Perhaps the most interesting one is the convergence rate, or how good the any-time stopping performance is of each framework along the optimization process. -But other qualities, such as ease of use or level of support can also be important to some users. - - ---- -1 Due to the high (computational) cost involved, we need to find a balance here. \ No newline at end of file diff --git a/docs/automl_overview.md b/docs/automl_overview.md deleted file mode 100644 index 890bcfed5..000000000 --- a/docs/automl_overview.md +++ /dev/null @@ -1,334 +0,0 @@ ---- -layout: category -title: AutoML Systems -sidebar_sort_order: 2 ---- - -There is more to an AutoML system than just its performance. -An AutoML framework may only be available through an API for a specific programming language, while others can work stand-alone. -Some systems might output models which can be used without further dependency on the AutoML package, -in other cases the AutoML system is still required to use the model. -Some systems might be developed with a specific domain in mind. -When choosing an AutoML system, it is essential to consider things that are important to you. - -On this page a brief description and further references for the AutoML systems in the benchmark is provided. - -List of AutoML systems in the benchmark, in alphabetical order: - -- [auto-sklearn](#auto-sklearn) -- [Auto-WEKA](#auto-weka) -- [H2O AutoML](#h2o-automl) -- [TPOT](#tpot) - -There are many more AutoML frameworks, and unfortunately we could not yet evaluate them all. -While we hope to cover them in the comparison in the future, for now we will -Some other frameworks worth mentioning are, again in alphabetical order: - -- [autoxgboost](#autoxgboost) -- [FLAML](#flaml) -- [GAMA](#gama) -- [hyperopt-sklearn](#hyperopt-sklearn) -- [ML-Plan](#ml-plan) -- [mlr3automl](#mlr3automl) -- [oboe](#oboe) - -For completeness, the baseline methods are also described: - -- [Constant Predictor](#constant-predictor) -- [Random Forest](#random-forest) -- [Tuned Random Forest](#tuned-random-forest) - -##### Statement To Authors -We did our best to provide a reasonable description which highlights some unique or important aspects of each package. -If you want to change or add to the description and references of your AutoML package, please submit a pull request with your proposed changes. - -The description needs to be kept brief and factual. -The goal is to get an impression, based on which the reader can delve more in-depth in the provided documentation. - -If your AutoML framework is not on this page and feel it should be, please open a PR with the proposed addition. -Keep the formatting consistent with the rest of the page. - ------ - -# Included AutoML Frameworks - -## auto-sklearn -[source](https://github.com/automl/auto-sklearn) | -[documentation](http://automl.github.io/auto-sklearn/stable/) | -Python | -Optimization: Bayesian Optimization | -3-clause BSD - -> auto-sklearn is an automated machine learning toolkit and a drop-in replacement for a scikit-learn estimator. - -Auto-sklearn is declared the overall winner of the [ChaLearn AutoML](http://automl.chalearn.org/) Challenge -[1](https://docs.google.com/a/chalearn.org/viewer?a=v&pid=sites&srcid=Y2hhbGVhcm4ub3JnfGF1dG9tbHxneDoyYThjZjhhNzRjMzI3MTg4) -in 2015-2016 and -[2](https://www.4paradigm.com/competition/pakdd2018) -in 2017-2018. 
-It provides a scikit-learn-like interface in Python and uses Bayesian optimization to find good machine learning pipelines. - -It features automatic ensemble construction. -Meta-learning is used to warm-start the search procedure, this means that the search is more likely to start with good pipelines. - -#### Papers - -Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum and Frank Hutter (2015). -[Efficient and Robust Automated Machine Learning](http://papers.nips.cc/paper/5872-efficient-and-robust-automated-machine-learning.pdf) -*Advances in Neural Information Processing Systems 28 (NIPS 2015)*. - -## Auto-WEKA -[source](https://github.com/automl/autoweka) | -[documentation](http://www.cs.ubc.ca/labs/beta/Projects/autoweka/manual.pdf) | -Java, CLI, GUI | -Optimization: Bayesian Optimization | -GPLv3 - -> Our hope is that Auto-WEKA will help non-expert users to more effectively identify machine learning algorithms and -> hyperparameter settings appropriate to their applications, and hence to achieve improved performance. - -Auto-WEKA is built on the Java machine learning package [WEKA](http://www.cs.waikato.ac.nz/ml/weka/). -Auto-WEKA can be used through a graphical user interface, which means there is no need to use a terminal or programming language. -It is one of the first systems to consider joint algorithm selection and hyperparameter optimization in addition to preprocessing steps. - - - -#### Papers - -Lars Kotthoff, Chris Thornton, Holger Hoos, Frank Hutter, and Kevin Leyton-Brown (2017). -[Auto-WEKA 2.0: Automatic model selection and hyperparameter optimization in WEKA](http://www.cs.ubc.ca/labs/beta/Projects/autoweka/papers/16-599.pdf) -*JMLR. 18(25):1−5, 2017* - -Chris Thornton, Frank Hutter, Holger Hoos, and Kevin Leyton-Brown (2013). -[Auto-WEKA: Combined Selection and Hyperparameter Optimization of Classification Algorithms](http://www.cs.ubc.ca/labs/beta/Projects/autoweka/papers/autoweka.pdf) -*Proceedings of KDD 2013*. - - -## H2O AutoML -[source](https://github.com/h2oai/h2o-3) | -[documentation](http://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html) | -Python, R | -Optimization: Random Search | -Apache-2.0 - -> H2O’s AutoML can be used for automating the machine learning workflow, -> which includes automatic training and tuning of many models within a user-specified time-limit. - - -H2O AutoML performs Random Search followed by a stacking stage. -By default it uses the H2O machine learning package, which supports distributed training. - -#### Papers - -\- - - -## TPOT -[source](https://github.com/EpistasisLab/tpot) | -[documentation](https://epistasislab.github.io/tpot/) | -Python, CLI | -Optimization: Genetic Programming | -LGPL-3.0 - -> Consider TPOT your Data Science Assistant. -> TPOT is a Python Automated Machine Learning tool that optimizes machine learning pipelines using genetic programming. - -TPOT provides a scikit-learn-like interface for use in Python, but can be called from the command line as well. -It constructs machine learning pipelines of arbitrary length using scikit-learn algorithms and, optionally, xgboost. -In its search, preprocessing and stacking are both considered. -After the search, it is able to export python code so that you may reconstruct the pipeline without dependencies on TPOT. - -While technically pipelines can be of any length, TPOT performs multi-objective optimization: -it aims to keep the number of components in the pipeline small while optimizing the main metric. 
-TPOT features support for sparse matrices, multiprocessing and custom pipeline components. - -#### Papers - -Randal S. Olson, Ryan J. Urbanowicz, Peter C. Andrews, Nicole A. Lavender, La Creis Kidd, and Jason H. Moore (2016). -[Automating biomedical data science through tree-based pipeline optimization](http://dx.doi.org/10.1007/978-3-319-31204-0_9). -*Applications of Evolutionary Computation*, pages 123-137. - -Randal S. Olson, Nathan Bartley, Ryan J. Urbanowicz, and Jason H. Moore (2016). -[Evaluation of a Tree-based Pipeline Optimization Tool for Automating Data Science](http://doi.acm.org/10.1145/2908812.2908918). -*Proceedings of GECCO 2016*, pages 485-492. - - - -# Other AutoML Frameworks - -## autoxgboost -[source](https://github.com/ja-thomas/autoxgboost) | -[documentation](https://github.com/ja-thomas/autoxgboost/blob/master/poster_2018.pdf) | -R | -Optimization: Bayesian Optimization | - - -> autoxgboost aims to find an optimal xgboost model automatically using the machine learning framework mlr and the bayesian optimization framework mlrMBO. - -Autoxgboost is different from most frameworks on this page in that it does not search over multiple learning algorithms. -Instead, it restricts itself to finding a good hyperparameter configuration for xgboost. -The exception to this is a preprocessing step for categorical variables, where the specific encoding strategy to use is tuned as well. - -#### Papers - -Janek Thomas, Stefan Coors and Bernd Bischl (2018). -[Automatic Gradient Boosting](https://arxiv.org/pdf/1807.03873v2.pdf) -*International Workshop on Automatic Machine Learning at ICML 2018* - -## FLAML -[source](https://github.com/microsoft/FLAML) | -[documentation](https://microsoft.github.io/FLAML/) | -Python | -Optimization: Configurable | -License MIT - -> FLAML is a lightweight Python library that finds accurate machine learning models efficiently and economically. - -FLAML is powered by a new, cost-effective hyperparameter optimization and learner selection method invented by Microsoft Research. FLAML leverages the structure of the search space to choose a search order optimized for both cost and error. -FLAML is fast and economical. The simple and lightweight design makes it easy to extend, such as adding customized learners or metrics. - -#### Papers - -Chi Wang, Qingyun Wu, Markus Weimer, and Erkang Zhu (2021). -[FLAML: A Fast and Lightweight AutoML Library](https://www.microsoft.com/en-us/research/publication/flaml-a-fast-and-lightweight-automl-library/) -*Proceedings of MLSys 2021* - -Qingyun Wu, Chi Wang, and Silu Huang (2021). -[Frugal Optimization for Cost-related Hyperparameters](https://www.microsoft.com/en-us/research/publication/frugal-optimization-for-cost-related-hyperparameters/) -*Proceedings of AAAI 2021* - -Chi Wang, Qingyun Wu, Silu Huang, and Amin Saied (2021). -[Economical Hyperparameter Optimization With Blended Search Strategy](https://www.microsoft.com/en-us/research/publication/economical-hyperparameter-optimization-with-blended-search-strategy/) -*The Ninth International Conference on Learning Representations (ICLR 2021)* - -## GAMA -[source](https://github.com/PGijsbers/gama) | -[documentation](https://pgijsbers.github.io/gama/) | -Python | -Optimization: Configurable | -License MIT - -> GAMA is an AutoML tool for end-users and AutoML researchers with a configurable AutoML pipeline. - -GAMA is a new framework under active development. 
-GAMA supports AutoML researchers through a configurable AutoML pipeline, extensive logging and visualization of the logs. -The configurable AutoML pipeline allows selection of the optimization and post-processing algorithms. - -By default GAMA searches over linear machine learning pipelines and create an ensemble of them as a post-processing step. -Currently pipelines can be optimized with an asynchronous evolutionary algorithm or [ASHA](https://arxiv.org/abs/1810.05934). - -#### Papers - -Pieter Gijsbers, Joaquin Vanschoren (2019). -[GAMA: Genetic Automated Machine learning Assistant](https://joss.theoj.org/papers/10.21105/joss.01132). -*Journal of Open Source Software, 4(33), 1132* - -## hyperopt-sklearn -[source](https://github.com/hyperopt/hyperopt-sklearn) | -[documentation](http://hyperopt.github.io/hyperopt-sklearn/) | -Python | -Optimization: Random Search, various SMBO | -3-clause BSD - -> Hyperopt-sklearn is Hyperopt-based model selection among machine learning algorithms in scikit-learn. - -Hyperopt-sklearn allows for different search strategies through a scikit-learn-like interface. -Besides random search, various sequential model based optimization (SMBO) techniques are available. -Amongst these are Tree of Parzen Estimators (TPE), Annealing and Gaussian Process Trees. - -#### Papers - -Komer, Brent, James Bergstra, and Chris Eliasmith (2014). -[Hyperopt-sklearn: automatic hyperparameter configuration for scikit-learn.](http://compneuro.uwaterloo.ca/files/publications/komer.2014b.pdf) -*ICML workshop on AutoML 2014*. - -## ML-Plan -[source](https://github.com/starlibs/AILibs) | -[documentation](https://starlibs.github.io/AILibs/projects/mlplan/) | -Java | -Optimization: Best-First Search on a search graph induced through Hierachical Task Network Planning | AGPL-3.0 - -> a new approach to AutoML based on hierarchical planning - -ML-Plan organizes the search space of possible solution candidates via Hierarchical Task Network (HTN) planning. -It works with both WEKA and scikit-learn backends and can be used to deal with classification, regression, multi-label classification, and remaining useful lifetime estimation tasks. -ML-Plan is under active development. - -#### Papers - -Felix Mohr, Marcel Wever and Eyke Hüllermeier (2018). -[ML-Plan: Automated machine learning via hierarchical planning](https://link.springer.com/article/10.1007/s10994-018-5735-z) -*Machine Learning 107(8):1495–1515* - -Marcel Wever, Felix Mohr and Eyke Hüllermeier (2018). -[ML-Plan for Unlimited-Length Machine Learning Pipelines](https://ris.uni-paderborn.de/download/3852/3853/38.pdf) -* ICML workshop on AutoML 2018*. - -Marcel Wever, Felix Mohr and Eyke Hüllermeier (2018). -[Automated Multi-Label Classification based on ML-Plan](https://arxiv.org/abs/1811.04060) -*arXiv preprint* - -Marcel Wever, Felix Mohr, Alexander Tornede and Eyke Hüllermeier (2019). -[Automating Multi-Label Classification Extending ML-Plan](https://ris.uni-paderborn.de/download/10232/13177/Automating_MultiLabel_Classification_Extending_ML-Plan.pdf) -* ICML workshop on AutoML 2019*. - -## mlr3automl -[source](https://github.com/a-hanf/mlr3automl) | -[documentation](https://github.com/a-hanf/mlr3automl/blob/master/vignettes/mlr3automl.md) | -R | -Optimization: Hyperband | License LGPL-3.0 - -> mlr3automl combines a static portfolio with Hyperband tuning. - -mlr3automl is built on top of mlr3. It combines a static portfolio of known successful pipelines -with Hyperband tuning. 
mlr3automl currently supports classification and regression tasks. - -#### Papers -\- - -## OBOE -[source](https://github.com/udellgroup/oboe) | -[documentation](https://github.com/udellgroup/oboe) | -Python | -Optimization: Collaborative Filtering | -License N/A - -> Oboe is a data-driven Python algorithmic system for automated machine learning, and is based on matrix factorization and classical experiment design. - -OBOE is still in early stages of development. -It focuses on finding a good initial set of pipelines from which to start further optimization. -The focus is on time-constrained model selection and hyperparameter tuning, using meta-learning to find good pipelines. - -OBOE searches for a good set of algorithm configurations to create an ensemble from, using meta-learning. -With collaborative filtering they estimate which algorithms are likely to do well on the new dataset. - -#### Papers - -Chengrun Yang, Yuji Akimoto, Dae Won Kim, Madeleine Udell (2018). -[OBOE: Collaborative Filtering for AutoML Initialization](https://arxiv.org/pdf/1808.03233.pdf). -*arXiv preprint*. - - -## Baselines - -We compare the performance of AutoML frameworks not only to each other, but also to three baseline methods, these are: - -## Constant Predictor -[source](https://github.com/openml/automlbenchmark/tree/master/frameworks/constantpredictor) - -Always predicts the class probabilities according to their occurrence in the dataset. - -## Random Forest -[source](https://github.com/openml/automlbenchmark/tree/master/frameworks/RandomForest) - -The [Random Forest Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html) of scikit-learn 0.20. -All hyperparameters are set to their default value, except for the number of estimators, which is set to *2000*. - -## Tuned Random Forest -[source](https://github.com/openml/automlbenchmark/tree/master/frameworks/TunedRandomForest) - -Uses the Random Forest setup as described above, but first optimizes the hyperparameter `max_features`. -It tries up to *11* different values of `max_features`. -Five values uniformly picked from `[1, sqrt(p))`, five values from `(sqrt(p), p]` and finally `sqrt(p)`, where `p` if the number of features in the dataset. - -It first evaluates `max_features=sqrt(p)` and then evaluates the other values in ascending order, until it completes them all or runs out of time. -Finally the model is fit to the entire training dataset with the best value for `max_features` according to the above cross-validation results. diff --git a/docs/benchmark_datasets.md b/docs/benchmark_datasets.md deleted file mode 100644 index b0ebfd36d..000000000 --- a/docs/benchmark_datasets.md +++ /dev/null @@ -1,61 +0,0 @@ ---- -layout: category -title: Benchmark Datasets -sidebar_sort_order: 3 ---- - -The benchmark aims to consist of datasets that represent real-world data science problems. -This means we want to include datasets of all sizes (including *big* ones), of different problem domains and with various levels of difficulty. - -We also want to prevent AutoML tools from overfitting to our benchmark. -For this reason, we plan to change the selection of benchmark problems over time. -This should help prevent (some of the) bias that can be introduced by static benchmarks. - -In our selection for the [paper](#paper.md), we drew datasets from [OpenML100](https://www.openml.org/s/14), [OpenML-CC18](https://www.openml.org/s/98) and [AutoML Challenges](http://automl.chalearn.org/data). 
-However, we did not include all datasets. -One reason was that some did not meet our criteria (more on that below), another that we wanted to keep some datasets of the future. -There are also a few datasets which we wanted to include, but could not include in the paper due to time constraints. - -## Criteria -As stated before, we did not adopt all proposed datasets but made a selection. -Our criteria for adopting a dataset were as follows: - -**difficulty** of the dataset has to be a sufficient. -If a problem is easily solved by just about any algorithm, it will not be able to differentiate the various AutoML frameworks. -This was the case for many of the OpenML 100 problems (see e.g. [this Github Issue](https://github.com/openml/OpenML/issues/491)), -but also some of the OpenML-CC18 problems (see e.g. [this task](https://www.openml.org/t/15)). - -**representative of real-world** data science problems to be solved with the tool. -In particular we **limit artificial** problems. -We included some, either based on their widespread use ([kr-vs-kp](https://www.openml.org/d/3)) or because they pose difficult problems. -But we do not want them to be a large part of the benchmark. -We also **limit image problems** because those problems are typically solved with solutions in the deep learning domain. -However they still make for realistic, interesting and hard problems, so we did not want to exclude them altogether. - -**diversity** in the problem domains. -We do not want the benchmark to skew towards any domain in particular. -There are various software quality problems in the OpenML-CC18 ( -[jm1](https://www.openml.org/d/1053), -[kc1](https://www.openml.org/d/1067), -[kc2](https://www.openml.org/d/1063), -[pc1](https://www.openml.org/d/1068), -[pc3](https://www.openml.org/d/1050), -[pc4](https://www.openml.org/d/1049)), but adopting them all would lead to a bias in the benchmark to this domain. - -*We want to note however that being notified of new interesting problems in a domain that is already well-represented is still useful, -because we want to eventually replace datasets in the benchmark.* - -**miscellaneous** reasons to *exclude* a dataset included label-leakage, near-duplicates (e.g. different only in categorical encoding or imputation) or violation of the i.i.d. assumption. - - - -## Final List -The first iteration of our benchmark as presented in the paper contained 39 classification datasets. -For the full list of datasets and their characteristics see [OpenML Study 218](https://www.openml.org/s/218) or its [table view](https://www.openml.org/search?q=tags.tag%3Astudy_218&type=data&table=1&size=39). - -## The Future -As stated before, we want the selection of benchmark problems to change over time. -If you find a good candidate dataset, you can [help us make it part of the benchmark](extending.md#adding-a-dataset). -While we are interested in all interesting datasets that match our criteria, we are particularly interested in bigger datasets (>100k rows). - -We greatly appreciate any help to find new and interesting problems for the AutoML benchmark. \ No newline at end of file diff --git a/docs/bib_workshop.md b/docs/bib_workshop.md deleted file mode 100644 index 5eeb9a23d..000000000 --- a/docs/bib_workshop.md +++ /dev/null @@ -1,14 +0,0 @@ ---- -title: BibTeX - AutoML @ ICML 2019 Paper ---- -``` -@article{amlb2019, - title={An Open Source AutoML Benchmark}, - author={Gijsbers, P. and LeDell, E. and Poirier, S. and Thomas, J. and Bischl, B. 
and Vanschoren, J.}, - journal={arXiv preprint arXiv:1907.00909 [cs.LG]}, - url={https://arxiv.org/abs/1907.00909}, - note={Accepted at AutoML Workshop at ICML 2019}, - year={2019} -} - -``` \ No newline at end of file diff --git a/docs/documentation.md b/docs/documentation.md deleted file mode 100644 index 45a863e88..000000000 --- a/docs/documentation.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -layout: category -title: Documentation -sidebar_sort_order: 4 ---- - - - [Running the Benchmark](README.md#running-benchmarks) - - [Adding a dataset](extending.md#adding-a-dataset) - - [Adding an AutoML Framework](extending.md#adding-an-automl-framework) \ No newline at end of file diff --git a/docs/extending.md b/docs/extending.md deleted file mode 100644 index 33aa01048..000000000 --- a/docs/extending.md +++ /dev/null @@ -1,98 +0,0 @@ ---- -title: Extending the benchmark ---- - -Whether you want to add a dataset or a framework to the benchmark, you will first have to [fork our repository](https://help.github.com/en/articles/fork-a-repo). -By forking our repository, you can make and test changes without affecting the benchmark. -If you feel your changes should be included in the benchmark, set up a [pull request](https://help.github.com/en/articles/about-pull-requests). -When creating a pull request, indicate clearly the changes and why they are made. - -## Adding a dataset - -### What makes a good dataset -Before discussing on *how* to add your dataset to the benchmark, we want to briefly elaborate on what we think makes for an interesting dataset. - -In our benchmark we aim to include machine learning problems which are representative of those encountered in practice. -In particular, problems of different domains, mixed data types and dataset sizes. -Currently, we would love some additional *big* datasets. - -Another important aspect for inclusion in the benchmark would be that it is a hard problem. -Even if the data is interesting, if a (near-)perfect model can be created with a decision tree, it is not going to be useful to profile the AutoML systems with. -Ideal datasets are those where only certain algorithms (with certain hyperparameter configurations) work, or require non-trivial data preprocessing. -Running a few different algorithms, with and without preprocessing, with different configurations, is encouraged to show the problem is sufficiently difficult. - -Perhaps your dataset does not match with the above description, or you lack the resources or know-how to evaluate the problem with different machine learning approaches. -If you think the problem is interesting regardless, do not hesitate to contact us anyway. -If possible, do this through a pull request as laid out in the following sections. -Otherwise, open an [issue](https://github.com/openml/automlbenchmark/issues). -Please title the issue '[DATA ADD] DATASETNAME' (replacing 'DATASETNAME' with the name of your dataset), -provide a link to the dataset on OpenML as well as motivation as to why you think the dataset is an interesting addition. -Following the steps below will make it more likely that we'll be able to review (and add) the dataset quickly. - -### Uploading to OpenML -To add a dataset to the benchmark, it needs to be uploaded to OpenML. -This requires the dataset in [ARFF format](https://www.cs.waikato.ac.nz/ml/weka/arff.html). 
-Read [here](https://docs.openml.org/#data) for more information on OpenML data, -and [here](https://www.openml.org/new/data) on how to actually upload it (this requires you to [sign up](https://www.openml.org/register) for OpenML). - -After uploading the dataset, visit its page on OpenML and create a [task](https://docs.openml.org/#tasks) for it. -An OpenML task specifies the evaluation procedure (e.g. splits of a 10-fold cross-validation) and the target of the problem. -To create a task for your OpenML dataset, visit its webpage and find the 'define new task' button at the bottom. -After these steps we are ready to add the problem to a benchmark. - -### Testing the task -First, to make sure everything was set up right, create a single-problem benchmark. -The easiest is to modify the [example benchmark](https://github.com/openml/automlbenchmark/blob/master/resources/benchmarks/example.yaml) by replacing the iris task information with your own. -Then run the benchmark: `python runbenchmark.py constantpredictor_enc example`. - - -Check results for errors. -If your task fails and it is unclear why, you can open an [issue](https://github.com/openml/automlbenchmark/issues). -If you do, please clearly indicate the related OpenML task id and steps to recreate it -and title the issue '[DATA HELP] DATASETNAME', replacing 'DATASETNAME' with the name of your dataset. - -### Adding it to the real thing -If you've made sure everything works, modify one of the existing benchmark or create a new one with your task. -When extending an existing benchmark, make sure not to modify any of the existing problems for the task. -Finally commit your changes and set up a pull request. - - -**Please make sure the PR does not include the changes made to `example.yaml`** - - -In your PR include: - - a link to the task and dataset on OpenML, where the OpenML dataset has meaningful meta-data (e.g. description) - - a motivation as to why this is an interesting addition to the benchmark. - Preferably address the points from the [What makes a good dataset](#what-makes-a-good-dataset) section. - The higher quality your motivation, the better we can come to a conclusion on whether to include the dataset or not. - - -## Adding an AutoML framework - -To add a new framework, create a new folder in the [frameworks folder](https://github.com/openml/automlbenchmark/tree/master/frameworks) (`/frameworks`). -In the package include at least a `__init__.py` file which exposes the method `run(Dataset, TaskConfig)` and optionally also `setup(*args)` and/or `docker_commands()` as documented [here](https://github.com/openml/automlbenchmark/blob/master/frameworks/__init__.py). - -For an example using a python-based AutoML tool, see e.g. the [TPOT](https://github.com/openml/automlbenchmark/tree/master/frameworks/TPOT) folder. -For an example using a non-python-based AutoML tool, see e.g. the [Auto-WEKA](https://github.com/openml/automlbenchmark/tree/master/frameworks/AutoWEKA) folder. - -Note that, as can be seen in the TPOT example, imputing the data before passing it to the framework is (currently) allowed. -The data is available in its regular form, but also in a numeric-only form (where string values are encoded with integers). - -Finally, add your framework to the [`framework.yaml`](https://github.com/openml/automlbenchmark/blob/master/resources/frameworks.yaml) file. 
-If at any point you run into issues or questions that aren't answered by the benchmark's documentation, -please open an [issue](https://github.com/openml/automlbenchmark/issues), the title of the issue should start with '[FW ADD]'. - -### Testing an AutoML framework - -To test if the implementation is successful, it is recommended to run the validation benchmark: -`python runbenchmark.py your_framework validation`. -This benchmark has tasks with a variety of interesting properties (e.g. missing values, different data types). - - -### Adding it to the real thing -If everything seems to work correctly, you're almost ready to set up a pull request. -But first, make sure you all documentation is up-to-date with your latest additions. -In particular, add or update the section on your AutoML framework in [the AutoML overview](https://github.com/openml/automlbenchmark/blob/master/docs/automl_overview.md). - -The title of your pull request when adding a new framework should be 'Add FRAMEWORK' where 'FRAMEWORK' should be replaced by the name of your framework. -If you are updating a framework, please title the pull request 'Update FRAMEWORK' similarly. \ No newline at end of file diff --git a/docs/extending/benchmark.md b/docs/extending/benchmark.md new file mode 100644 index 000000000..5331662fe --- /dev/null +++ b/docs/extending/benchmark.md @@ -0,0 +1,319 @@ +# Benchmark + +Benchmarks are collections of machine learning tasks, where each task is a dataset +with associated information on train/test splits used to evaluate the model. +These tasks can be defined in a `yaml` file or on [OpenML](https://www.openml.org). +Both options allow for defining a benchmark of one or more datasets. +It is even possible to reference to OpenML tasks from a benchmark file. + +!!! note "Supported Datasets" + + Currently, the AutoML benchmark only supports definitions of tabular datasets for + classification, regression, and time series forecasting. The time series forecasting + support is in an early stage, subject to change, and not supported through OpenML. + +## Defining a Benchmark on OpenML +Especially when performing a benchmark evaluation to be used in a publication, we +recommend the use of OpenML for the definition of the benchmark if possible. This +ensures that other users can run your benchmark out of the box, without any required +additional files. OpenML also provides a lot of meta-data about the datasets which is +also accessible through [APIs](https://www.openml.org/apis) in various programming +languages. We recommend using the [`openml-python`](https://openml.github.io/openml-python) +Python library as it is the most comprehensive of the OpenML libraries. + +Defining a benchmark on OpenML requires the following steps: + + - [Upload a dataset](https://openml.github.io/openml-python/main/examples/30_extended/create_upload_tutorial.html#sphx-glr-examples-30-extended-create-upload-tutorial-py). + A dataset is the tabular data, alongside meta-data like its name, + authors, and license. OpenML will also automatically extract meta-data about the + datasets, such as feature types, class balance, or dataset size. After uploading the + dataset, it will receive an identifier (`ID`) and should be visible on the OpenML + website on `www.openml.org/d/ID`. + - [Define a task](https://openml.github.io/openml-python/main/generated/openml.tasks.create_task.html#openml.tasks.create_task). + A task defines how to evaluate a model on a given dataset, for example + "10-fold cross-validation optimizing AUC". 
OpenML will generate splits for the 10-fold + cross-validation procedure which means that anyone using this task definition can + perform the experiment with the exact same splits easily. + - [Define a benchmark suite](https://openml.github.io/openml-python/main/examples/30_extended/suites_tutorial.html#sphx-glr-examples-30-extended-suites-tutorial-py). + On a technical level, a benchmarking suite is nothing more than a collection of tasks. + You can add a description that details the purpose of the benchmarking suite, or any + information that users should be aware of before using the suite. + +When a task or benchmark suite is available on OpenML, it can be directly referred to +for the `benchmark` parameter of `runbenchmark.py` as `openml/s/ID` for suites and +`openml/t/ID` for tasks, where `ID` is to be replaced with the OpenML identifier of the +object. For example, `openml/t/59` refers to [task 59](https://www.openml.org/t/59), +which is 10-fold cross-validation on the [iris dataset](https://www.openml.org/d/61). + +## Defining a Benchmark with a File + +When defining a benchmark with a `yaml` file, the `yaml` will contain information about +tasks that are located either on disk or on OpenML. We make a few default benchmarks +available in our [`resources/benchmarks`](GITHUB/resources/benchmarks) folder: + + * `test`: a collection of three small datasets covering regression, binary classification, + and multiclass classification. This makes it incredibly useful for small tests and + fast feedback on whether the software runs without error. + * `validation`: a collection of datasets which have different edge cases, such as a + very wide dataset, datasets with missing or non-numerical values, and more. This + typically produces most errors you might also encounter when running larger + benchmarks. + * `timeseries`: a benchmark for testing time series forecasting integration (experimental). + +Below is an excerpt from the `test.yaml` file: + +```yaml +- name: kc2 + openml_task_id: 3913 + description: "binary test dataset" +``` + +When writing your own benchmark definition, it needs to be discoverable by the benchmark. +A good place to do this would be adding a `benchmarks` directory to your benchmark +configuration directory (`~/.config/automlbenchmark` by default) and updating your +[custom configuration](../../using/configuration/#custom-configurations) by adding: + +```yaml +benchmarks: + definition_dir: + - '{root}/resources/benchmarks' + - '{user}/resources/benchmarks' +``` + +Each task must have a name that is unique in the definition file (case-insensitive), +this name will also be used as identifier (e.g., in the results files). +Additionally, the file must have a description of where to find the dataset files +and splits. When you have a task already on OpenML, you can directly reference it with +`openml_task_id` to define the dataset and splits. Alternatively, you can use local files. + +It is also possible to benchmark your own datasets that you can not or do not want to +upload to OpenML. The data files should be in `arff` or `csv` format and contain at least +one file for training data and one file for test data. When working with multiple files, +it is useful to use an archive (`.zip`, `.tar`, `.tgz`, `.tbz`) or directory structure. 
+Use the following naming convention to allow the AutoML benchmark to infer what each file represents: + + - if there's only one file for training and one for test, they should be named `{name}_train.csv` and `{name}_test.csv` (in case of CSV files). + - if there are multiple `folds`, they should follow a similar convention: `{name}_train_0.csv`, `{name}_test_0.csv``, {name}_train_1.csv`, `{name}_test_1.csv`, ... + +Examples: + +=== "Single Fold CSV" + + ```yaml + - name: example_csv + dataset: + train: /path/to/data/ExampleTraining.csv + test: /path/to/data/ExampleTest.csv + target: TargetColumn + folds: 1 + ``` + +=== "Multiple Folds CSV" + + ```yaml + - name: example_multi_folds + dataset: + train: + - /path/to/data/ExampleTraining_0.csv + - /path/to/data/ExampleTraining_1.csv + test: + - /path/to/data/ExampleTest_0.csv + - /path/to/data/ExampleTest_1.csv + target: TargetColumn + folds: 2 + ``` + +=== "Directory" + + It is important that the files in the directory follow the naming convention described above. + + ```yaml + - name: example_dir + dataset: + path: /path/to/data + target: TargetColumn + folds: 1 + ``` + +=== "Archive" + + It is important that the files in the archive follow the naming convention described above. + + ```yaml + - name: example_archive + dataset: + path: /path/to/archive.zip + target: TargetColumn + folds: 3 + ``` + +=== "Remote Files" + + The remote file may also be an archive. If that is the case, it is important that + the files in the archive follow the naming convention described above. + + ```yaml + - name: example_csv_http + dataset: + train: https://my.domain.org/data/ExampleTraining.csv + test: https://my.domain.org/data/ExampleTest.csv + target: TargetColumn + folds: 1 + ``` + + Remote files are downloaded to the `input_dir` folder and archives are decompressed + there as well. You can change the value of this folder in your + [custom config.yaml file](../../using/configuration/#custom-configurations) + or specify it at the command line with the `-i` or `--indir` argument + (by default, it points to the `~/.openml/cache` folder). + + +The `target` attribute is optional but recommended. If not set, it will resolve to the +column `target` or `class` if present, and the last column otherwise. + +You can even make use of the [special directives](../../using/configuration/#custom-configurations) like `{user}`. + +```yaml +- name: example_relative_to_user_dir + dataset: + train: "{user}/data/train.csv" + test: "{user}/data/test.csv" +``` + +After creating a benchmark definition, e.g. `~/.config/automlbenchmark/benchmarks/my_benchmark.yaml`, +it can then be referenced when running `runbenchmark.py`: `python runbenchmark.py FRAMEWORK my_benchmark`. + +## Defining a Time Series Forecasting Dataset + +!!! warning "Time Series Forecasting should be considered experimental" + + Time series forecasting support should be considered experimental and is currently + only supported with the AutoGluon integration. + +Benchmark definitions for time series datasets work in much the same way, but there are +some additional fields and requirements to a valid time series dataset. + +First, the dataset must be stored as a single csv file in +[long format](https://doc.dataiku.com/dss/latest/time-series/data-formatting.html#long-format) +and must include 3 columns: + + - `id_column`: An indicator column that specifies to which time series the sample belongs by a unique id. + The default expected name of this column is "item_id". 
+ - `timestamp_column`: A column with the timestamp of the observation. + The default expected name of this column is "timestamp". + - `target`: A column with the target value of the time series + +Additionally, the data must satisfy the following criteria: + + - The shortest time series in the dataset must have length of at least `folds * forecast_horizon_in_step + 1` (see [Generated Folds](#generated-folds)). + - Time series may have different lengths or have different starting timestamps, + but must have the same frequency. + - All time series must have regular timestamp index, i.e., it must have an observation + for each time step from start to end. + +If the `id_column` or `timestamp_column` names are not the default expected ones, +they must be explicitly stated in the definition, as can be seen in the examples below. +Moreover, the definition must also contain the following fields: + + - `path`: a local or remote path to the CSV file with time series data. + - `freq`: a [pandas-compatible frequency string](https://pandas.pydata.org/docs/user_guide/timeseries.html#offset-aliases) + that denotes the frequency of the time series. For example, `D` for daily, `H` for hourly, or `15min` for 15-minute frequency. + - `forecast_horizon_in_steps`: a positive integer denoting how many future time series values need to be predicted. + - `seasonality`: a positive integer denoting the seasonal period of the data, measured in steps. + This parameter is used for computing metrics like [mean absolute scaled error](https://en.wikipedia.org/wiki/Mean_absolute_scaled_error#Seasonal_time_series) (denoted as *m* on Wikipedia). + + +=== "Default Column Names" + + Given a file at `path.to/data.csv` that contains two time series with daily frequency, + `A` with three observations and `B` with four observations: + + | item_id | timestamp | target | + |---------|-----------|--------:| + | A | 2020-01-01| 2.0 | + | A | 2020-01-02| 1.0 | + | A | 2020-01-03| 5.0 | + | B | 2019-05-02| 8.0 | + | B | 2019-05-03| 2.0 | + | B | 2019-05-04| 1.0 | + | B | 2019-05-05| 9.0 | + + When we specify the fields outlined above, then the respective task definition may + look like the one below. Note that we do not have to specify `id_column` or + `timestamp_column` as their names match the default expected value. + + ```yaml + - name: example_time_series_data + dataset: + path: /path/to/data.csv + freq: D + forecast_horizon_in_steps: 1 + seasonality: 7 + target: target + folds: 1 + ``` + + + +=== "Non-default Column Names" + + Given a file at `path.to/data.csv` that contains two time series with daily frequency, + `A` with three observations and `B` with four observations. It is identical to + the example "default column values", but the header provides different column names: + + | Product | Date | Value | + |---------|-----------|--------:| + | A | 2020-01-01| 2.0 | + | A | 2020-01-02| 1.0 | + | A | 2020-01-03| 5.0 | + | B | 2019-05-02| 8.0 | + | B | 2019-05-03| 2.0 | + | B | 2019-05-04| 1.0 | + | B | 2019-05-05| 9.0 | + + When we specify the fields outlined above, then the respective task definition may + look like the one below. Note that we do *have to* specify `id_column` or + `timestamp_column` as their names do not match the default expected value. If left + unspecified, the benchmark tool will raise an error. 
+
+    ```yaml
+    - name: example_time_series_data
+      dataset:
+        path: /path/to/data.csv
+        freq: D
+        forecast_horizon_in_steps: 1
+        seasonality: 7
+        id_column: Product
+        timestamp_column: Date
+        target: Value
+      folds: 1
+    ```
+
+
+
+### Generated Folds
+
+AMLB automatically generates the train and test splits from the raw data depending
+on the chosen `forecast_horizon_in_steps` and `folds` parameters. Assuming
+`forecast_horizon_in_steps = K` and `folds = n`, and each time series has length `n * K`,
+the folds will be generated as follows:
+
+ rows | fold 0 | fold 1 | ... | fold (n-2) | fold (n-1)
+ -- | -- | -- | -- | -- | --
+ 1..K | train | train | ... | train | train
+ K..2K | train | train | ... | train | test
+ 2K..3K | train | train | ... | test |
+ ... | | | | |
+ (n-2)K..(n-1)K | train | test | | |
+ (n-1)K..nK | test | | | |
+
+As a consequence, the shortest time series in the dataset must have a length of at least
+`folds * forecast_horizon_in_steps + 1`.
+
+!!! warning "This is still batch learning!"
+
+    It is important to note that the model does not carry over between folds; for each fold,
+    the model is trained from scratch on the available training data. As such, it is
+    still batch learning, as opposed to [train-then-test](https://scikit-multiflow.readthedocs.io/en/stable/user-guide/core-concepts.html)
+    (or prequential) evaluation where a single model is continuously updated instead.
+ 
\ No newline at end of file
diff --git a/docs/extending/constraint.md b/docs/extending/constraint.md
new file mode 100644
index 000000000..7d6497d37
--- /dev/null
+++ b/docs/extending/constraint.md
@@ -0,0 +1,71 @@
+# Constraints
+
+Constraint definitions allow a set of common constraints to be applied to all tasks in
+a benchmark. Default constraint definitions are available in
+[`resources/constraints.yaml`](GITHUB/resources/constraints.yaml).
+When no constraint is specified at the command line, the `test` constraint definition is used by default.
+
+A constraint definition can consist of the following constraints:
+
+- `folds` (default=10): The number of folds to evaluate for the task. It has to be less than or equal to the number of folds defined by the task.
+- `max_runtime_seconds` (default=3600): maximum time in seconds for each individual fold of a benchmark task.
+  This parameter is usually passed directly to the framework. If it doesn't respect the
+  constraint, the application will abort the task after `2 * max_runtime_seconds`.
+  In any case, the _actual_ time used is always recorded and available in the results.
+- `cores` (default=-1): number of cores used for each AutoML task. If non-positive, it will try to use all cores.
+- `max_mem_size_mb` (default=-1): amount of memory assigned to each AutoML task.
+  If non-positive, the amount of memory is computed from the memory available to the OS.
+- `min_vol_size_mb` (default=-1): minimum amount of free space required on the volume. If non-positive, verification is skipped. If the requirement is not fulfilled, a warning message will be printed, but the task will still be attempted.
+- `ec2_volume_type`: The volume type to use for the task when using EC2 instances; if unset, it defaults to the value of `aws.ec2.volume_type` in your configuration file.
+
+!!! warning "Constraints are not enforced!"
+
+    These constraints are forwarded to the AutoML framework if possible but are
+    generally not enforced. Not all AutoML frameworks allow, e.g., memory limits
+    to be set, and not all implementations that do treat them as a hard constraint.
+    For that reason, only `max_runtime_seconds` is enforced as described above.
+    When benchmarking, it is advised to use an environment that mimics the given constraints.
+
+??? info "Constraints can be overridden by `benchmark`"
+
+    A benchmark definition can override constraints on a task level.
+    This is useful if you want to define a benchmark which has different constraints
+    for different tasks. The default "test" benchmark does this to limit runtime to
+    60 seconds instead of 600 seconds, which is useful to get quick results for its
+    small datasets. For more information, see [defining a benchmark](#ADD-link-to-adding-benchmark).
+
+
+When writing your own constraint definition, it needs to be discoverable by the benchmark.
+A good place to do this would be adding a `constraints.yaml` file to your benchmark
+configuration directory (`~/.config/automlbenchmark` by default) and updating your
+[custom configuration](../../using/configuration/#custom-configurations) by adding:
+
+```yaml
+benchmarks:
+  constraints_file:
+    - '{root}/resources/constraints.yaml'
+    - '{user}/constraints.yaml'
+```
+
+You can then define multiple constraints in your constraint file, for example:
+```yaml title="{user}/constraints.yaml"
+---
+
+test:
+  folds: 1
+  max_runtime_seconds: 120
+
+8h16c:
+  folds: 10
+  max_runtime_seconds: 28800
+  cores: 16
+  min_vol_size_mb: 65536
+  ec2_volume_type: gp3
+```
+
+The new constraints can now be passed on the command line when executing the benchmark:
+```bash
+python runbenchmark.py randomforest validation 8h16c
+```
+*Note*: The above example is _allowed_ to run for 8 hours, but will stop earlier as
+`RandomForest` stops early after training 2000 trees.
\ No newline at end of file
diff --git a/docs/extending/framework.md b/docs/extending/framework.md
new file mode 100644
index 000000000..ffd66f60d
--- /dev/null
+++ b/docs/extending/framework.md
@@ -0,0 +1,415 @@
+# Adding an AutoML Framework
+
+!!! warning "Rewrite in progress"
+
+    Most information on this page is accurate, and it should be complete enough to use.
+    However, it hasn't been updated to make use of `mkdocs-material` features, and
+    _might_ have some outdated examples. Contributions welcome.
+
+## Add an AutoML framework
+
+Adding an AutoML framework consists of several steps:
+
+ 1. create a Python module that will contain everything related to the integration of this framework.
+ 1. define the framework in a [Framework definition](#framework-definition) file.
+ 1. write some integration code
+    - to download/set up the framework dynamically: by convention, this is done by a `setup.sh` script defined in the module.
+    - to run the framework using the data and constraints/parameters provided by the benchmark application: by convention, this is done by an `exec.py` script in the module, but it may require more files depending on the framework, for example if it runs on Python, R, Java, ...
+
+
+### Framework definition
+
+The framework definition consists of an entry in a `yaml` file with the framework name and some properties:
+
+ 1. to describe the framework and define which version will be used: `project`, `version`.
+ 1. to indicate the Python module with the integration code: `module` or `extends`.
+ 1. to pass optional parameters to the framework and/or the integration code: `params`.
+
+Default framework definitions are defined in lexicographic order in the file `resources/frameworks.yaml`,
+where `version` should be set to `stable`, which will point dynamically to the most recent official release available.
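+
+For illustration, a minimal default definition could look like the sketch below. This is a
+hypothetical entry (the framework name, project URL and module are placeholders), not an
+actual entry from `resources/frameworks.yaml`:
+
+```yaml
+MyFramework:
+  module: frameworks.MyFramework     # integration module under the frameworks/ folder
+  project: https://github.com/example/myframework
+  version: 'stable'                  # default definitions point to the latest official release
+```
+
+With such an entry in place, the framework can be selected by name, e.g. `python runbenchmark.py myframework`.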
+ +Frameworks that offer the possibility to test cutting edge version (e.g. nightly builds, +`dev`/`master` repo, ...) can add an entry to `resources/frameworks_latest.yaml`, where `version` should be set to `latest`. + +Maintainers of this repository try to regularly — ideally, every quarter — create a +framework definition using frozen framework versions in order to favour the reproducibility of the published benchmarks. + +Following the [custom configuration](../using/configuration.md#custom-configurations), +it is possible to override and/or add a framework definitions by creating a `frameworks.yaml` file in your `user_dir`. + +See for example the `examples/custom/frameworks.yaml`: + +```yaml +--- + +GradientBoosting: + module: extensions.GradientBoosting + project: https://scikit-learn.org/stable/modules/ensemble.html#gradient-boosting + params: + n_estimators: 500 + +Stacking: + module: extensions.Stacking + project: https://scikit-learn.org/stable/modules/ensemble.html#stacking + params: + _rf_params: {n_estimators: 200} + _gbm_params: {n_estimators: 200} + _linear_params: {penalty: elasticnet, loss: log} +# _svc_params: {tol: 1e-3, max_iter: 1e5} +# _final_params: {penalty: elasticnet, loss: log} # sgd linear + _final_params: {max_iter: 1000} # logistic/linear + +autosklearn_latest: + extends: autosklearn + version: latest + description: "this will use master branch from the autosklearn repository instead of the fixed version" + +autosklearn_mybranch: + extends: autosklearn + version: mybranch + description: "this will use mybranch branch from the autosklearn repository instead of the fixed version" + +autosklearn_oldgen: + extends: autosklearn + version: "0.7.1" + description: "this will use the latest autosklearn version from the old generation" + +H2OAutoML_nightly: + module: frameworks.H2OAutoML + setup_cmd: 'LATEST_H2O=`curl http://h2o-release.s3.amazonaws.com/h2o/master/latest` && pip install --no-cache-dir -U "http://h2o-release.s3.amazonaws.com/h2o/master/${{LATEST_H2O}}/Python/h2o-3.29.0.${{LATEST_H2O}}-py2.py3-none-any.whl"' + version: 'nightly' + +H2OAutoML_custom: + extends: H2OAutoML + params: + nfolds: 3 + stopping_tolerance: 0.05 +``` + +This example shows + +- the definitions for 2 new frameworks: `GradientBoosting` and `Stacking`. + - Those definitions (optionally) externalize some parameters (e.g. `n_estimators`): the `params` property always appears in json format in the results, so that we can clearly see what has been tuned when analyzing the results later. + - Note that the module is case sensitive and should point to the module containing the integration code. + - The application will search for modules from the sys path, which includes the application `root_dir` and the `user_dir`: + - that's why the default frameworks use `module: frameworks.autosklearn` for example, + - and the example above can use `module: extensions.GradientBoosting` because those examples must be run by setting the `user_dir` to `examples/config`, e.g. + > `python runbenchmark.py gradientboosting -u examples/custom`. +- a custom definition (`H2OAutoML_nightly`) for the existing `frameworks.H2OAutoML` module, allowing to reuse the module for a dynamic version of the module: + - the `setup_cmd` is executed after the default setup of the module, so it can be used to make additional setup. To customize the setup, it is possible to use: + - `setup_args: my_version` (only if the `setup.sh` in the framework module supports new arguments). + - `setup_cmd` (as shown in this example). 
+ - `setup_script: my_additional_setup.sh`. +- 2 custom definitions (`H2OAutoML_blending` and `H2OAutoML_custom`) simply extending the existing `H2OAutoML` definition (therefore inheriting from all its properties, including the `module` one), but overriding the `params` property, thus allowing to provide multiple "flavours" of the same framework. + +The frameworks defined in this example can then be used like any other framework as soon as both the framework module and the definition file are made available to the application: in our case, this is done by the creation of the integration modules under `examples/custom/extensions` and by exposing the definitions in `examples/custom/frameworks.yaml` thanks to the entry in `examples/custom/config.yaml`: +```yaml +frameworks: + definition_file: # this allows to add custom framework definitions (in {user}/frameworks.yaml) on top of the default ones. + - '{root}/resources/frameworks.yaml' + - '{user}/frameworks.yaml' +``` + +By pointing the `user_dir` to `examples/custom`, our `config.yaml` is also loaded, and we can use the new frameworks: +```bash +python runbenchmark.py gradientboosting -u examples/custom +python runbenchmark.py stacking -u examples/custom +python runbenchmark.py h2oautoml_blending -u examples/custom +``` + +*Note:* + +By default, when generating a docker image, the image name is created as `automlbenchmark/{framework}:{version}-{branch}` with the framework name in lowercase, and `branch` being the branch of the `automlbenchmark` app (usually `stable`). +However, it is possible to customize this image name as follow: +```yaml +MyFramework: + version: 1.0 + module: extensions.MyFramework + docker: + author: my_docker_repo + image: my_image + tag: my_tag +``` +which will result in the docker image name `my_docker_repo/my_image:my_tag-{branch}`, with `branch` still being the branch of the application. + + +### Framework integration + +If the framework definition allows to use the new framework from the application, the (not so) hard part is to integrate it. + +There are already several frameworks already integrated under `frameworks` directory (+ the examples under `examples/custom`), so the best starting point when adding a new framework is to first look at the existing ones. + +Among the existing frameworks, we can see different type of integrations: + +- trivial integration: these are frameworks running on Python and using dependencies (`numpy`, `sklearn`) already required by the application itself. These are not really AutoML toolkits, but rather integrations using `sklearn` to provide a reference when analyzing the results: cf. `constantpredictor`, `DecisionTree`. +- Python API integration: these are frameworks that can be run directly from Python: cf. `autosklearn`, `H2OAutoML`, `TPOT`, `RandomForest`, `TunedRandomForest`. + - contrary to the trivial integration, those require a `setup` phase. + - Most of them currently run using the same dependencies as the application, which is not recommended due to potential version conflicts (especially with `sklearn`). This was not a major constraint with the first frameworks implemented, but now, those integrations can and will be slightly changed to [run in their dedicated virtual environment], using their own dependencies: cf. `RandomForest` and `examples/custom/extensions/Stacking` for examples. +- non-Python frameworks: those frameworks typically run in `R` or `Java` and don't provide any Python API. 
The integration is then still done by spawning the `Java` or `R` process from the `exec.py`: cf. `AutoWEKA` or `ranger`, respectively. + +#### Recommended structure + +By convention, the integration is done using the following structure: + +```text +frameworks/autosklearn/ +|-- __init__.py +|-- exec.py +|-- requirements.txt +`-- setup.sh +``` + +Please note however, that this structure is not a requirement, the only requirement is the contract exposed by the integration module itself, i.e. by the `__init__.py` file. + +A simple `__init__.py` would look like this: + +```python +from amlb.utils import call_script_in_same_dir + + +def setup(*args, **kwargs): + call_script_in_same_dir(__file__, "setup.sh", *args, **kwargs) + + +def run(*args, **kwargs): + from .exec import run + return run(*args, **kwargs) + +``` + +where we see that the module should expose (only `run` is actually required) the following functions: + +- `setup` (optional): called by the application to setup the given framework, usually by simply running a `setup.sh` script that will be responsible for potentially creating a local virtual env, downloading and installing the dependencies. + The `setup` function can also receive the optional `setup_args` param from the [framework definition](#framework-definition) as an argument. +- `run`: called by the benchmark application to execute a task against the framework, using the selected dataset and constraints. We will describe the parameters in detail below, for now, just note that by convention, we just load the `exec.py` file from the module and delegate the execution to its `run` function. +- `docker_commands` (optional): called by the application to collect docker instructions that are specific to the framework. If the framework requires a `setup` phase, then the string returned by this function should at least ensure that the setup is also executed during the docker image creation, that's one reason why it is preferable to do all the setup in a `setup.sh` script, to allow the docker support above. + +#### Frameworks with Python API + +##### Frameworks requiring a dedicated virtual env + +For frameworks with Python API, we may worry about version conflicts between the packages used by the application (e.g. `sklearn`, `numpy`, `pandas`) and the ones required by the framework. + +In this case, the integration is slightly different as you can see with the `RandomForest` integration allowing to use any version of `sklearn`. + +This is the basic structure after the creation of the dedicated Python virtual environment during setup: +```text +frameworks/RandomForest/ +|-- __init__.py +|-- exec.py +|-- requirements.txt +|-- setup.sh +`-- venv/ + `-- (this local virtual env is created by the frameworks/shared/setup.sh) +``` + +Noticeable differences with a basic integration: + +- the `venv` is created in `setup.sh` by passing the current dir when sourcing the `shared/setup.sh` script: `. $HERE/../shared/setup.sh $HERE`. +- the `run` function in `__init__.py` prepares the data (in the application environment) before executing the `exec.py` in the dedicated `venv`. The call to `run_in_venv` is in charge of serializing the input, calling `exec.py` and deserializing + saving the results from `exec`. +- `exec.py`, when calls in the subprocess (function `__main__`), calls `call_run(run)` which deserializes the input (dataset + config) and passes it to the `run` function that just need to return a `result` object. 
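+
+To make this contract concrete, below is a minimal, hypothetical `exec.py` sketch for such a
+venv-based integration. The import path of the shared helpers (`call_run`, `result`) and the exact
+fields accepted by `result` follow the existing integrations (e.g. `frameworks/RandomForest/exec.py`),
+so double-check them there; the `DummyClassifier` merely stands in for the real AutoML framework call.
+
+```python
+from frameworks.shared.callee import call_run, result  # shared helpers used by existing integrations
+from sklearn.dummy import DummyClassifier  # stand-in for the actual AutoML framework
+
+
+def run(dataset, config):
+    # `dataset` and `config` are deserialized by `call_run` from the payload
+    # sent by `run_in_venv` in the module's `__init__.py`.
+    X_train, y_train = dataset.train.X, dataset.train.y
+    X_test, y_test = dataset.test.X, dataset.test.y
+
+    # Replace this with the real framework call, honouring config.max_runtime_seconds,
+    # config.cores, config.metric, config.seed, ...
+    model = DummyClassifier(strategy="prior").fit(X_train, y_train)
+    predictions = model.predict(X_test)
+
+    # `result` gathers what the application needs to score and save the predictions;
+    # see the existing integrations for the complete list of accepted fields.
+    return result(
+        output_file=config.output_predictions_file,
+        predictions=predictions,
+        truth=y_test,
+    )
+
+
+if __name__ == '__main__':
+    call_run(run)
+```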
+ +*Note A*: + +As the serialization/deserialization of `numpy` arrays can be costly for very large datasets, it is recommended to use dataset serialization only if the framework itself doesn't support loading datasets from files. + +This means that, in the `__init__.py` instead of implementing `run` as: +```python +data = dict( + train=dict( + X=dataset.train.X, + y=dataset.train.y + ), + test=dict( + X=dataset.test.X, + y=dataset.test.y + ) +) + +return run_in_venv(__file__, "exec.py", + input_data=data, dataset=dataset, config=config) +``` +it could simply expose the dataset paths (the application avoids loading the data if not explicitly needed by the framework): +```python +data = dict( + target=dict(name=dataset.target.name), + train=dict(path=dataset.train.path), + test=dict(path=dataset.test.path) +) +return run_in_venv(__file__, "exec.py", + input_data=data, dataset=dataset, config=config) +``` + +*Note B*: + +The serialization/deserialization of data between the main process and the framework process can be customized using the `options` parameter: +The allowed options for (de)serialization are defined by the object `amlb.utils.serialization.ser_config`. + +For example: +```python +data = dict( + train=dict( + X=dataset.train.X, + y=dataset.train.y + ), + test=dict( + X=dataset.test.X, + y=dataset.test.y + ) +) + +options = dict( + serialization=dict(sparse_dataframe_deserialized_format='dense') +) +return run_in_venv(__file__, "exec.py", + input_data=data, dataset=dataset, config=config, options=options) +``` + + + +#### Other Frameworks + +Integration of frameworks without any Python API is done in similar way, for example: + +```text +frameworks/AutoWEKA/ +|-- __init__.py +|-- exec.py +|-- requirements.txt +|-- setup.sh +`-- lib/ + `-- (this is where the framework dependencies go, usually created by setup.sh) +``` +or +```text +frameworks/ranger/ +|-- __init__.py +|-- exec.R +|-- exec.py +|-- requirements.txt +`-- setup.sh +``` + +Here are the main differences: +- the `setup` phase is identical, but if at runtime, some executable file or library is required that need to be installed locally (as opposed to globally: for example, `R` or `java` executable are usually installed globally), we just recommend to put everything under the integration module (for example in `lib` and/or `bin` subfolders as for `AutoWEKA`). This is also true for some Python frameworks (cf. `hyperoptsklearn` integration for example, where the modules are loaded from `frameworks/hyperoptsklearn/lib/hyperopt-sklearn`). +- the framework is then executed by building a command manually in `exec.py`, running it in a subprocess, and finally collecting the results generated by the subprocess. For example, in `ranger/exec.py`: + ```python + with Timer() as training: + run_cmd(("Rscript --vanilla -e \"" + "source('{script}'); " + "run('{train}', '{test}', '{output}', cores={cores}, meta_results_file='{meta_results}', task_type='{task_type}')" + "\"").format( + script=os.path.join(here, 'exec.R'), + train=dataset.train.path, + test=dataset.test.path, + output=config.output_predictions_file, + meta_results=meta_results_file, + task_type=config.type, + cores=config.cores + ), _live_output_=True) + ``` + Here, the `exec.R` script is also responsible to save the predictions in the expected format. 
+
+
+#### Add a default framework
+
+A "default framework" is an AutoML framework whose integration is available on the `master` branch under the `frameworks` folder, and which has a simple definition in `resources/frameworks.yaml`.
+
+*NOTE:*
+There are a few requirements when integrating a new default framework:
+
+- The code snippet triggering the training should use only defaults (no AutoML hyperparameters), plus possibly a generic `**kwargs` in order to support the `params` section in custom framework definitions. In other words, one of the requirements for being included in the benchmark is that the framework is submitted without any tweaks to default settings. This is to prevent submissions (systems) from overfitting or tuning to the benchmark.
+- There must be a way to limit the runtime of the algorithm (a maximum runtime parameter).
+- Exceptions:
+  - the problem type ("classification", "regression", "binary", "multiclass"): this is available through `config.type` or `dataset.type`.
+  - information about data, for example the column types: available through the `dataset` object.
+  - time, CPU and memory constraints: those must be provided by the benchmark application through the `config` object.
+  - the objective function: provided by `config.metric` (usually requires a translation for a given framework).
+  - seed: provided by `config.seed`.
+  - paths to folders (output, temporary...): if possible, use `config.output_dir` or a subfolder (see existing integrations).
+- The default framework definition in `resources/frameworks.yaml` shouldn't have any `params` section: this `params` section is intended for custom definitions, not default ones.
+```yaml
+good_framework:
+  version: "0.0.1"
+  project: "http://go.to/good_framework"
+
+bad_framework:
+  version: "0.0.1"
+  project: "http://go.to/bad_framework"
+  params:
+    enable_this: true
+    use: ['this', 'that']
+```
+
+Using the instructions above:
+
+ 1. verify that there is an issue created for the framework you want to add, or create one.
+ 1. create a private branch for your integration changes.
+ 1. create the framework module (e.g. `MyFramework`) under the `frameworks` folder.
+ 1. define the module (if possible without any `params`) in `resources/frameworks.yaml`.
+ 1. try to set up the framework:
+    > python runbenchmark.py myframework -s only
+ 1. fix the framework setup until it works: since the setup is usually a simple `setup.sh` script, you should be able to test it directly without using the application.
+ 1. try to run a simple test against one fold using defaults (`test` benchmark and `test` constraints) with the `-Xtest_mode` flag, which triggers additional validations:
+    > python runbenchmark.py myframework -f 0 -Xtest_mode
+ 1. fix the module integration code until the test produces all results with no error (if the integration generated an error, it is visible in the results).
+ 1. if this works, validate it against the `validation` benchmark using one fold:
+    > python runbenchmark.py myframework validation 1h4c -f 0 -Xtest_mode
+ 1. if this works, try to run it in docker to validate the docker image setup:
+    > python runbenchmark.py myframework -m docker
+ 1. if this works, try to run it in AWS:
+    > python runbenchmark.py myframework -m aws
+ 1. add a brief description of the framework to the documentation in [docs/website/frameworks.html](GITHUB/docs/website/frameworks.html) following the same formatting as the other entries.
+ 1. 
create a pull request, and ask a review from authors of `automlbenchmark`: they'll also be happy to help you during this integration. + +#### Add a custom framework + +You may want to integrate a framework without wanting to make this publicly available. + +In this case, as we've seen above, there's always the possibility to integrate your framework in a custom `user_dir`. + +Using the instructions above: + + 1. define what is (or will be) your custom `user_dir` for this framework. + 1. ensure it contains a `config.yaml`, otherwise create one (for example copy [this one](../using/configuration.md#custom-configurations) or `examples/custom/config.yaml`). + 1. create the framework module somewhere under this `user_dir`, e.g. `{user_dir}/extensions/MyFramework`. + 1. define the module in `{user_dir}/frameworks.yaml` (create the file if needed). + 1. follow the same steps as for a "default" framework to implement the integration: setup, test, ... except that you always need to specify the `user_dir`, e.g. for testing: + > python runbenchmark.py myframework -f 0 -u {user_dir} + 1. there may be some issues when trying to build the docker image when the framework is in a custom folder, as all the files should be under the docker build context: solving this probably requires a multi-stage build, needs more investigation. For now, if you really need a docker image, you can either build it manually, or simply copy the `extensions` folder temporarily under `automlbenchmark`. + 1. even without docker image, you can run the framework on AWS, as soon as the custom `config.yaml`, `frameworks.yaml` and `extensions` folder are made available as AWS resources: cf. again the [custom configuration](../using/configuration.md#custom-configurations). The application will copy those files to the EC2 instances into a local `user_dir` and will be able to setup the framework there. + + +## Using a Different Hyperparameter Configuration + +When you want to use an existing framework integration with a different hyperparameter +configuration, it is often enough to write only a custom framework definition without +further changes. + +Framework definitions accept a `params` dictionary for pass-through parameters, +i.e., parameters that are directly accessible from the `exec.py` file in the framework +integration executing the AutoML training. *Most* integration scripts use this to +overwrite any (default) hyperparameter value. Use the `extends` field to indicate +which framework definition to copy default values from, and then add any fields to +overwrite. In the example below the `n_estimators` and `verbose` params are passed +directly to the `RandomForestClassifier`, which will now train only 200 trees +(default is 2000): + +```yaml +RandomForest_custom: + extends: RandomForest + params: + n_estimators: 200 + verbose: true +``` + +This new definition can be used as normal: +``` +python runbenchmark.py randomforest_custom ... +``` + +!!! note + By convention, param names starting with `_` are filtered out (they are not passed + to the framework) but are used for custom logic in the `exec.py`. For example, the + `_save_artifact` field is often used to allow additional artifacts, such as logs or + models, to be saved. diff --git a/docs/extending/index.md b/docs/extending/index.md new file mode 100644 index 000000000..54f1a4939 --- /dev/null +++ b/docs/extending/index.md @@ -0,0 +1,8 @@ +# Extending the Benchmark Tool + +You can extend the benchmark tool in multiple ways. 
+[Benchmarks](benchmark.md) define collections of tasks on which to evaluate AutoML
+frameworks. [Constraints](constraint.md) specify the resource constraints forwarded
+to the AutoML framework, such as a time or memory limit. Finally, it is possible to
+[add AutoML frameworks](framework.md#add-a-custom-framework) or to
+[use an integrated AutoML framework with non-default configuration](framework.md#using-a-different-hyperparameter-configuration).
\ No newline at end of file
diff --git a/docs/faq.md b/docs/faq.md
new file mode 100644
index 000000000..481bb4b0d
--- /dev/null
+++ b/docs/faq.md
@@ -0,0 +1,40 @@
+# Frequently Asked Questions
+
+If your question is not answered here, please check our Github [issue tracker](https://github.com/openml/automlbenchmark/issues) and [discussion board](https://github.com/openml/automlbenchmark/discussions).
+If you still cannot find an answer, please [open a Q&A discussion on Github](https://github.com/openml/automlbenchmark/discussions/new?category=q-a).
+
+## (When) will you add framework X?
+
+We are currently not focused on integrating additional AutoML systems.
+However, we process any pull requests that add frameworks and will assist with the integration.
+The best way to make sure framework X gets included is to start with the integration
+yourself or encourage the package authors to do so. For technical details see
+[Adding an AutoML Framework](./extending/framework.md).
+
+It is also possible to open a Github issue indicating the framework you would like added.
+Please use a clear title (e.g. "Add framework: X") and provide some relevant information
+(e.g. a link to the documentation).
+This helps us keep track of which frameworks people are interested in seeing included.
+
+
+## Framework setup is not executed
+First, it is important to note that we only officially support Ubuntu 22.04 LTS,
+though other versions and MacOS generally work too. If that does not work, for
+example on Windows, use docker mode as per [the installation instructions](getting_started.md#installation).
+For MacOS, it may be required to have [brew](https://brew.sh) installed.
+
+If you are experiencing issues with the framework setup not being executed, please
+try the following steps before opening an issue:
+
+ - delete the `.marker_setup_safe_to_delete` file from the framework module and try to run
+   the benchmark again. This marker file is automatically created after a successful
+   setup to avoid having to execute it each time (the setup phase can be time-consuming);
+   the marker then prevents auto-setup, unless the `-s only` or `-s force` args described below are used.
+
+ - force the setup using the `--setup=only` arg on the command line. This forces the
+   setup to take place. If the setup is now done correctly, you can run the commands
+   as normal to start the benchmark. If not, continue.
+
+ - manually clean the installation files by deleting the `lib`, `venv` and `.setup` folders
+   in the given framework folder (e.g. `frameworks/MyFramework`), and try again.
+
diff --git a/docs/getting_started.md b/docs/getting_started.md
new file mode 100644
index 000000000..e1d8d02dc
--- /dev/null
+++ b/docs/getting_started.md
@@ -0,0 +1,267 @@
+---
+title: Getting Started
+description: A short tutorial on installing the software and running a simple benchmark.
+---
+
+# Getting Started
+
+The AutoML Benchmark is a tool for benchmarking AutoML frameworks on tabular data.
+It automates the installation of AutoML frameworks, passing them data, and evaluating
+their predictions.
+[Our paper](https://arxiv.org/pdf/2207.12560.pdf) describes the design and showcases +results from an evaluation using the benchmark. +This guide goes over the minimum steps needed to evaluate an +AutoML framework on a toy dataset. + +## Installation +These instructions assume that [Python 3.9 (or higher)](https://www.python.org/downloads/) +and [git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git) are installed, +and are available under the alias `python` and `git`, respectively. We recommend +[Pyenv](https://github.com/pyenv/pyenv) for managing multiple Python installations, +if applicable. We support Ubuntu 22.04, but many linux and MacOS versions likely work +(for MacOS, it may be necessary to have [`brew`](https://brew.sh) installed). + +First, clone the repository: + +```bash +git clone https://github.com/openml/automlbenchmark.git --branch stable --depth 1 +cd automlbenchmark +``` + +Create a virtual environments to install the dependencies in: + +=== ":simple-linux: Linux" + + ```bash + python -m venv venv + source venv/bin/activate + ``` + +=== ":material-apple: MacOS" + + ```bash + python -m venv venv + source venv/bin/activate + ``` + +=== ":simple-windows: Windows" + + ```bash + python -m venv ./venv + venv/Scripts/activate + ``` + +Then install the dependencies: + +```bash +python -m pip install --upgrade pip +python -m pip install -r requirements.txt +``` + + +??? windows "Note for Windows users" + + The automated installation of AutoML frameworks is done using shell script, + which doesn't work on Windows. We recommend you use + [Docker](https://docs.docker.com/desktop/install/windows-install/) to run the + examples below. First, install and run `docker`. + Then, whenever there is a `python runbenchmark.py ...` + command in the tutorial, add `-m docker` to it (`python runbenchmark.py ... -m docker`). + +??? question "Problem with the installation?" + + On some platforms, we need to ensure that requirements are installed sequentially. + Use `xargs -L 1 python -m pip install < requirements.txt` to do so. If problems + persist, [open an issue](https://github.com/openml/automlbenchmark/issues/new) with + the error and information about your environment (OS, Python version, pip version). + + +## Running the Benchmark + +To run a benchmark call the `runbenchmark.py` script specifying the framework to evaluate. +See [integrated frameworks](WEBSITE/frameworks.html) for a list of supported frameworks, or the [adding a frameworking](extending/framework.md) page on how to add your own. + +### Example: a test run with Random Forest +Let's try evaluating the `RandomForest` baseline, which uses [scikit-learn](https://scikit-learn.org/stable/)'s random forest: + +=== ":simple-linux: Linux" + + ```bash + python runbenchmark.py randomforest + ``` + +=== ":material-apple: MacOS" + + ```bash + python runbenchmark.py randomforest + ``` + +=== ":simple-windows: Windows" + As noted above, we need to install the AutoML frameworks (and baselines) in + a container. Add `-m docker` to the command as shown: + ```bash + python runbenchmark.py randomforest -m docker + ``` + + !!! warning "Important" + Future example usages will only show invocations without `-m docker` mode, + but Windows users will need to run in some non-local mode. + +After running the command, there will be a lot of output to the screen that reports +on what is currently happening. 
After a few minutes final results are shown and should +look similar to this: + +``` +Summing up scores for current run: + id task fold framework constraint result metric duration seed +openml.org/t/3913 kc2 0 RandomForest test 0.865801 auc 11.1 851722466 +openml.org/t/3913 kc2 1 RandomForest test 0.857143 auc 9.1 851722467 + openml.org/t/59 iris 0 RandomForest test -0.120755 neg_logloss 8.7 851722466 + openml.org/t/59 iris 1 RandomForest test -0.027781 neg_logloss 8.5 851722467 +openml.org/t/2295 cholesterol 0 RandomForest test -44.220800 neg_rmse 8.7 851722466 +openml.org/t/2295 cholesterol 1 RandomForest test -55.216500 neg_rmse 8.7 851722467 +``` + +The result denotes the performance of the framework on the test data as measured by +the metric listed in the metric column. The result column always denotes performance +in a way where higher is better (metrics which normally observe "lower is better" are +converted, which can be observed from the `neg_` prefix). + +While running the command, the AutoML benchmark performed the following steps: + + 1. Create a new virtual environment for the Random Forest experiment. + This environment can be found in `frameworks/randomforest/venv` and will be re-used + when you perform other experiments with `RandomForest`. + 2. It downloaded datasets from [OpenML](https://www.openml.org) complete with a + "task definition" which specifies [cross-validation](https://scikit-learn.org/stable/modules/cross_validation.html) folds. + 3. It evaluated `RandomForest` on each (task, fold)-combination in a separate subprocess, where: + 1. The framework (`RandomForest`) is initialized. + 2. The training data is passed to the framework for training. + 3. The test data is passed to the framework to make predictions on. + 4. It passes the predictions back to the main process + 4. The predictions are evaluated and reported on. They are printed to the console and + are stored in the `results` directory. There you will find: + 1. `results/results.csv`: a file with all results from all benchmarks conducted on your machine. + 2. `results/randomforest.test.test.local.TIMESTAMP`: a directory with more information about the run, + such as logs, predictions, and possibly other artifacts. + +!!! info "Docker Mode" + + When using docker mode (with `-m docker`) a docker image will be made that contains + the virtual environment. Otherwise, it functions much the same way. + +### Important Parameters + +As you can see from the results above, the default behavior is to execute a short test +benchmark. However, we can specify a different benchmark, provide different constraints, +and even run the experiment in a container or on AWS. There are many parameters +for the `runbenchmark.py` script, but the most important ones are: + +`Framework (required)` + +: The AutoML framework or baseline to evaluate and is not case-sensitive. See + [integrated frameworks](WEBSITE/frameworks.html) for a list of supported frameworks. + In the above example, this benchmarked framework `randomforest`. + +`Benchmark (optional, default='test')` + +: The benchmark suite is the dataset or set of datasets to evaluate the framework on. + These can be defined as on [OpenML](https://www.openml.org) as a [study or task](extending/benchmark.md#defining-a-benchmark-on-openml) + (formatted as `openml/s/X` or `openml/t/Y` respectively) or in a [local file](extending/benchmark.md#defining-a-benchmark-with-a-file). + The default is a short evaluation on two folds of `iris`, `kc2`, and `cholesterol`. 
+ +`Constraints (optional, default='test')` + +: The constraints applied to the benchmark as defined by default in [constraints.yaml](GITHUB/resources/constraints.yaml). + These include time constraints, memory constrains, the number of available cpu cores, and more. + Default constraint is `test` (2 folds for 10 min each). + + !!! warning "Constraints are not enforced!" + These constraints are forwarded to the AutoML framework if possible but, except for + runtime constraints, are generally not enforced. It is advised when benchmarking + to use an environment that mimics the given constraints. + + ??? info "Constraints can be overriden by `benchmark`" + A benchmark definition can override constraints on a task level. + This is useful if you want to define a benchmark which has different constraints + for different tasks. The default "test" benchmark does this to limit runtime to + 60 seconds instead of 600 seconds, which is useful to get quick results for its + small datasets. For more information, see [defining a benchmark](#ADD-link-to-adding-benchmark). + +`Mode (optional, default='local')` + +: The benchmark can be run in four modes: + + * `local`: install a local virtual environment and run the benchmark on your machine. + * `docker`: create a docker image with the virtual environment and run the benchmark in a container on your machine. + If a local or remote image already exists, that will be used instead. Requires [Docker](https://docs.docker.com/desktop/). + * `singularity`: create a singularity image with the virtual environment and run the benchmark in a container on your machine. Requires [Singularity](https://docs.sylabs.io/guides/3.5/user-guide/introduction.html). + * `aws`: run the benchmark on [AWS EC2](https://aws.amazon.com/free/?trk=b3f93e34-c1e0-4aa9-95f8-6d2c36891d8a&sc_channel=ps&ef_id=CjwKCAjw-7OlBhB8EiwAnoOEk0li05IUgU9Ok2uCdejP22Yr7ZuqtMeJZAdxgL5KZFaeOVskCAsknhoCSjUQAvD_BwE:G:s&s_kwcid=AL!4422!3!649687387631!e!!g!!aws%20ec2!19738730094!148084749082&all-free-tier.sort-by=item.additionalFields.SortRank&all-free-tier.sort-order=asc&awsf.Free%20Tier%20Types=*all&awsf.Free%20Tier%20Categories=*all) instances. + It is possible to run directly on the instance or have the EC2 instance run in `docker` mode. + Requires valid AWS credentials to be configured, for more information see [Running on AWS](#ADD-link-to-aws-guide). + + +For a full list of parameters available, run: + +``` +python runbenchmark.py --help +``` + +### Example: AutoML on a specific task and fold + +The defaults are very useful for performing a quick test, as the datasets are small +and cover different task types (binary classification, multiclass classification, and +regression). We also have a ["validation" benchmark](GITHUB/resources/benchmarks/validation.yaml) +suite for more elaborate testing that also includes missing data, categorical data, +wide data, and more. The benchmark defines 9 tasks, and evaluating two folds with a +10-minute time constraint would take roughly 3 hours (=9 tasks * 2 folds * 10 minutes, +plus overhead). Let's instead use the `--task` and `--fold` parameters to run only a +specific task and fold in the `benchmark` when evaluating the +[flaml](https://microsoft.github.io/FLAML/) AutoML framework: + +``` +python runbenchmark.py flaml validation test -t eucalyptus -f 0 +``` + +This should take about 10 minutes plus the time it takes to install `flaml`. 
+Results should look roughly like this: + +``` +Processing results for flaml.validation.test.local.20230711T122823 +Summing up scores for current run: + id task fold framework constraint result metric duration seed +openml.org/t/2079 eucalyptus 0 flaml test -0.702976 neg_logloss 611.0 1385946458 +``` + +Similarly to the test run, you will find additional files in the `results` directory. + + +### Example: Benchmarks on OpenML + +In the previous examples, we used benchmarks which were defined in a local file +([test.yaml](GITHUB/resources/benchmarks/test.yaml) and +[validation.yaml](GITHUB/resources/benchmarks/validation.yaml), respectively). +However, we can also use tasks and +benchmarking suites defined on OpenML directly from the command line. When referencing +an OpenML task or suite, we can use `openml/t/ID` or `openml/s/ID` respectively as +argument for the benchmark parameter. Running on the [iris task](https://openml.org/t/59): + +``` +python runbenchmark.py randomforest openml/t/59 +``` + +or on the entire [AutoML benchmark classification suite](https://openml.org/s/271) (this will take hours!): + +``` +python runbenchmark.py randomforest openml/s/271 +``` + +!!! info "Large-scale Benchmarking" + + For large scale benchmarking it is advised to parallelize your experiments, + as otherwise it may take months to run the experiments. + The benchmark currently only supports native parallelization in `aws` mode + (by using the `--parallel` parameter), but using the `--task` and `--fold` parameters + it is easy to generate scripts that invoke individual jobs on e.g., a SLURM cluster. + When you run in any parallelized fashion, it is advised to run each process on + separate hardware to ensure experiments can not interfere with each other. diff --git a/docs/index.md b/docs/index.md index d884d6b24..82ea1189c 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,18 +1,21 @@ ---- -layout: index -title: Home ---- +# AutoML Benchmark -# An Open Source AutoML Benchmark +These are the AutoML Benchmark documentation pages with information on how to +configure and use the AutoML Benchmark tool. For first time users, we advise +visiting the [getting started](getting_started.md) page. -This the homepage for the open and extensible AutoML Benchmark. -The AutoML Benchmark provides an overview and comparison of open-source AutoML systems. -It is *open* because the benchmark infrastructure is [open-source](https://github.com/openml/automlbenchmark/) -and *extensible* because you can [add your own](extending.md) problems and datasets. +This documentation is accompanied by [our website](WEBSITE) +which has information on [our papers](WEBSITE/papers.html), +[integrated frameworks](WEBSITE/frameworks.html), +and [evaluation results](WEBSITE/results.html). -A brief overview and further references for each AutoML system can be found on the [AutoML systems](automl_overview.md) page. -For a thorough explanation of the benchmark, and evaluation of results, you can read our [paper](paper.md). -If you want to analyze the results yourself, you can do this on the [results](results.md) pages. +!!! note "Help Wanted!" -Because the benchmark infrastructure is open-source, you can rerun the benchmark yourself, use custom datasets or your own AutoML platform as explained in our [project documentation](documentation.md). -We also invite you to [submit your own AutoML](documentation.md) system to be evaluated against the benchmark and included in the overview. 
+ We recently switched to generating doc pages with `mkdocs-material`. In the process, + we did our best to make sure to use the additional functionalities to better + present the information and make it easy to find through notes, tabs, and other features. + + It is possible to find parts of the documentation are not updated (correctly), or + are not clear. We welcome all help to improve the documentation. If you have a + suggestion on how to improve the documentation, please open an issue. If you find + an error, please open an issue or open a pull request directly. Thanks! :pray: \ No newline at end of file diff --git a/docs/modifications.md b/docs/modifications.md deleted file mode 100644 index c90dff209..000000000 --- a/docs/modifications.md +++ /dev/null @@ -1,42 +0,0 @@ ---- ---- -# Required Modifications - -Each method is given, unless otherwise specified or unavailable, information about resources: -* Memory -* Runtime -* Number of cores - -And, additionally: -* Metric to optimize - -## auto-sklearn - -### Data preprocessing -Encode string data to numeric (labelencoding). - -### Non-default arguments - -## Auto-WEKA -logloss metric is specified as kBInformation. - -### Data preprocessing -None, ARFF file used directly. -Output is rewritten so it fits `docker/common/evaluate.py` expectations. - -### Non-default arguments - -## H2O AutoML - -### Data preprocessing -None, ARFF file used directly. - -### Non-default arguments - -## TPOT - -### Data preprocessing -Encode string data to numeric (labelencoding). - -### Non-default arguments - diff --git a/docs/paper.md b/docs/paper.md deleted file mode 100644 index 8011f3628..000000000 --- a/docs/paper.md +++ /dev/null @@ -1,17 +0,0 @@ ---- -layout: category -title: Paper -sidebar_sort_order: 9 ---- -[PDF](https://www.automl.org/wp-content/uploads/2019/06/automlws2019_Paper45.pdf) \| [arXiv](https://arxiv.org/abs/1907.00909) \| [BibTeX](bib_workshop.md) - -> First look of the benchmark submitted to [AutoML Workshop at ICML 2019](https://sites.google.com/view/automl2019icml). - -**abstract:** In recent years, an active field of research has developed around automated machine learning(AutoML). -Unfortunately, comparing different AutoML systems is hard and often done in correctly. -We introduce an open, ongoing, and extensible benchmark framework which follows best practices and avoids common mistakes. -The framework is open-source, uses public datasets and has a website with up-to-date results. -We use the framework to conduct a thorough comparison of 4 AutoML systems across 39 datasets and analyze the results. - ---- - diff --git a/docs/results.md b/docs/results.md deleted file mode 100644 index 7c9aa541a..000000000 --- a/docs/results.md +++ /dev/null @@ -1,40 +0,0 @@ ---- -layout: page -title: Results -sidebar_link: true -sidebar_sort_order: 1 ---- - -### Complete Results -[Complete results][reports] are also available in [csv] format or as simple [visualizations] for now. -We hope to provide interactive visualization in the future. - -### Binary Results -A sample of the results obtained by running each framework over 10 folds for various durations each: for binary tasks, the plotted metric is AUC. -Smaller and medium datasets were trained for 1h and 4h. Larger datasets have been trained for 4h and 8h. 
-
-![Binary Results Stripplot 1h][binary_1h]
-
-![Binary Results Stripplot 4h][binary_4h]
-
-![Binary Results Stripplot 8h][binary_8h]
-
-### Multiclass Results
-A sample of the results obtained by running each framework over 10 folds for various durations each: for multiclass tasks, the plotted metric is logloss.
-Smaller and medium datasets were trained for 1h and 4h. Larger datasets have been trained for 4h and 8h.
-
-![Multiclass Results Stripplot 1h][multiclass_1h]
-
-![Multiclass Results Stripplot 4h][multiclass_4h]
-
-![Multiclass Results Stripplot 8h][multiclass_8h]
-
-[binary_1h]:https://raw.github.com/openml/automlbenchmark/master/reports/graphics/1h/binary_results_stripplot.png
-[multiclass_1h]:https://raw.github.com/openml/automlbenchmark/master/reports/graphics/1h/multiclass_results_stripplot.png
-[binary_4h]:https://raw.github.com/openml/automlbenchmark/master/reports/graphics/4h/binary_results_stripplot.png
-[multiclass_4h]:https://raw.github.com/openml/automlbenchmark/master/reports/graphics/4h/multiclass_results_stripplot.png
-[binary_8h]:https://raw.github.com/openml/automlbenchmark/master/reports/graphics/8h/binary_results_stripplot.png
-[multiclass_8h]:https://raw.github.com/openml/automlbenchmark/master/reports/graphics/8h/multiclass_results_stripplot.png
-[reports]:https://github.com/openml/automlbenchmark/tree/master/reports
-[csv]:https://github.com/openml/automlbenchmark/tree/master/reports/tables
-[visualizations]:https://github.com/openml/automlbenchmark/tree/master/reports/graphics
\ No newline at end of file
diff --git a/docs/stylesheets/extra.css b/docs/stylesheets/extra.css
new file mode 100644
index 000000000..7727c4398
--- /dev/null
+++ b/docs/stylesheets/extra.css
@@ -0,0 +1,25 @@
+:root {
+    --md-primary-fg-color: #1971c2;
+    --md-primary-fg-color--light: white;
+    --md-primary-fg-color--dark: #90030C;
+
+    --md-admonition-icon--windows: url('data:image/svg+xml;charset=utf-8,')
+}
+.md-typeset .admonition.windows,
+.md-typeset details.windows {
+    border-color: rgb(25, 113, 194);
+}
+.md-typeset .windows > .admonition-title,
+.md-typeset .windows > summary {
+    background-color: rgba(25, 113, 194, 0.1);
+}
+.md-typeset .windows > .admonition-title::before,
+.md-typeset .windows > summary::before {
+    background-color: rgb(25, 113, 194);
+    -webkit-mask-image: var(--md-admonition-icon--windows);
+    mask-image: var(--md-admonition-icon--windows);
+}
+
+.md-typeset .limit_max_height code {
+    max-height: 20rem;
+}
\ No newline at end of file
diff --git a/docs/using/aws.md b/docs/using/aws.md
new file mode 100644
index 000000000..1e9c76a2e
--- /dev/null
+++ b/docs/using/aws.md
@@ -0,0 +1,162 @@
+# AWS
+
+The AutoML benchmark supports running experiments on [AWS EC2](https://aws.amazon.com/ec2/).
+
+!!! danger "AMLB does not limit expenses!"
+
+    The AWS integration lets you easily conduct massively parallel evaluations.
+    The AutoML Benchmark does not in any way restrict the _total_ costs you can incur on AWS.
+    However, there are some tips for [reducing costs](#reducing-costs).
+
+    ??? danger "Example Costs"
+
+        For example, benchmarking one framework on the classification and regression suites
+        on a one hour budget takes 1 hour * 10 folds * 100 datasets = 1,000 hours, plus
+        overhead. Even when using spot instance pricing on `m5.2xlarge` instances (the default),
+        this probably costs at least $100 US (costs depend on overhead and fluctuating prices).
+        A full evaluation with multiple frameworks and/or time budgets can cost
+        thousands of dollars.
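+
+A quick back-of-the-envelope calculation can help before launching a large run. The sketch below is illustrative only: the overhead factor and the spot price are assumptions, so check current EC2 pricing for your region.
+
+```python
+# Rough cost estimate for one framework on a 1h budget (illustrative numbers only).
+frameworks = 1
+datasets = 100
+folds = 10
+hours_per_job = 1.0
+overhead_factor = 1.2          # assumed extra time for setup, data transfer, retries
+spot_price_per_hour = 0.15     # assumed m5.2xlarge spot price in USD; check current pricing
+
+instance_hours = frameworks * datasets * folds * hours_per_job * overhead_factor
+estimated_cost = instance_hours * spot_price_per_hour
+print(f"~{instance_hours:.0f} instance-hours, ~${estimated_cost:.0f} (excluding storage and transfer)")
+```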
+
+
+## Setup
+
+To run a benchmark on AWS, you additionally need a configured AWS account.
+The application uses the [boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html)
+Python package to exchange files through S3 and create EC2 instances.
+
+If this is your first time setting up your AWS account on the machine that will run the
+`automlbenchmark` app, you can use the [AWS CLI](http://aws.amazon.com/cli/) tool and run:
+    ```bash
+    aws configure
+    ```
+You will need your AWS Access Key ID and AWS Secret Access Key, and you will need to pick a default [EC2 region](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-regions-availability-zones.html#concepts-available-regions).
+
+!!! note "Selecting a Region"
+    To use a region, an AMI must be configured in the AutoML benchmark configuration file
+    under `aws.ec2.regions`. The default configuration has AMIs for `us-east-1`,
+    `us-east-2`, `us-west-1`, `eu-west-1`, and `eu-central-1`. If your default EC2
+    region is different from these, you will need to add the AMI to your [custom configuration](configuration.md#custom-configurations).
+
+On first use, it is recommended to use the following configuration file, or to extend
+your custom configuration file with these options. Follow the instructions in the file
+and make any necessary adjustments before running the benchmark.
+
+```yaml title="Starting AWS Configuration"
+--8<-- "examples/aws/config.yaml"
+```
+
+To run a test to see if the benchmark framework is working on AWS, do the following:
+
+```bash
+python3 runbenchmark.py constantpredictor test -m aws
+```
+
+This will create and start an EC2 instance for each benchmark job and run the 6 jobs
+(3 OpenML tasks * 2 folds) from the `test` benchmark sequentially.
+In this case, each job is constrained to a one-minute time limit, excluding setup
+time for the EC2 instances (though `constantpredictor` will likely only take seconds).
+
+For longer benchmarks, you'll probably want to run multiple jobs in parallel and
+distribute the work to several EC2 instances, for example:
+```bash
+python3 runbenchmark.py autosklearn validation 1h4c -m aws -p 4
+```
+will keep 4 EC2 instances running, monitor them in a dedicated thread, and finally collect all outputs from S3.
+
+??? info "EC2 instances are always stopped eventually (by default)"
+
+    Each EC2 instance is given a time limit at startup to ensure that the instance is
+    stopped in any case, even if there is an issue while running the benchmark task.
+    In that case the instance is stopped, not terminated, and we can therefore inspect
+    the machine manually (ideally after resetting its UserData field to avoid
+    re-triggering the benchmark on the next startup).
+
+The console output still shows the instances starting, reports the progress, and then
+the results for each dataset/fold combination (the log excerpt below is from a different command):
+
+```{.text .limit_max_height title="Example output benchmarking H2O on AWS"}
+Running `H2OAutoML_nightly` on `validation` benchmarks in `aws` mode!
+Loading frameworks definitions from ['/Users/me/repos/automlbenchmark/resources/frameworks.yaml'].
+Loading benchmark definitions from /Users/me/repos/automlbenchmark/resources/benchmarks/validationt.yaml.
+Uploading `/Users/me/repos/automlbenchmark/resources/benchmarks/validation.yaml` to `ec2/input/validation.yaml` on s3 bucket automl-benchmark.
+...
+Starting new EC2 instance with params: H2OAutoML_nightly /s3bucket/input/validation.yaml -t micro-mass -f 0 +Started EC2 instance i-0cd081efc97c3bf6f +[2019-01-22T11:51:32] checking job aws_validation_micro-mass_0_H2OAutoML_nightly on instance i-0cd081efc97c3bf6f: pending +Starting new EC2 instance with params: H2OAutoML_nightly /s3bucket/input/validation.yaml -t micro-mass -f 1 +Started EC2 instance i-0251c1655e286897c +... +[2019-01-22T12:00:32] checking job aws_validation_micro-mass_1_H2OAutoML_nightly on instance i-0251c1655e286897c: running +[2019-01-22T12:00:33] checking job aws_validation_micro-mass_0_H2OAutoML_nightly on instance i-0cd081efc97c3bf6f: running +[2019-01-22T12:00:48] checking job aws_validation_micro-mass_1_H2OAutoML_nightly on instance i-0251c1655e286897c: running +[2019-01-22T12:00:48] checking job aws_validation_micro-mass_0_H2OAutoML_nightly on instance i-0cd081efc97c3bf6f: running +... +[ 731.511738] cloud-init[1521]: Predictions saved to /s3bucket/output/predictions/h2oautoml_nightly_micro-mass_0.csv +[ 731.512132] cloud-init[1521]: H2O session _sid_96e7 closed. +[ 731.512506] cloud-init[1521]: Loading predictions from /s3bucket/output/predictions/h2oautoml_nightly_micro-mass_0.csv +[ 731.512890] cloud-init[1521]: Metric scores: {'framework': 'H2OAutoML_nightly', 'version': 'nightly', 'task': 'micro-mass', 'fold': 0, 'mode': 'local', 'utc': '2019-01-22T12:00:02', 'logloss': 0.6498889633819804, 'acc': 0.8793103448275862, 'result': 0.6498889633819804} +[ 731.513275] cloud-init[1521]: Job local_micro-mass_0_H2OAutoML_nightly executed in 608.534 seconds +[ 731.513662] cloud-init[1521]: All jobs executed in 608.534 seconds +[ 731.514089] cloud-init[1521]: Scores saved to /s3bucket/output/scores/H2OAutoML_nightly_task_micro-mass.csv +[ 731.514542] cloud-init[1521]: Loaded scores from /s3bucket/output/scores/results.csv +[ 731.515006] cloud-init[1521]: Scores saved to /s3bucket/output/scores/results.csv +[ 731.515357] cloud-init[1521]: Summing up scores for current run: +[ 731.515782] cloud-init[1521]: task framework ... acc logloss +[ 731.516228] cloud-init[1521]: 0 micro-mass H2OAutoML_nightly ... 0.87931 0.649889 +[ 731.516671] cloud-init[1521]: [1 rows x 9 columns] +... +EC2 instance i-0cd081efc97c3bf6f is stopped +Job aws_validation_micro-mass_0_H2OAutoML_nightly executed in 819.305 seconds +[2019-01-22T12:01:34] checking job aws_validation_micro-mass_1_H2OAutoML_nightly on instance i-0251c1655e286897c: running +[2019-01-22T12:01:49] checking job aws_validation_micro-mass_1_H2OAutoML_nightly on instance i-0251c1655e286897c: running +EC2 instance i-0251c1655e286897c is stopping +Job aws_validation_micro-mass_1_H2OAutoML_nightly executed in 818.463 seconds +... +Terminating EC2 instances i-0251c1655e286897c +Terminated EC2 instances i-0251c1655e286897c with response {'TerminatingInstances': [{'CurrentState': {'Code': 32, 'Name': 'shutting-down'}, 'InstanceId': 'i-0251c1655e286897c', 'PreviousState': {'Code': 64, 'Name': 'stopping'}}], 'ResponseMetadata': {'RequestId': 'd09eeb0c-7a58-4cde-8f8b-2308a371a801', 'HTTPStatusCode': 200, 'HTTPHeaders': {'content-type': 'text/xml;charset=UTF-8', 'transfer-encoding': 'chunked', 'vary': 'Accept-Encoding', 'date': 'Tue, 22 Jan 2019 12:01:53 GMT', 'server': 'AmazonEC2'}, 'RetryAttempts': 0}} +Instance i-0251c1655e286897c state: shutting-down +All jobs executed in 2376.891 seconds +Deleting uploaded resources `['ec2/input/validation.yaml', 'ec2/input/config.yaml', 'ec2/input/frameworks.yaml']` from s3 bucket automl-benchmark. 
+``` + + +## Configurable AWS Options + +When using AWS mode, the application will use `on-demand` EC2 instances from the `m5` +series by default. However, it is also possible to use `Spot` instances, specify a +`max_hourly_price`, or customize your experience when using this mode in general. +All configuration points are grouped and documented under the `aws` yaml namespace in +the main [config](GITHUB/resources/config.yaml) file. +When setting your own configuration, it is strongly recommended to first create your +own `config.yaml` file as described in [Custom configuration](configuration.md#custom-configurations). +Here is an example of a config file using Spot instances on a non-default region: +```yaml + +aws: + region: 'us-east-1' + resource_files: + - '{user}/config.yaml' + - '{user}/frameworks.yaml' + + ec2: + subnet_id: subnet-123456789 # subnet for account on us-east-1 region + spot: + enabled: true + max_hourly_price: 0.40 # comment out to use default +``` + +### Reducing Costs + +The most important thing you can do to reduce costs is to critically evaluate which +experimental results can be re-used from previous publications. That said, when +conducting new experiments on AWS we have the following recommendations to reduce costs: + + - Use spot instances with a fixed maximum price: set `aws.ec2.spot.enabled: true` and `aws.ec2.spot.max_hourly_price`. + Check which region has [the lowest spot instance prices](https://aws.amazon.com/ec2/spot/) + and configure `aws.region` accordingly. + - Skip the framework installation process by providing a docker image and setting `aws.docker_enabled: true`. + - Set up [AWS Budgets](https://aws.amazon.com/aws-cost-management/aws-budgets/) + to get alerts early if forecasted usage exceeds the budget. It should also be + technically possibly to automatically shut down all running instances in a region + if a budget is exceeded, but this naturally leads to a loss of experimental results, so + it is best avoided. \ No newline at end of file diff --git a/docs/using/configuration.md b/docs/using/configuration.md new file mode 100644 index 000000000..e95350c91 --- /dev/null +++ b/docs/using/configuration.md @@ -0,0 +1,51 @@ +# Configuration + +The AutoML benchmark has a host of settings that can be configured from a `yaml` file. +It is possible to write your own configuration file that overrides the default behavior +in a flexible manner. + +## Configuration Options + +The default configuration options can be found in the +[`resources/config.yaml`](GITHUB/resources/config.yaml) file. + +```{ .yaml title="resources/config.yaml" .limit_max_height } +--8<-- "resources/config.yaml" +``` + +## Custom Configurations +To override default configuration, create your custom `config.yaml` file under the +`user_dir` (specified by the `--userdir` parameter of `runbenchmark.py`). +The application will automatically load this custom file and apply it on top of the defaults. + +When specifying filepaths, configurations support the following placeholders: + +| Placeholder | Replaced By Value Of | Default | Function | +|:------------|:---------------------|:----------------------------|:-----------------------------------------------------------------------| +| `{input}` | `input_dir` | `~/.openml/cache` | Folder from which datasets are loaded (and/or downloaded) | +| `{output}` | `output_dir` | `./results` | Folder where all outputs (results, logs, predictions, ...) are stored. 
| +| `{user}` | `user_dir` | `~/.config/automlbenchmark` | Folder containing custom configuration files. | +| `{root}` | `root_dir` | Detected at runtime | The root folder of the `automlbenchmark` application. | + +For example, including the following snippet in your custom configuration when +`user_dir` is `~/.config/automlbenchmark` (which it is by default) changes your +input directory to `~/.config/automlbenchmark/data` : + +```yaml title="examples/custom/config.yaml" +--8<-- "examples/custom/config.yaml:6:7" +``` + +!!! tip "Multiple Configuration Files" + It is possible to have multiple configuration files: + just create a folder for each `config.yaml` file and use that folder as your + `user_dir` using `--userdir /path/to/config/folder` when invoking `runbenchmark.py`. + + +Below is an example of a configuration file which **1.** changes the directory the +datasets are loaded from, **2.** specifies additional paths to look up framework, +benchmark, and constraint definitions, **3.** also makes those available in an S3 bucket +when running in AWS mode, and **4.** changes the default EC2 instance type for AWS mode. + +```yaml title="examples/custom/config.yaml" +--8<-- "examples/custom/config.yaml:3" +``` diff --git a/docs/using/parameters.md b/docs/using/parameters.md new file mode 100644 index 000000000..e0494e8db --- /dev/null +++ b/docs/using/parameters.md @@ -0,0 +1,94 @@ +# Parameters of `runbenchmark.py` + +The parameters of the `runbenchmark.py` script can be shown with: + +```{ .text title="python runbenchmark.py --help" .limit_max_height } +usage: runbenchmark.py [-h] [-m {local,aws,docker,singularity}] [-t [task_id ...]] [-f [fold_num ...]] [-i input_dir] [-o output_dir] [-u user_dir] [-p parallel_jobs] [-s {auto,skip,force,only}] [-k [true|false]] + [-e] [--logging LOGGING] [--openml-run-tag OPENML_RUN_TAG] + framework [benchmark] [constraint] + +positional arguments: + framework The framework to evaluate as defined by default in resources/frameworks.yaml. + To use a labelled framework (i.e. a framework defined in resources/frameworks-{label}.yaml), + use the syntax {framework}:{label}. + benchmark The benchmark type to run as defined by default in resources/benchmarks/{benchmark}.yaml, + a path to a benchmark description file, or an openml suite or task. + OpenML references should be formatted as 'openml/s/X' and 'openml/t/Y', + for studies and tasks respectively. Use 'test.openml/s/X' for the + OpenML test server. + (default: 'test') + constraint The constraint definition to use as defined by default in resources/constraints.yaml. + (default: 'test') + +optional arguments: + -h, --help show this help message and exit + -m {local,aws,docker,singularity}, --mode {local,aws,docker,singularity} + The mode that specifies how/where the benchmark tasks will be running. + (default: 'local') + -t [task_id ...], --task [task_id ...] + The specific task name (as defined in the benchmark file) to run. + When an OpenML reference is used as benchmark, the dataset name should be used instead. + If not provided, then all tasks from the benchmark will be run. + -f [fold_num ...], --fold [fold_num ...] + If task is provided, the specific fold(s) to run. + If fold is not provided, then all folds from the task definition will be run. + -i input_dir, --indir input_dir + Folder from where the datasets are loaded by default. 
+ (default: '/Users/pietergijsbers/.openml') + -o output_dir, --outdir output_dir + Folder where all the outputs should be written.(default: '/Users/pietergijsbers/repositories/forks/automlbenchmark-fork/results') + -u user_dir, --userdir user_dir + Folder where all the customizations are stored.(default: '~/.config/automlbenchmark') + -p parallel_jobs, --parallel parallel_jobs + The number of jobs (i.e. tasks or folds) that can run in parallel. + A hard limit is defined by property `job_scheduler.max_parallel_jobs` + in `resources/config.yaml`. + Override this limit in your custom `config.yaml` file if needed. + Supported only in aws mode or container mode (docker, singularity). + (default: 1) + -s {auto,skip,force,only}, --setup {auto,skip,force,only} + Framework/platform setup mode. Available values are: + • auto: setup is executed only if strictly necessary. + • skip: setup is skipped. + • force: setup is always executed before the benchmark. + • only: only setup is executed (no benchmark). + (default: 'auto') + -k [true|false], --keep-scores [true|false] + Set to true (default) to save/add scores in output directory. + -e, --exit-on-error If set, terminates on the first task that does not complete with a model. + --logging LOGGING Set the log levels for the 3 available loggers: + • console + • app: for the log file including only logs from amlb (.log extension). + • root: for the log file including logs from libraries (.full.log extension). + Accepted values for each logger are: notset, debug, info, warning, error, fatal, critical. + Examples: + --logging=info (applies the same level to all loggers) + --logging=root:debug (keeps defaults for non-specified loggers) + --logging=console:warning,app:info + (default: 'console:info,app:debug,root:info') + --openml-run-tag OPENML_RUN_TAG + Tag that will be saved in metadata and OpenML runs created during upload, must match '([a-zA-Z0-9_\-\.])+'. +``` + + +## Profiling the application + +Currently, the application provides a global flag `--profiling` to activate profiling +for some specific methods that can be slow or memory intensive: + +```bash +python runbenchmark.py randomforest --profiling +``` + +Not all methods and functions are not profiled, but if you need to profile more, +you just need to decorate the function with the `@profile()` decorator (from `amlb.utils`). +Profiling reports on memory usage and function durations: + +```{ .text title="Example of profiling logs" } +[PROFILING] `amlb.datasets.openml.OpenmlLoader.load` executed in 7.456s. +[PROFILING] `amlb.datasets.openml.OpenmlDatasplit.data` returned object size: 45.756 MB. +[PROFILING] `amlb.datasets.openml.OpenmlDatasplit.data` memory change; process: +241.09 MB/379.51 MB, resident: +241.09 MB/418.00 MB, virtual: +230.01 MB/4918.16 MB. +[PROFILING] `amlb.data.Datasplit.X_enc` executed in 6.570s. +[PROFILING] `amlb.data.Datasplit.release` executed in 0.007s. +[PROFILING] `amlb.data.Datasplit.release` memory change; process: -45.73 MB/238.80 MB, resident: +0.00 MB/414.60 MB, virtual: +0.00 MB/4914.25 MB. +``` \ No newline at end of file diff --git a/docs/using/result_analysis.md b/docs/using/result_analysis.md new file mode 100644 index 000000000..24d8d9a06 --- /dev/null +++ b/docs/using/result_analysis.md @@ -0,0 +1,217 @@ +# Results + +The AutoML benchmark produces many result files, such as logs, performance records, +and meta-data of the experiments. Some of these files can also be automatically parsed +and visualized by notebooks we provide. 
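+
+If you prefer to explore the results programmatically rather than through the notebooks, the main summary file is plain CSV and loads directly into pandas. Below is a minimal sketch, assuming the default `results/results.csv` location and the columns described in the following sections.
+
+```python
+import pandas as pd
+
+# Load the aggregated results file produced by runbenchmark.py (default output_dir).
+results = pd.read_csv("results/results.csv")
+
+# Average the optimized metric per framework and task over the folds that completed.
+summary = (
+    results.dropna(subset=["result"])
+           .groupby(["framework", "task"])["result"]
+           .mean()
+           .unstack("task")
+)
+print(summary)
+```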
+ +## Output File Structure + +Except the logs, all the files generated by the application are in easy to process +`csv` or `json` format, and they are all generated in a subfolder of the `output_dir` +unique for each benchmark run. + +For example: +```text +results/randomforest.test.test.local.20201204T192714 +|-- predictions +| |-- cholesterol +| | |-- 0 +| | | |-- metadata.json +| | | `-- predictions.csv +| | `-- 1 +| | |-- metadata.json +| | `-- predictions.csv +| |-- iris +| | |-- 0 +| | | |-- metadata.json +| | | `-- predictions.csv +| | `-- 1 +| | |-- metadata.json +| | `-- predictions.csv +| `-- kc2 +| |-- 0 +| | |-- metadata.json +| | `-- predictions.csv +| `-- 1 +| |-- metadata.json +| `-- predictions.csv +`-- scores + |-- RandomForest.benchmark_test.csv + `-- results.csv +``` + +### `results.csv` + +Here is a sample `results.csv` file from a test run against the `RandomForest` framework: + + +=== "Produced CSV" + + ```csv + id,task,framework,constraint,fold,result,metric,mode,version,params,tag,utc,duration,models,seed,info,acc,auc,balacc,logloss,mae,r2,rmse + openml.org/t/3913,kc2,RandomForest,test,0,0.865801,auc,local,0.23.2,{'n_estimators': 2000},,2020-12-04T19:27:46,3.2,2000,2633845682,,0.792453,0.865801,0.634199,0.350891,,, + openml.org/t/3913,kc2,RandomForest,test,1,0.86039,auc,local,0.23.2,{'n_estimators': 2000},,2020-12-04T19:27:52,3.0,2000,2633845683,,0.90566,0.86039,0.772727,0.406952,,, + openml.org/t/59,iris,RandomForest,test,0,0.126485,logloss,local,0.23.2,{'n_estimators': 2000},,2020-12-04T19:27:56,2.9,2000,2633845682,,0.933333,,0.933333,0.126485,,, + openml.org/t/59,iris,RandomForest,test,1,0.0271781,logloss,local,0.23.2,{'n_estimators': 2000},,2020-12-04T19:28:01,3.0,2000,2633845683,,1.0,,1.0,0.0271781,,, + openml.org/t/2295,cholesterol,RandomForest,test,0,44.3352,rmse,local,0.23.2,{'n_estimators': 2000},,2020-12-04T19:28:05,3.0,2000,2633845682,,,,,,35.6783,-0.014619,44.3352 + openml.org/t/2295,cholesterol,RandomForest,test,1,55.3163,rmse,local,0.23.2,{'n_estimators': 2000},,2020-12-04T19:28:10,3.1,2000,2633845683,,,,,,43.1808,-0.0610752,55.3163 + ``` + +=== "Readable Table" + + ```text + id task framework constraint fold result metric mode version params utc duration models seed acc auc balacc logloss mae r2 rmse + 0 openml.org/t/3913 kc2 RandomForest test 0 0.865801 auc local 0.23.2 {'n_estimators': 2000} 2020-12-04T19:27:46 3.2 2000 2633845682 0.792453 0.865801 0.634199 0.350891 NaN NaN NaN + 1 openml.org/t/3913 kc2 RandomForest test 1 0.860390 auc local 0.23.2 {'n_estimators': 2000} 2020-12-04T19:27:52 3.0 2000 2633845683 0.905660 0.860390 0.772727 0.406952 NaN NaN NaN + 2 openml.org/t/59 iris RandomForest test 0 0.126485 logloss local 0.23.2 {'n_estimators': 2000} 2020-12-04T19:27:56 2.9 2000 2633845682 0.933333 NaN 0.933333 0.126485 NaN NaN NaN + 3 openml.org/t/59 iris RandomForest test 1 0.027178 logloss local 0.23.2 {'n_estimators': 2000} 2020-12-04T19:28:01 3.0 2000 2633845683 1.000000 NaN 1.000000 0.027178 NaN NaN NaN + 4 openml.org/t/2295 cholesterol RandomForest test 0 44.335200 rmse local 0.23.2 {'n_estimators': 2000} 2020-12-04T19:28:05 3.0 2000 2633845682 NaN NaN NaN NaN 35.6783 -0.014619 44.3352 + 5 openml.org/t/2295 cholesterol RandomForest test 1 55.316300 rmse local 0.23.2 {'n_estimators': 2000} 2020-12-04T19:28:10 3.1 2000 2633845683 NaN NaN NaN NaN 43.1808 -0.061075 55.3163 + ``` + +Here is a short description of each column: + + - `id`: a identifier for the dataset used in this result. 
For convenience, we use the link to the OpenML task by default. + - `task`: the task name as defined in the benchmark definition. + - `framework`: the framework name as defined in the framework definition. + - `fold`: the dataset fold being used for this job. Usually, we're using 10 folds, so the fold varies from 0 to 9. + - `result`: the result score, this is the score for the metric that the framework was trying to optimize. For example, for binary classification, the default metrics defined in `resources/config.yaml` are `binary: ['auc', 'acc']`; this means that the frameworks should try to optimize `auc` and the final `auc` score will become the `result` value, the other metrics (here `acc`) are then computed for information. + - `mode`: one of `local`, `docker`, `aws`, `aws+docker`: tells where/how the job was executed. + - `version`: the version of the framework being benchmarked. + - `params`: if any, a JSON representation of the params defined in the framework definition. This allows to see clearly if some tuning was done for example. + - `tag`: the branch tag of the `automlbenchmark` app that was running the job. + - `utc`: the UTC timestamp at the job completion. + - `duration`: the training duration: the framework integration is supposed to provide this information to ensure that it takes only into account the time taken by the framework itself. When benchmarking large data, the application can use a significant amount of time to prepare the data: this additional time doesn't appear in this `duration` column. + - `models`: for some frameworks, it is possible to know how many models in total were trained by the AutoML framework. + - `seed`: the seed or random state passed to the framework. With some frameworks, it is enough to obtain reproducible results. Note that the seed can be specified at the command line using `-Xseed=` arg (for example `python randomforest -Xseed=1452956522`): when there are multiple folds, the seed is then incremented by the fold number. + - `info`: additional info in text format, this usually contains error messages if the job failed. + - `acc`, `auc`, `logloss` metrics: all the metrics that were computed based on the generated predictions. For each job/row, one of them matches the `result` column, the others are purely informative. Those additional metric columns are simply added in alphabetical order. + +### Predictions Directory + +For each evaluation, the framework integration must generate a predictions file that +will be used by the application to compute the scores. This predictions file is saved +under the `predictions` subfolder as shown [above](#output-file-structure) and +follows the naming convention: `{framework}_{task}_{fold}.csv`. + +The `csv` file contains a header row and contains the following columns, in order: + - For classification tasks only, there is first one column per class, sorted alphabetically. + Each column contains the probability of the sample belonging to that class, as predicted by the AutoML framework. + If a framework does not provide probabilities, it will be 1 for the predicted class and 0 otherwise. + - `predictions`: contains the predictions of the test predictor data by the model trained by the framework, + - `truth`: the true values of the test target data (`test.y`). 
+ +Here are examples of the first few samples for `KC2` (binary classification), +`iris` (multiclass classification), and `cholesterol` (regression): + +=== "KC2 (csv)" + + ```csv + no,yes,predictions,truth + 0.965857617846013,0.034142382153998944,no,no + 0.965857617846013,0.034142382153998944,no,no + 0.5845,0.4155,no,no + 0.6795,0.3205,no,no + 0.965857617846013,0.034142382153998944,no,no + ``` +=== "KC2 (table)" + + | no | yes | predictions | truth | + |-------------------|----------------------|-------------|-------| + | 0.965857617846013 | 0.034142382153998944 | no | no | + | 0.965857617846013 | 0.034142382153998944 | no | no | + | 0.5845 | 0.4155 | no | no | + | 0.6795 | 0.3205 | no | no | + | 0.965857617846013 | 0.034142382153998944 | no | no | + +===! "iris (csv)" + + ```csv + Iris-setosa,Iris-versicolor,Iris-virginica,predictions,truth + 1.0,0.0,0.0,Iris-setosa,Iris-setosa + 0.9715,0.028,0.0005,Iris-setosa,Iris-setosa + 1.0,0.0,0.0,Iris-setosa,Iris-setosa + 1.0,0.0,0.0,Iris-setosa,Iris-setosa + 1.0,0.0,0.0,Iris-setosa,Iris-setosa + 0.0,1.0,0.0,Iris-versicolor,Iris-versicolor + ``` + + +=== "iris (table)" + + | Iris-setosa | Iris-versicolor | Iris-virginica | predictions | truth | + |-------------|-----------------|----------------|-----------------|-----------------| + | 1.0 | 0.0 | 0.0 | Iris-setosa | Iris-setosa | + | 0.9715 | 0.028 | 0.0005 | Iris-setosa | Iris-setosa | + | 1.0 | 0.0 | 0.0 | Iris-setosa | Iris-setosa | + | 1.0 | 0.0 | 0.0 | Iris-setosa | Iris-setosa | + | 1.0 | 0.0 | 0.0 | Iris-setosa | Iris-setosa | + | 0.0 | 1.0 | 0.0 | Iris-versicolor | Iris-versicolor | + + +===! "cholesterol (csv)" + + ```csv + predictions,truth + 241.204,207.0 + 248.9575,249.0 + 302.278,268.0 + 225.9215,234.0 + 226.6995,201.0 + ``` + +=== "cholesterol (table)" + + | predictions | truth | + |-------------|-------| + | 241.204 | 207.0 | + | 248.9575 | 249.0 | + | 302.278 | 268.0 | + | 225.9215 | 234.0 | + | 226.6995 | 201.0 | + + +### Extract more information + +For some frameworks, it is also possible to extract more detailed information, +in the form of `artifacts` that are saved after the training. +Examples of those artifacts are logs generated by the framework, models or descriptions +of the models trained by the framework, predictions for each of the model trained by the +AutoML framework. By default, those artifacts are not saved, and not all frameworks +provide the same artifacts. This is why the artifacts to be stored have to be specified +in the framework definition (_before_ running the experiments!). By convention, +this can be achieved by specifying the `params._save_artifacts` parameter. For example: + +=== "autosklearn" + + Save model descriptions under the `models` subfolder: + ```yaml + autosklearn_debug: + extends: autosklearn + params: + _save_artifacts: ['models'] + ``` + +=== "H2O" + + Save the leaderboard and models under the `models` subfolder, + and the H2O logs under `logs` subfolder: + ```yaml + H2OAutoML_debug: + extends: H2OAutoML + params: + _save_artifacts: ['leaderboard', 'logs', 'models'] + ``` + +=== "TPOT" + + Save the description of models for the Pareto frontin the `models` subfolder: + ```yaml + TPOT_debug: + extends: TPOT + params: + _save_artifacts: ['models'] + ``` + +The framework integrations themselves determine where the artifacts are saved, +this is typically not configurable from the framework definition. 
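+
+To illustrate the pattern, here is a sketch of how an integration's `exec.py` might honour such a `_save_artifacts` parameter. It assumes the `config.framework_params` attribute and `config.output_dir` used by existing integrations; the exact helpers and saved content differ per integration, so treat this as a template rather than the canonical implementation.
+
+```python
+import logging
+import os
+
+log = logging.getLogger(__name__)
+
+
+def save_artifacts(automl_estimator, config):
+    # `_`-prefixed params are not forwarded to the framework; they only drive custom logic here.
+    artifacts = config.framework_params.get('_save_artifacts', [])
+    try:
+        if 'models' in artifacts:
+            models_dir = os.path.join(config.output_dir, 'models')
+            os.makedirs(models_dir, exist_ok=True)
+            # Placeholder: dump whatever model description the framework exposes.
+            with open(os.path.join(models_dir, 'models.txt'), 'w') as f:
+                f.write(str(automl_estimator))
+    except Exception:
+        log.warning("Error when saving artifacts.", exc_info=True)
+```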
diff --git a/docs/using/upload_to_openml.md b/docs/using/upload_to_openml.md new file mode 100644 index 000000000..ce50145da --- /dev/null +++ b/docs/using/upload_to_openml.md @@ -0,0 +1,26 @@ + +### Uploading results to OpenML +The `upload_results.py` script can be used to upload results to OpenML with the following usage: +```text +>python upload_results.py --help +usage: Script to upload results from the benchmark to OpenML. [-h] [-i INPUT_DIRECTORY] [-a APIKEY] [-m MODE] [-x] [-v] [-t TASK] + +optional arguments: + -h, --help show this help message and exit + -i INPUT_DIRECTORY, --input-directory INPUT_DIRECTORY + Directory that stores results from the runbenchmark.py invocation. By default use the most recent folder in the results folder as + specified in the configuration. + -a APIKEY, --api-key APIKEY + OpenML API key to use for uploading results. + -m MODE, --mode MODE Run mode (default=check). + • check: only report whether results can be uploaded. + • upload: upload all complete results. + -x, --fail-fast Stop as soon as a task fails to upload due to an error during uploading. + -v, --verbose Output progress to console. + -t TASK, --task TASK Only upload results for this specific task. +``` + +Note that the default behavior does not upload data but only verifies data is complete. +We strongly encourage you to only upload your data after verifying all expected results are complete. +The OpenML Python package is used for uploading results, so to ensure your API credentials are configured, please refer to their [configuration documentation](https://openml.github.io/openml-python/master/usage.html#installation-set-up). +Results obtained on tasks on the test server (e.g. through the `--test-server` option of `runbenchmark.py`) are uploaded to the test server and don't require additional authentication. diff --git a/docs/website/automl_overview.html b/docs/website/automl_overview.html new file mode 100644 index 000000000..e72f07c04 --- /dev/null +++ b/docs/website/automl_overview.html @@ -0,0 +1,17 @@ + + + AMLB + + + + +

+ The old AutoML framework overview page no longer exists; you will be + redirected to the new page. If you are not redirected within 3 seconds, + you can find it at + https://openml.github.io/automlbenchmark/frameworks.html. +

+ + diff --git a/docs/website/benchmark_datasets.html b/docs/website/benchmark_datasets.html new file mode 100644 index 000000000..f9b28623f --- /dev/null +++ b/docs/website/benchmark_datasets.html @@ -0,0 +1,18 @@ + + + AMLB + + + + +

+ You probably found this link from our 2019 paper. + Unfortunately, we updated our site but the new website does not yet contain a page with a description of our dataset selection strategy.
+ You will be redirected to the markdown file from which the old page was generated, so you can still view the old text.
+ If you are not redirected within 5 seconds, you can find it at + https://github.com/openml/automlbenchmark/blob/2fe3bd41768ce28387f827791bd57ef1a5a84783/docs/benchmark_datasets.md. +

+ + diff --git a/docs/website/frameworks.html b/docs/website/frameworks.html new file mode 100644 index 000000000..08315b7d3 --- /dev/null +++ b/docs/website/frameworks.html @@ -0,0 +1,957 @@ + + + + AMLB: Frameworks + + + + + + + + + + +
+
+

AutoML Frameworks

+ There is more to an AutoML system than just its performance. This page + contains more information about the integrated AutoML frameworks, + including links to their papers, repositories + , and + documentation 📖. Summaries are taken directly from the respective + documentation pages. Want to integrate your own framework? + Adding your own framework + is relatively simple. +
+
+
+
+ +

AutoGluon

+ +
+
+ AutoGluon enables easy-to-use and easy-to-extend AutoML with a focus + on automated stack ensembling, deep learning, and real-world + applications spanning image, text, and tabular data. +
+ +
+
+

+ AutoGluon-Tabular: Robust and Accurate AutoML for Structured + Data +

+
+ Nick Erickson, Jonas Mueller, Alexander Shirkov, Hang Zhang, + Pedro Larroy, Mu Li, Alexander Smola +
+
+ We introduce AutoGluon-Tabular, an open-source AutoML framework + that requires only a single line of Python to train highly + accurate machine learning models on an unprocessed tabular + dataset such as a CSV file. Unlike existing AutoML frameworks + that primarily focus on model/hyperparameter selection, + AutoGluon-Tabular succeeds by ensembling multiple models and + stacking them in multiple layers. Experiments reveal that our + multi-layer combination of many models offers better use of + allocated training time than seeking out the best. A second + contribution is an extensive evaluation of public and commercial + AutoML platforms including TPOT, H2O, AutoWEKA, auto-sklearn, + AutoGluon, and Google AutoML Tables. Tests on a suite of 50 + classification and regression tasks from Kaggle and the OpenML + AutoML Benchmark reveal that AutoGluon is faster, more robust, + and much more accurate. We find that AutoGluon often even + outperforms the best-in-hindsight combination of all of its + competitors. In two popular Kaggle competitions, AutoGluon beat + 99% of the participating data scientists after merely 4h of + training on the raw data. +
+ +
+
+ +
+
+
+ +

Auto-sklearn

+ +
+
+ Auto-sklearn is an automated machine learning toolkit and a drop-in + replacement for a scikit-learn estimator. Auto-sklearn frees a + machine learning user from algorithm selection and hyperparameter + tuning. It leverages recent advantages in Bayesian optimization, + meta-learning and ensemble construction. +
+ +
+
+

+ Auto-Sklearn 2.0: Hands-free AutoML via Meta-Learning +

+
+ Matthias Feurer, Katharina Eggensperger, Stefan Falkner, Marius + Lindauer, Frank Hutter +
+
+ Automated Machine Learning (AutoML) supports practitioners and + researchers with the tedious task of designing machine learning + pipelines and has recently achieved substantial success. In this + paper we introduce new AutoML approaches motivated by our + winning submission to the second ChaLearn AutoML challenge. We + develop PoSH Auto-sklearn, which enables AutoML systems to work + well on large datasets under rigid time limits using a new, + simple and meta-feature-free meta-learning technique and employs + a successful bandit strategy for budget allocation. However, + PoSH Auto-sklearn introduces even more ways of running AutoML + and might make it harder for users to set it up correctly. + Therefore, we also go one step further and study the design + space of AutoML itself, proposing a solution towards truly + hands-free AutoML. Together, these changes give rise to the next + generation of our AutoML system, Auto-sklearn 2.0 . We verify + the improvements by these additions in a large experimental + study on 39 AutoML benchmark datasets and conclude the paper by + comparing to other popular AutoML frameworks and Auto-sklearn + 1.0 , reducing the relative error by up to a factor of 4.5, and + yielding a performance in 10 minutes that is substantially + better than what Auto-sklearn 1.0 achieves within an hour. +
+ +
+
+

+ Efficient and Robust Automated Machine Learning +

+
+ Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost + Springenberg, Manuel Blum, Frank Hutter +
+
+ The success of machine learning in a broad range of applications + has led to an ever-growing demand for machine learning systems + that can be used off the shelf by non-experts. To be effective + in practice, such systems need to automatically choose a good + algorithm and feature preprocessing steps for a new dataset at + hand, and also set their respective hyperparameters. Recent work + has started to tackle this automated machine learning (AutoML) + problem with the help of efficient Bayesian optimization + methods. In this work we introduce a robust new AutoML system + based on scikit-learn (using 15 classifiers, 14 feature + preprocessing methods, and 4 data preprocessing methods, giving + rise to a structured hypothesis space with 110 hyperparameters). + This system, which we dub auto-sklearn, improves on existing + AutoML methods by automatically taking into account past + performance on similar datasets, and by constructing ensembles + from the models evaluated during the optimization. Our system + won the first phase of the ongoing ChaLearn AutoML challenge, + and our comprehensive analysis on over 100 diverse datasets + shows that it substantially outperforms the previous state of + the art in AutoML. We also demonstrate the performance gains due + to each of our contributions and derive insights into the + effectiveness of the individual components of auto-sklearn. +
+ +
+
+ +
+
+
+ +

FLAML

+ +
+
+ FLAML is a lightweight Python library that finds accurate machine + learning models automatically, efficiently and economically. It + frees users from selecting learners and hyperparameters for each + learner. +
+ +
+
+

+ FLAML: A Fast and Lightweight AutoML Library +

+
+ Chi Wang, Qingyun Wu, Markus Weimer, Erkang Zhu +
+
+ We study the problem of using low computational cost to automate + the choices of learners and hyperparameters for an ad-hoc + training dataset and error metric, by conducting trials of + different configurations on the given training data. We + investigate the joint impact of multiple factors on both trial + cost and model error, and propose several design guidelines. + Following them, we build a fast and lightweight library FLAML + which optimizes for low computational resource in finding + accurate models. FLAML integrates several simple but effective + search strategies into an adaptive system. It significantly + outperforms top-ranked AutoML libraries on a large open source + AutoML benchmark under equal, or sometimes orders of magnitude + smaller budget constraints. +
+ +
+
+ +
+
+
+ +

GAMA

+ +
+
+ GAMA is developed for AutoML research and features a flexible AutoML + pipeline, which makes it easy to develop and evaluate new AutoML + components. GAMA's benchmarking configuration features evolutionary + optimization and ensemble construction. +
+ +
+
+

+ GAMA: A General Automated Machine Learning Assistant +

+
+ Pieter Gijsbers, Joaquin Vanschoren +
+ + + +
+ The General Automated Machine learning Assistant (GAMA) is a + modular AutoML system developed to empower users to track and + control how AutoML algorithms search for optimal machine + learning pipelines, and facilitate AutoML research itself. In + contrast to current, often black-box systems, GAMA allows users + to plug in different AutoML and post-processing techniques, logs + and visualizes the search process, and supports easy + benchmarking. It currently features three AutoML search + algorithms, two model post-processing steps, and is designed to + allow for more components to be added. +
+ +
+
+ +
+
+
+ +

H2O AutoML

+ +
+
+ H2O's AutoML can be used for automating the machine learning + workflow, which includes automatic training and tuning of many + models within a user-specified time-limit. H2O offers a number of + model explainability methods that apply to AutoML objects (groups of + models), as well as individual models (e.g. leader model). + Explanations can be generated automatically with a single function + call, providing a simple interface to exploring and explaining the + AutoML models. +
+ +
+
+

+ H2O AutoML: Scalable Automatic Machine Learning +

+
Erin LeDell and Sébastien Poirier
+
+ H2O is an open source, distributed machine learning platform + designed to scale to very large datasets, with APIs in R, + Python, Java and Scala. We present H2O AutoML, a highly + scalable, fully-automated, supervised learning algorithm which + automates the process of training a large selection of + candidate models and stacked ensembles within a single function. + The result of the AutoML run is a “leaderboard”: a ranked list + of models, all of which can be easily exported for use in a + production environment. Models in the leaderboard can be + ranked by numerous model performance metrics or other model + attributes such as training time or average per-row prediction + speed. The H2O AutoML algorithm relies on the efficient training + of H2O machine learning algorithms to produce a large number + of models in a short amount of time. H2O AutoML uses a + combination of fast random search and stacked ensembles to + achieve results competitive with, and often better than, other + frameworks which rely on more complex model tuning techniques + such as Bayesian optimization or genetic algorithms. H2O AutoML + trains a variety of algorithms (e.g. GBMs, Random Forests, + Deep Neural Networks, GLMs), yielding a healthy amount of + diversity across candidate models, which can be exploited by + stacked ensembles to produce a powerful final model. The + effectiveness of this technique is reflected in the OpenML + AutoML Benchmark, which compares the performance of several of + the most well known, open source AutoML systems across a number + of datasets. +
+ +
+
+ +
+
+
+ +

LightAutoML

+ +
+
+ LightAutoML is an open-source Python library aimed at automated machine + learning. It is designed to be lightweight and efficient for various + tasks with tabular and text data. +
+ +
+
Paper to be added.
+
+ +
+ Alexander Ryzhkov, Anton Vakhrushev, Dmitry Simakov, Vasilii + Bunakov, Rinchin Damdinov, Alexander Kirilin, Pavel Shvets +
+ +
+
+ +
+
+
+ +

mljar-supervised

+ +
+
+ The mljar-supervised is an Automated Machine Learning Python package + that works with tabular data. It is designed to save time for a data + scientist. It abstracts the common way to preprocess the data, + construct the machine learning models, and perform hyper-parameters + tuning to find the best model 🏆. It is no black-box as you can see + exactly how the ML pipeline is constructed (with a detailed Markdown + report for each ML model). +
+ +
+
No paper available.
+
+ +
+
+
+ +

TPOT

+ +
+
+ TPOT is a Python Automated Machine Learning tool that optimizes + machine learning pipelines using genetic programming. It has a focus + on optimizing models for biomedical data. +
+ +
+
+

+ Automating biomedical data science through tree-based pipeline + optimization +

+
+ Randal S. Olson, Ryan J. Urbanowicz, Peter C. Andrews, Nicole A. + Lavender, La Creis Kidd, and Jason H. Moore +
+
+ Automated machine learning (AutoML) systems are helpful data + science assistants designed to scan data for novel features, + select appropriate supervised learning models and optimize their + parameters. For this purpose, Tree-based Pipeline Optimization + Tool (TPOT) was developed using strongly typed genetic + programming (GP) to recommend an optimized analysis pipeline for + the data scientist’s prediction problem. However, like other + AutoML systems, TPOT may reach computational resource limits + when working on big data such as whole-genome expression data. +
+ +
+
+ +
+
+
+ + + diff --git a/docs/website/img/logos/GAMA-icon.png b/docs/website/img/logos/GAMA-icon.png new file mode 100644 index 000000000..c9626aa28 Binary files /dev/null and b/docs/website/img/logos/GAMA-icon.png differ diff --git a/docs/website/img/logos/GAMA.png b/docs/website/img/logos/GAMA.png new file mode 100644 index 000000000..0ba100f0c Binary files /dev/null and b/docs/website/img/logos/GAMA.png differ diff --git a/docs/website/img/logos/GitHub-Mark-32px.png b/docs/website/img/logos/GitHub-Mark-32px.png new file mode 100644 index 000000000..8b25551a9 Binary files /dev/null and b/docs/website/img/logos/GitHub-Mark-32px.png differ diff --git a/docs/website/img/logos/GitHub-Mark-64px.png b/docs/website/img/logos/GitHub-Mark-64px.png new file mode 100644 index 000000000..182a1a3f7 Binary files /dev/null and b/docs/website/img/logos/GitHub-Mark-64px.png differ diff --git a/docs/website/img/logos/GitHub-Mark-Light-64px.png b/docs/website/img/logos/GitHub-Mark-Light-64px.png new file mode 100644 index 000000000..73db1f61f Binary files /dev/null and b/docs/website/img/logos/GitHub-Mark-Light-64px.png differ diff --git a/docs/website/img/logos/LightAutoML_logo_small.png b/docs/website/img/logos/LightAutoML_logo_small.png new file mode 100644 index 000000000..8d268e390 Binary files /dev/null and b/docs/website/img/logos/LightAutoML_logo_small.png differ diff --git a/docs/website/img/logos/auto-sklearn.png b/docs/website/img/logos/auto-sklearn.png new file mode 100644 index 000000000..65141af67 Binary files /dev/null and b/docs/website/img/logos/auto-sklearn.png differ diff --git a/docs/website/img/logos/autogluon.png b/docs/website/img/logos/autogluon.png new file mode 100644 index 000000000..8afef59ab Binary files /dev/null and b/docs/website/img/logos/autogluon.png differ diff --git a/docs/website/img/logos/flaml.svg b/docs/website/img/logos/flaml.svg new file mode 100644 index 000000000..5ae22b683 --- /dev/null +++ b/docs/website/img/logos/flaml.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/website/img/logos/h2o-automl-logo.jpeg b/docs/website/img/logos/h2o-automl-logo.jpeg new file mode 100644 index 000000000..68ab95b32 Binary files /dev/null and b/docs/website/img/logos/h2o-automl-logo.jpeg differ diff --git a/docs/website/img/logos/mljar.png b/docs/website/img/logos/mljar.png new file mode 100644 index 000000000..a61427bef Binary files /dev/null and b/docs/website/img/logos/mljar.png differ diff --git a/docs/website/img/logos/tpot.jpeg b/docs/website/img/logos/tpot.jpeg new file mode 100644 index 000000000..fd4d56add Binary files /dev/null and b/docs/website/img/logos/tpot.jpeg differ diff --git a/docs/website/img/shiny.png b/docs/website/img/shiny.png new file mode 100644 index 000000000..9942dfeec Binary files /dev/null and b/docs/website/img/shiny.png differ diff --git a/docs/website/index.html b/docs/website/index.html new file mode 100644 index 000000000..fdfc40b7b --- /dev/null +++ b/docs/website/index.html @@ -0,0 +1,424 @@ + + + + AMLB: an AutoML Benchmark + + + + + + + + + + + +
+
+
AMLB
+
An AutoML Benchmark
+
+ Comparing different AutoML frameworks is notoriously challenging. AMLB + is an open and extensible benchmark that follows best practices and + avoids common mistakes when comparing AutoML frameworks. +
+
+
+
+
+

Easy to Use

+
+ You can run an entire benchmark with a single command! The AutoML + benchmark tool automates the installation of the AutoML framework, + the experimental setup, and the execution of the experiment. +
+
+ > python runbenchmark.py autosklearn openml/s/269 1h8c +
+
+ + + + + Installation guide + +
+
+

Visualize Results

+
+ The results can be visualized with our + interactive visualization tool + or one of our + notebooks. This includes stripplots, critical difference diagrams, + Bradley-Terry trees, and more! +
+ +
+ + + + + Results + +
+
+

Easy to Extend

+
+ Adding a framework + and + adding a dataset + is easy. These extensions can be kept completely private, or + shared with the community. For datasets, it is even possible to + work with + OpenML + tasks and suites directly! +
+ + + + + + + + +
+ + + + + Extending the benchmark + +
+
+
+
+
+
+

📄 Paper

+
+ A preprint of our most recent paper is available on + ArXiv. It includes an in-depth discussion of the different design + decisions and its limitations, as well as a multi-faceted analysis + of results from a large-scale comparison across 9 frameworks on more + than 100 tasks. + +
+
+
+

🧑‍💻 Code

+
+ The entire benchmark tool is open source and developed on + Github. The Github discussion board and issue trackers are the main way + for us to interact with the community. +
+
+
+ +
+
+

AutoML Frameworks

+
+
+ Many AutoML frameworks are already integrated with the AutoML + benchmark tool and + adding more is easy. + We have more information about the different frameworks on our + framework overview page. The icons below + link directly to their respective Github repositories. +
+
+ + + + + + + + +
+
+
+
+
+

Community

+
+ We welcome any contributions to the AutoML benchmark. Our goal is to + provide the best benchmark tools for AutoML research and we can't do + that on our own. Contributions are appreciated in many forms, + including feedback on the benchmark design, feature requests, bug + reports, code and documentation contributions, and more. Why not stop + by on our + welcome board + and let us know what got you interested in the benchmark? +
+
+
+ + + diff --git a/docs/website/papers.html b/docs/website/papers.html new file mode 100644 index 000000000..579aee363 --- /dev/null +++ b/docs/website/papers.html @@ -0,0 +1,289 @@ + + + + AMLB: Papers + + + + + + + + + + + +
+

Papers

+
+

AMLB: an AutoML Benchmark

+
+ Pieter Gijsbers, Marcos L. P. Bueno, Stefan Coors, Erin LeDell, + Sébastien Poirier, Janek Thomas, Bernd Bischl and Joaquin Vanschoren +
+ + + +
+ Comparing different AutoML frameworks is notoriously challenging and + often done incorrectly. We introduce an open and extensible benchmark + that follows best practices and avoids common mistakes when comparing + AutoML frameworks. We conduct a thorough comparison of 9 well-known + AutoML frameworks across 71 classification and 33 regression tasks. + The differences between the AutoML frameworks are explored with a + multi-faceted analysis, evaluating model accuracy, its trade-offs with + inference time, and framework failures. We also use Bradley-Terry + trees to discover subsets of tasks where the relative AutoML framework + rankings differ. The benchmark comes with an open-source tool that + integrates with many AutoML frameworks and automates the empirical + evaluation process end-to-end: from framework installation and + resource allocation to in-depth evaluation. The benchmark uses public + data sets, can be easily extended with other AutoML frameworks and + tasks, and has a website with up-to-date results. +
+ +
+ +
+ @misc{https://doi.org/10.48550/arxiv.2207.12560,
+   doi = {10.48550/ARXIV.2207.12560},
+   url = {https://arxiv.org/abs/2207.12560},
+   author = {Gijsbers, Pieter and Bueno, Marcos L. P. and Coors, + Stefan and LeDell, Erin and Poirier, S\'{e}bastien and Thomas, Janek + and Bischl, Bernd and Vanschoren, Joaquin},
+   keywords = {Machine Learning (cs.LG), Machine Learning + (stat.ML), FOS: Computer and information sciences, FOS: Computer and + information sciences},
+   title = {AMLB: an AutoML Benchmark},
+   publisher = {arXiv},
+   year = {2022},
+   copyright = {Creative Commons Attribution 4.0 International} +
+ } +
+
+
+
+

An Open Source AutoML Benchmark

+
+ Pieter Gijsbers, Erin LeDell, Janek Thomas, Sébastien Poirier, Bernd + Bischl, Joaquin Vanschoren +
+ + + +
+ In recent years, an active field of research has developed around + automated machine learning (AutoML). Unfortunately, comparing + different AutoML systems is hard and often done incorrectly. We + introduce an open, ongoing, and extensible benchmark framework which + follows best practices and avoids common mistakes. The framework is + open-source, uses public datasets and has a website with up-to-date + results. We use the framework to conduct a thorough comparison of 4 + AutoML systems across 39 datasets and analyze the results. +
+ +
+ +
+ @article{amlb2019,
+   title={An Open Source AutoML Benchmark},
+   author={Gijsbers, P. and LeDell, E. and Poirier, S. and + Thomas, J. and Bischl, B. and Vanschoren, J.},
+   journal={arXiv preprint arXiv:1907.00909 [cs.LG]},
+   url={https://arxiv.org/abs/1907.00909},
+   note={Accepted at AutoML Workshop at ICML 2019},
+   year={2019}
} +
+
+
+
+ + + diff --git a/docs/website/results.html b/docs/website/results.html new file mode 100644 index 000000000..dc4c84cc8 --- /dev/null +++ b/docs/website/results.html @@ -0,0 +1,262 @@ + + + + AMLB: Results + + + + + + + + + + + +
+
+

Results

+
+
+ ⚠️ Our paper outlines + important limitations for the interpretation of + results. These limitations include: +
+ +
+
    +
  • + We use AutoML framework versions from + September 2021; many frameworks have since seen + major updates. +
  • +
  • + We use the "benchmark" modes of the frameworks, which generally + only optimize for performance. Most AutoML frameworks + have multiple modes to support different use cases. +
  • +
  • + Results cannot be used to draw conclusions about which + algorithm is best, as all frameworks differ in multiple ways. +
  • +
  • + Performance statistics are often independent of many + qualitative differences, such as ease of use or + interpretability. +
  • +
+ Please read Section 5.3 in our paper for a more elaborate discussion + of these and other limitations. +
+ +
+
+
+
+ All results are available as + raw files + 📂, and we also open-source the tools we used to generate the figures in + our paper. The best way to explore the results is through our + interactive + Shiny app. It loads the latest results by default. It is also possible to use + our + notebooks + which contain additional visualizations. +
+ +
+
+ + + diff --git a/docs/website/style.css b/docs/website/style.css new file mode 100644 index 000000000..117611739 --- /dev/null +++ b/docs/website/style.css @@ -0,0 +1,527 @@ +* { + margin: 0; + padding: 0; + box-sizing: border-box; + font-size: 18px; +} +.grey { + color: #495057; +} + +/* main: #1971c2 */ + +body { + font-family: "lato", sans-serif; + color: #343a40; +} + +section { + margin-bottom: 64px; +} + +a { + color: #1971c2; +} + +.page-content { + padding: 0 clamp(15%, 100%, calc((100vw - 1000px) / 2)); /* should maybe set width in px instead */ +} + +h1 { + font-size: 72px; + padding-bottom: 24px; +} + +h2 { + font-size: 48px; + padding-bottom: 24px; +} + +h3 { + font-size: 32px; + padding-bottom: 16px; +} + +footer { + color: #e7f5ff; + font-size: 14px; + background-color: #1971c2; + text-align: center; + padding: 8px; +} + +footer > a { + color: #e7f5ff; + font-size: 14px; +} + +.terminal { + background-color: #343a40; + color: #f8f9fa; + font-family: "Inconsolata", sans-serif; + padding: 8px; + margin: 16px 5%; +} + +/* NAVBAR */ +.construction-banner { + background-color: #1c7ed6; + color: white; + text-align: center; +} + +.navigation-bar { + background-color: #1971c2; + color: white; + text-transform: uppercase; + + display: flex; + gap: 24px; + justify-content: flex-end; + align-items: center; + padding: 16px; + height: 60px; +} + +nav a:first-child { + flex-grow: 1; +} + +.nav-icon { + height: 24px; + fill: white; +} + +.nav-icon-with-text { + height: 18px; +} + +.nav-icon:hover, +.nav-icon:active { + height: 28px; +} + +.nav-link:link, +.nav-link:visited { + text-decoration: none; + color: white; +} +.nav-link:hover, +.nav-link:active { + color: white; + font-weight: bold; +} + +/* TITLE */ + +.title, +.subtitle { + color: #343a40; + text-align: center; +} + +.title { + font-size: 74px; + padding-top: 48px; + font-weight: bold; +} + +.subtitle { + font-size: 62px; + font-weight: bold; + padding-bottom: 24px; +} + +.summary { + font-size: 24px; + color: #495057; + padding: 0 max((100% - 900px) / 2, 0px); /* does not seem to work with just the math expression */ +} + +/* CARDS */ + +.three-cols { + display: grid; + grid-template-columns: repeat(3, 1fr); + column-gap: 5%; +} + +.card { + box-shadow: 0 5px 10px rgba(0, 0, 0, 0.1); + border-radius: 5px; + padding: 16px; + + display: flex; + flex-direction: column; +} + +.card img { + padding: 16px 2%; +} + +.card > svg { + display: block; + margin: 16px auto; +} + +.flex-grow { + flex-grow: 1; +} + +.card > .card-nav { + color: #1971c2; + text-decoration: none; + + display: flex; + align-items: center; + justify-content: flex-start; + gap: 8px; +} + +.card > .card-nav > svg { + height: 24px; + fill: #1971c2; +} + +h3 { + font-size: 24px; + padding-bottom: 10px; +} + +/* Frameworks */ +/* .icon-grid { + padding: 24px; + display: grid; + grid-auto-rows: 60px; +} */ +.icon-flex { + margin-top: 24px; + display: flex; + flex-wrap: wrap; + justify-content: center; + align-items: center; + gap: 40px; +} + +.icon-flex a img { + height: 90px; +} +/* Paper */ + +.two-col { + display: grid; + grid-template-columns: 1fr 1fr; + column-gap: 2%; + + background-color: #1971c2; + color: #e7f5ff; + border-radius: 5px; +} + +.two-col a { + color: white; +} + +.two-col h3 { + color: white; +} + +.flat-card { + padding: 16px; +} + +/* FRAMEWORKS PAGE */ +.page-title { + margin-top: 72px; +} + +.framework-card-list { + background-color: #e7f5ff; + padding: 32px 5%; + border-radius: 10px; +} + +.framework-card-list > .accordion { + margin-bottom: 
32px; +} +/* +.framework-card { + background-color: white; + margin-bottom: 64; + padding: 32; + border-radius: 10px; +} */ +.framework-card:last-of-type { + margin-bottom: 0; +} + +.framework-header { + display: flex; + align-items: center; + gap: 16px; + flex-wrap: wrap; +} + +.framework-header > h3 { + padding-bottom: 0px; + font-size: 32px; +} + +.framework-links { + display: flex; + gap: 16px; + justify-content: flex-end; + + flex-grow: 1; +} + +.framework-links > a { + text-decoration: none; + font-size: 24px; +} + +/* RESULTS */ + +.limitations-list { + padding: 0px 32px 16px 32px; +} + +.limitations-list li { + padding: 8px 0px; +} + +.results { + display: flex; + gap: 40px; +} + +/* Accordions */ +.accordion { +} + +.acard { + box-shadow: 0 5px 10px rgba(0, 0, 0, 0.2); + border-radius: 10px; + background-color: white; +} + +.accordion > div { + padding: 8px 32px; +} + +.accordion > div:first-child { + padding-top: 16px; +} + +.accordion > label { + width: 100%; + display: flex; + justify-content: center; +} + +.accordion-input { + display: none; +} + +.accordion-icon { + height: 24px; +} + +.accordion-input:checked ~ .accordion-content { + display: block; +} + +.accordion-input:checked ~ label .accordion-chevron-down { + display: none; +} + +.accordion-input:checked ~ label .accordion-chevron-up { + display: block; +} + +.accordion-input ~ .accordion-content { + display: none; +} + +.accordion-input ~ label .accordion-chevron-up { + display: none; +} + +.accordion-input ~ label .accordion-chevron-down { + display: block; +} + +.accordion > label { + background: linear-gradient(0deg, rgba(0, 0, 0, 0.02), white); + border-radius: 10px; + padding-bottom: 8px; +} + +.accordion > label:hover { + cursor: pointer; +} + +/* Papers */ +.paper { + margin-bottom: 48px; +} + +.paper-title { + padding-bottom: 0px; + margin-bottom: 0px; +} + +.paper-links { + display: flex; + align-items: center; + gap: 12px; + + margin-bottom: 16px; +} + +.paper-links > a { + color: white; + background-color: #1c7ed6; + padding: 4px 12px; + border-radius: 4px; + + text-decoration: none; + text-transform: uppercase; +} + +.paper-year { + color: #343a40; + display: inline; + font-weight: bold; +} + +.paper-venue { + color: #495057; + display: inline; +} + +.paper-authors { + color: #4263eb; + padding-bottom: 8px; +} + +.paper-abstract { + color: #495057; + padding-bottom: 16px; +} + +.hover-expand { + display: flex; + gap: 20px; + + color: #343a40; + background-color: white; + border: 2px solid #495057; + border-radius: 4px; + padding: 2px 8px; +} + +.hover-expand > div { + display: none; +} + +.hover-expand:hover > div { + display: block; +} + +.hidden-content { + display: none; +} + +.hidden-content-checkbox { + display: none; +} + +.hidden-content-checkbox:checked ~ .hidden-content { + display: block; +} + +.bibtex { + font-family: "Inconsolata", sans-serif; + color: #343a40; + background-color: #e7f5ff; + border-radius: 10px; + padding: 16px; +} + +.flat-button { + cursor: pointer; +} + +.flat-button > label { + cursor: pointer; +} + +.mobile { + display: none; +} + +.nav-mobile { + justify-content: space-between; + display: none; +} + +@media (max-width: 900px) { + .desktop { + display: none; + } + + .mobile { + display: block; + padding: 25px; + } + + .shiny { + height: auto; + width: 100%; + } + + .results { + flex-wrap: wrap; + } + + .framework-header > h3 { + order: -1; + flex-basis: 100%; + } + + h1 { + font-size: 48px; + padding-bottom: 18px; + } + + .page-title { + margin-top: 32px; + } + + 
.page-content { + padding: 0 32px; + } + + .three-cols { + grid-template-columns: 1fr; + row-gap: 32px; + } + + .two-col { + grid-template-columns: 1fr; + row-gap: 32px; + } + + .nav-mobile { + display: flex; + } +} + +.mobile * { + color: white; +} + +.mobile > .summary > a { + font-size: 24px; +} diff --git a/frameworks/AutoGluon/README.md b/frameworks/AutoGluon/README.md index 51286533e..1b5c2dc65 100644 --- a/frameworks/AutoGluon/README.md +++ b/frameworks/AutoGluon/README.md @@ -1,16 +1,5 @@ # AutoGluon -To run v0.5.2: ```python3 ../automlbenchmark/runbenchmark.py autogluon ...``` +To run v0.8.2: ```python3 ../automlbenchmark/runbenchmark.py autogluon ...``` -To run mainline: ```python3 ../automlbenchmark/runbenchmark.py autogluonts:latest ...``` - - -# AutoGluonTS - -AutoGluonTS stands for autogluon.timeseries. This framework handles time series problems. - -## Run Steps - -To run v0.5.2: ```python3 ../automlbenchmark/runbenchmark.py autogluonts timeseries ...``` - -To run mainline: ```python3 ../automlbenchmark/runbenchmark.py autogluonts:latest timeseries ...``` +To run mainline: ```python3 ../automlbenchmark/runbenchmark.py autogluon:latest ...``` diff --git a/frameworks/AutoGluon/__init__.py b/frameworks/AutoGluon/__init__.py index 9d3d980a3..2d5734e33 100644 --- a/frameworks/AutoGluon/__init__.py +++ b/frameworks/AutoGluon/__init__.py @@ -10,11 +10,11 @@ def setup(*args, **kwargs): def run(dataset: Dataset, config: TaskConfig): - if dataset.type is not DatasetType.timeseries: + if dataset.type == DatasetType.timeseries: + return run_autogluon_timeseries(dataset, config) + else: return run_autogluon_tabular(dataset, config) - else: - return run_autogluon_timeseries(dataset, config) def run_autogluon_tabular(dataset: Dataset, config: TaskConfig): from frameworks.shared.caller import run_in_venv @@ -36,26 +36,18 @@ def run_autogluon_tabular(dataset: Dataset, config: TaskConfig): def run_autogluon_timeseries(dataset: Dataset, config: TaskConfig): from frameworks.shared.caller import run_in_venv dataset = deepcopy(dataset) - if not hasattr(dataset, 'timestamp_column'): - dataset.timestamp_column = None - if not hasattr(dataset, 'id_column'): - dataset.id_column = None - if not hasattr(dataset, 'forecast_range_in_steps'): - raise AttributeError("Unspecified `forecast_range_in_steps`.") data = dict( - # train=dict(path=dataset.train.data_path('parquet')), - # test=dict(path=dataset.test.data_path('parquet')), - train=dict(path=dataset.train.path), - test=dict(path=dataset.test.path), - target=dict( - name=dataset.target.name, - classes=dataset.target.values - ), - problem_type=dataset.type.name, # AutoGluon problem_type is using same names as amlb.data.DatasetType - timestamp_column=dataset.timestamp_column, + train_path=dataset.train.path, + test_path=dataset.test.path, + target=dataset.target.name, id_column=dataset.id_column, - forecast_range_in_steps=dataset.forecast_range_in_steps + timestamp_column=dataset.timestamp_column, + forecast_horizon_in_steps=dataset.forecast_horizon_in_steps, + freq=dataset.freq, + seasonality=dataset.seasonality, + repeated_abs_seasonal_error=dataset.repeated_abs_seasonal_error, + repeated_item_id=dataset.repeated_item_id, ) return run_in_venv(__file__, "exec_ts.py", diff --git a/frameworks/AutoGluon/exec_ts.py b/frameworks/AutoGluon/exec_ts.py index ab7c4110f..32fd34072 100644 --- a/frameworks/AutoGluon/exec_ts.py +++ b/frameworks/AutoGluon/exec_ts.py @@ -1,21 +1,20 @@ import logging +import numpy as np import os +import pandas as pd import shutil 
-import warnings import sys import tempfile -import numpy as np +import warnings warnings.simplefilter("ignore") if sys.platform == 'darwin': os.environ['OMP_NUM_THREADS'] = '1' -import pandas as pd - from autogluon.core.utils.savers import save_pd, save_pkl -from autogluon.tabular import TabularDataset from autogluon.timeseries import TimeSeriesPredictor, TimeSeriesDataFrame from autogluon.timeseries.version import __version__ +from joblib.externals.loky import get_reusable_executor from frameworks.shared.callee import call_run, result, output_subdir from frameworks.shared.utils import Timer, zip_path @@ -25,111 +24,69 @@ def run(dataset, config): log.info(f"\n**** AutoGluon TimeSeries [v{__version__}] ****\n") + prediction_length = dataset.forecast_horizon_in_steps - timestamp_column = dataset.timestamp_column - id_column = dataset.id_column - prediction_length = dataset.forecast_range_in_steps - - eval_metric = get_eval_metric(config) - label = dataset.target.name - time_limit = config.max_runtime_seconds - - training_params = {k: v for k, v in config.framework_params.items() if not k.startswith('_')} - - train_data, test_data = load_data(train_path=dataset.train.path, - test_path=dataset.test.path, - timestamp_column=timestamp_column, - id_column=id_column) - test_data_past = test_data.copy().slice_by_timestep(slice(None, -prediction_length)) + train_data = TimeSeriesDataFrame.from_path( + dataset.train_path, + id_column=dataset.id_column, + timestamp_column=dataset.timestamp_column, + ) predictor_path = tempfile.mkdtemp() + os.sep with Timer() as training: predictor = TimeSeriesPredictor( - target=label, + target=dataset.target, path=predictor_path, prediction_length=prediction_length, - eval_metric=eval_metric, + eval_metric=get_eval_metric(config), + eval_metric_seasonal_period=dataset.seasonality, + quantile_levels=config.quantile_levels, ) predictor.fit( train_data=train_data, - time_limit=time_limit, - **training_params, + time_limit=config.max_runtime_seconds, + **{k: v for k, v in config.framework_params.items() if not k.startswith('_')}, ) with Timer() as predict: - predictions = predictor.predict(test_data_past) - log.info(predictions) + predictions = pd.DataFrame(predictor.predict(train_data)) - predictions_only = predictions['mean'].values - test_data_future = test_data.copy().slice_by_timestep(slice(-prediction_length, None)) - truth_only = test_data_future[label].values + # Add columns necessary for the metric computation + quantile forecast to `optional_columns` + test_data_future = pd.read_csv(dataset.test_path, parse_dates=[dataset.timestamp_column]) + optional_columns = dict( + repeated_item_id=np.load(dataset.repeated_item_id), + repeated_abs_seasonal_error=np.load(dataset.repeated_abs_seasonal_error), + ) + for q in config.quantile_levels: + optional_columns[str(q)] = predictions[str(q)].values + + predictions_only = get_point_forecast(predictions, config.metric) + truth_only = test_data_future[dataset.target].values - log.info(predictions_only) - log.info(truth_only) + # Sanity check - make sure predictions are ordered correctly + future_index = pd.MultiIndex.from_frame(test_data_future[[dataset.id_column, dataset.timestamp_column]]) + assert predictions.index.equals(future_index), "Predictions and test data index do not match" - leaderboard = predictor.leaderboard(test_data, silent=True) + test_data_full = pd.concat([train_data, test_data_future.set_index([dataset.id_column, dataset.timestamp_column])]) + leaderboard = predictor.leaderboard(test_data_full, 
silent=True) with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.width', 1000): log.info(leaderboard) - num_models_trained = len(leaderboard) - save_artifacts(predictor=predictor, leaderboard=leaderboard, config=config) shutil.rmtree(predictor.path, ignore_errors=True) - quantiles = predictions.drop(columns=['mean']).reset_index(drop=True) - period_length = 1 # TODO: This period length could be adapted to the Dataset, but then we need to pass this information as well. As of now this works. - - # we aim to calculate the mean period error from the past for each sequence: 1/N sum_{i=1}^N |x(t_i) - x(t_i - T)| - # 1. retrieve item_ids for each sequence/item - #dataset..X /. y - item_ids, inverse_item_ids = np.unique(test_data.reset_index()["item_id"].squeeze().to_numpy(), return_index=False, return_inverse=True) - # 2. capture sequences in a list - y_past = [test_data[label].squeeze().to_numpy()[inverse_item_ids == i][:-prediction_length] for i in range(len(item_ids))] - # 3. calculate period error per sequence - y_past_period_error = [np.abs(y_past_item[period_length:] - y_past_item[:-period_length]).mean() for y_past_item in y_past] - # 4. repeat period error for each sequence, to save one for each element - y_past_period_error_rep = np.repeat(y_past_period_error, prediction_length) - - optional_columns = quantiles - optional_columns = optional_columns.assign(y_past_period_error=y_past_period_error_rep) + # Kill child processes spawned by Joblib to avoid spam in the AMLB log + get_reusable_executor().shutdown(wait=True) return result(output_file=config.output_predictions_file, predictions=predictions_only, truth=truth_only, - probabilities=None, - probabilities_labels=None, target_is_encoded=False, - models_count=num_models_trained, + models_count=len(leaderboard), training_duration=training.duration, predict_duration=predict.duration, - optional_columns=optional_columns) - -def load_data(train_path, test_path, timestamp_column, id_column): - - train_df = pd.read_csv( - train_path, - parse_dates=[timestamp_column], - ) - - train_data = TimeSeriesDataFrame.from_data_frame( - train_df, - id_column=id_column, - timestamp_column=timestamp_column, - ) - - test_df = pd.read_csv( - test_path, - parse_dates=[timestamp_column], - ) - - test_data = TimeSeriesDataFrame.from_data_frame( - test_df, - id_column=id_column, - timestamp_column=timestamp_column, - ) - - return train_data, test_data + optional_columns=pd.DataFrame(optional_columns)) def get_eval_metric(config): @@ -148,6 +105,16 @@ def get_eval_metric(config): return eval_metric +def get_point_forecast(predictions, metric): + # Return median for metrics optimized by median, if possible + if metric.lower() in ["rmse", "mse"] or "0.5" not in predictions.columns: + log.info("Using mean as point forecast") + return predictions["mean"].values + else: + log.info("Using median as point forecast") + return predictions["0.5"].values + + def save_artifacts(predictor, leaderboard, config): artifacts = config.framework_params.get('_save_artifacts', ['leaderboard']) try: diff --git a/frameworks/FEDOT/__init__.py b/frameworks/FEDOT/__init__.py new file mode 100644 index 000000000..86e68de98 --- /dev/null +++ b/frameworks/FEDOT/__init__.py @@ -0,0 +1,25 @@ +from amlb.benchmark import TaskConfig +from amlb.data import Dataset +from amlb.utils import call_script_in_same_dir + + +def setup(*args, **kwargs): + call_script_in_same_dir(__file__, "setup.sh", *args, **kwargs) + + +def run(dataset: Dataset, config: TaskConfig): + 
from frameworks.shared.caller import run_in_venv + + data = dict( + train=dict( + X=dataset.train.X, + y=dataset.train.y + ), + test=dict( + X=dataset.test.X, + y=dataset.test.y + ) + ) + + return run_in_venv(__file__, "exec.py", + input_data=data, dataset=dataset, config=config) diff --git a/frameworks/FEDOT/exec.py b/frameworks/FEDOT/exec.py new file mode 100644 index 000000000..b57448949 --- /dev/null +++ b/frameworks/FEDOT/exec.py @@ -0,0 +1,99 @@ +import logging +import os +from pathlib import Path + +from fedot.api.main import Fedot + +from frameworks.shared.callee import call_run, result, output_subdir +from frameworks.shared.utils import Timer + +log = logging.getLogger(__name__) + + +def run(dataset, config): + log.info("\n**** FEDOT ****\n") + + is_classification = config.type == 'classification' + # Mapping of benchmark metrics to FEDOT metrics + metrics_mapping = dict( + acc='acc', + auc='roc_auc', + f1='f1', + logloss='logloss', + mae='mae', + mse='mse', + msle='msle', + r2='r2', + rmse='rmse' + ) + scoring_metric = metrics_mapping.get(config.metric, None) + + if scoring_metric is None: + log.warning("Performance metric %s not supported.", config.metric) + + training_params = {"preset": "best_quality", "n_jobs": config.cores} + training_params |= {k: v for k, v in config.framework_params.items() if not k.startswith('_')} + n_jobs = training_params["n_jobs"] + + log.info('Running FEDOT with a maximum time of %ss on %s cores, optimizing %s.', + config.max_runtime_seconds, n_jobs, scoring_metric) + runtime_min = config.max_runtime_seconds / 60 + + fedot = Fedot(problem=config.type, timeout=runtime_min, metric=scoring_metric, seed=config.seed, + max_pipeline_fit_time=runtime_min / 10, **training_params) + + with Timer() as training: + fedot.fit(features=dataset.train.X, target=dataset.train.y) + + log.info('Predicting on the test set.') + with Timer() as predict: + predictions = fedot.predict(features=dataset.test.X) + probabilities = None + if is_classification: + probabilities = fedot.predict_proba(features=dataset.test.X, probs_for_all_classes=True) + + save_artifacts(fedot, config) + + return result(output_file=config.output_predictions_file, + predictions=predictions, + truth=dataset.test.y, + probabilities=probabilities, + target_is_encoded=False, + models_count=fedot.current_pipeline.length, + training_duration=training.duration, + predict_duration=predict.duration) + + +def save_artifacts(automl, config): + + artifacts = config.framework_params.get('_save_artifacts', []) + if 'models' in artifacts: + try: + models_dir = output_subdir('models', config) + models_file = os.path.join(models_dir, 'model.json') + automl.current_pipeline.save(models_file) + except Exception as e: + log.info(f"Error when saving 'models': {e}.", exc_info=True) + + if 'info' in artifacts: + try: + info_dir = output_subdir("info", config) + if automl.history: + automl.history.save(os.path.join(info_dir, 'history.json')) + else: + log.info(f"There is no optimization history info to save.") + except Exception as e: + log.info(f"Error when saving info about optimisation history: {e}.", exc_info=True) + + if 'leaderboard' in artifacts: + try: + leaderboard_dir = output_subdir("leaderboard", config) + if automl.history: + lb = automl.history.get_leaderboard() + Path(os.path.join(leaderboard_dir, "leaderboard.csv")).write_text(lb) + except Exception as e: + log.info(f"Error when saving 'leaderboard': {e}.", exc_info=True) + + +if __name__ == '__main__': + call_run(run) diff --git 
a/frameworks/FEDOT/setup.sh b/frameworks/FEDOT/setup.sh new file mode 100644 index 000000000..a89781583 --- /dev/null +++ b/frameworks/FEDOT/setup.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash +HERE=$(dirname "$0") +VERSION=${1:-"stable"} +REPO=${2:-"https://github.com/aimclub/FEDOT.git"} +PKG=${3:-"fedot"} +if [[ "$VERSION" == "latest" ]]; then + VERSION="master" +fi + +# creating local venv +. ${HERE}/../shared/setup.sh ${HERE} true + +RAWREPO=$(echo ${REPO} | sed "s/github\.com/raw\.githubusercontent\.com/") +if [[ "$VERSION" == "stable" ]]; then + PIP install --no-cache-dir -U ${PKG} + echo GET_VERSION_STABLE + VERSION=$(PY -c "${GET_VERSION_STABLE}") +elif [[ "$VERSION" =~ ^[0-9] ]]; then + PIP install --no-cache-dir -U ${PKG}==${VERSION} +else + TARGET_DIR="${HERE}/lib/${PKG}" + rm -Rf ${TARGET_DIR} + + if [[ "$VERSION" =~ ^# ]]; then + COMMIT="${VERSION:1}" + else + # find the latest commit to the VERSION branch + COMMIT=$(git ls-remote "${REPO}" | grep "refs/heads/${VERSION}" | cut -f 1) + DEPTH="--depth 1 --branch ${VERSION}" + fi + + git clone --recurse-submodules --shallow-submodules ${DEPTH} ${REPO} ${TARGET_DIR} + cd ${TARGET_DIR} + git checkout "${COMMIT}" + git submodule update --init --recursive + cd ${HERE} + PIP install -U -e ${TARGET_DIR} +fi + +installed="${HERE}/.setup/installed" +PY -c "from fedot import __version__; print(__version__)" >> "$installed" +if [[ -n $COMMIT ]]; then + truncate -s-1 "$installed" + echo "#${COMMIT}" >> "$installed" +fi diff --git a/frameworks/NaiveAutoML/__init__.py b/frameworks/NaiveAutoML/__init__.py index 32a48deed..889396d4c 100644 --- a/frameworks/NaiveAutoML/__init__.py +++ b/frameworks/NaiveAutoML/__init__.py @@ -1,6 +1,6 @@ from amlb.benchmark import TaskConfig from amlb.data import Dataset -from amlb.utils import call_script_in_same_dir, unsparsify +from amlb.utils import call_script_in_same_dir def setup(*args, **kwargs): @@ -14,11 +14,11 @@ def run(dataset: Dataset, config: TaskConfig): target=dataset.target.name, train=dict( X=dataset.train.X, - y=unsparsify(dataset.train.y_enc), + y=dataset.train.y_enc, ), test=dict( X=dataset.test.X, - y=unsparsify(dataset.test.y_enc), + y=dataset.test.y_enc, ), ) if config.measure_inference_time: diff --git a/frameworks/NaiveAutoML/exec.py b/frameworks/NaiveAutoML/exec.py index 4f0c00050..aa9072156 100644 --- a/frameworks/NaiveAutoML/exec.py +++ b/frameworks/NaiveAutoML/exec.py @@ -45,11 +45,9 @@ def run(dataset, config): if scoring_metric is None: raise ValueError(f"Performance metric {config.metric} not supported.") - is_classification = (config.type == 'classification') kwargs = dict( scoring=scoring_metric, num_cpus=config.cores, - task_type=config.type, ) # NAML wasn't really designed to run for long time constraints, so we # make it easy to run NAML with its default configuration for time/iterations. 
@@ -65,13 +63,14 @@ def run(dataset, config): log.info("`_use_default_time_and_iterations` is set, ignoring time constraint.") kwargs |= {k: v for k, v in config.framework_params.items() if not k.startswith("_")} - log.info(f"Initializing NaiveAutoml(**{kwargs})") automl = NaiveAutoML(**kwargs) with Timer() as training: automl.fit(dataset.train.X, dataset.train.y) log.info(f"Finished fit in {training.duration}s.") + is_classification = (config.type == 'classification') + def infer(data: Union[str, pd.DataFrame]): test_data = pd.read_parquet(data) if isinstance(data, str) else data predict_fn = automl.predict_proba if is_classification else automl.predict diff --git a/frameworks/autosklearn/exec.py b/frameworks/autosklearn/exec.py index e2e7023dc..dda275f2a 100644 --- a/frameworks/autosklearn/exec.py +++ b/frameworks/autosklearn/exec.py @@ -8,7 +8,6 @@ from typing import Union import pandas as pd -import pandas.api.types from numpy.random import default_rng os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir() @@ -69,10 +68,7 @@ def run(dataset, config): ) log.info("Environment: %s", os.environ) - def is_sparse(data: pd.DataFrame) -> bool: - return any(pd.api.types.is_sparse(data[column]) for column in data) - - use_pandas = (askl_version >= version.parse("0.15")) and not is_sparse(dataset.train.X) + use_pandas = askl_version >= version.parse("0.15") X_train = dataset.train.X if use_pandas else dataset.train.X_enc y_train = dataset.train.y if use_pandas else dataset.train.y_enc predictors_type = dataset.predictors_type diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 000000000..5f2dd0f6c --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,51 @@ +site_name: AutoML Benchmark +theme: + name: material + features: + - navigation.expand + - navigation.indexes + - content.tabs.link + - content.code.annotate + icon: + logo: material/home + admonition: + windows: fontawesome/brands/windows +extra: + homepage: WEBSITE/index.html + +nav: + - index.md + - getting_started.md + - Using the Benchmark: + - Parameters: using/parameters.md + - Configuration: using/configuration.md + - using/aws.md + - using/result_analysis.md + - Extending the Benchmark: + - extending/index.md + - extending/benchmark.md + - extending/constraint.md + - Frameworks: extending/framework.md + - FAQ: faq.md + +extra_css: + - stylesheets/extra.css + +markdown_extensions: + - def_list + - admonition + - toc + - attr_list + - pymdownx.details + - pymdownx.superfences + - pymdownx.snippets + - pymdownx.inlinehilite + - pymdownx.highlight: + anchor_linenums: true + line_spans: __span + pygments_lang_class: true + - pymdownx.tabbed: + alternate_style: true + - pymdownx.emoji: + emoji_index: !!python/name:materialx.emoji.twemoji + emoji_generator: !!python/name:materialx.emoji.to_svg \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..78d101b38 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,37 @@ +[tool.mypy] +files=[ + "amlb/**/*.py" +] +python_version = "3.9" +# Required because the normal usage pattern of namespaces raises [attr-defined] errors. +# I can't a way to disable [attr-defined] errors for `Namespace` only. 
+disable_error_code = "attr-defined" + +[[tool.mypy.overrides]] +ignore_errors = false +module = "amlb.utils.*" + + +[[tool.mypy.overrides]] +ignore_errors = true +module = "amlb.benchmarks.*" + + +[[tool.mypy.overrides]] +ignore_errors = true +module = "amlb.datasets.*" + + +[[tool.mypy.overrides]] +ignore_errors = true +module = "amlb.frameworks.*" + + +[[tool.mypy.overrides]] +ignore_errors = true +module = "amlb.runners.*" + + +[[tool.mypy.overrides]] +ignore_errors = true +module = "amlb.*" \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt index 4ff2a0f94..ab42f00ce 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,3 +1,7 @@ pytest pytest-mock -pip-tools \ No newline at end of file +pip-tools + +types-psutil +pandas-stubs +mypy diff --git a/resources/benchmarks/timeseries.yaml b/resources/benchmarks/timeseries.yaml index 26af06497..586cf738a 100644 --- a/resources/benchmarks/timeseries.yaml +++ b/resources/benchmarks/timeseries.yaml @@ -1,13 +1,16 @@ --- -- name: covid +- name: m4_hourly dataset: - train: https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/train.csv - test: https://autogluon.s3-us-west-2.amazonaws.com/datasets/CovidTimeSeries/test.csv - target: ConfirmedCases + path: https://autogluon.s3.amazonaws.com/datasets/timeseries/m4_hourly/test.csv type: timeseries - forecast_range_in_steps: 19 - id_column: name - timestamp_column: Date + freq: H + forecast_horizon_in_steps: 48 + seasonality: 24 + target: target + id_column: item_id + timestamp_column: timestamp + metric: [mase, smape, mape, rmse, mql, wql, sql] + quantile_levels: [0.05, 0.5, 0.95] - folds: 1 + folds: 2 diff --git a/resources/config.yaml b/resources/config.yaml index faae3657e..a3b7809b8 100644 --- a/resources/config.yaml +++ b/resources/config.yaml @@ -54,13 +54,15 @@ benchmarks: # configuration namespace for the benchmarks def binary: ['auc', 'logloss', 'acc', 'balacc'] # available metrics: auc (AUC), acc (Accuracy), balacc (Balanced Accuracy), pr_auc (Precision Recall AUC), logloss (Log Loss), f1, f2, f05 (F-beta scores with beta=1, 2, or 0.5), max_pce, mean_pce (Max/Mean Per-Class Error). multiclass: ['logloss', 'acc', 'balacc'] # available metrics: same as for binary, except auc, replaced by auc_ovo (AUC One-vs-One), auc_ovr (AUC One-vs-Rest). AUC metrics and F-beta metrics are computed with weighted average. regression: ['rmse', 'r2', 'mae'] # available metrics: mae (Mean Absolute Error), mse (Mean Squared Error), msle (Mean Squared Logarithmic Error), rmse (Root Mean Square Error), rmsle (Root Mean Square Logarithmic Error), r2 (R^2). - timeseries: ['mase', 'mape', 'smape', 'rmse', 'mse', 'nrmse', 'wape', 'ncrps'] + timeseries: ['mase', 'mape', 'smape', 'wape', 'rmse', 'mse', 'mql', 'wql', 'sql'] # available metrics: mase (Mean Absolute Scaled Error), mape (Mean Absolute Percentage Error), smape (Symmetric Mean Absolute Percentage Error), wape (Weighted Absolute Percentage Error), rmse (Root Mean Square Error), mse (Mean Square Error), mql (Mean Quantile Loss), wql (Weighted Quantile Loss), sql (Scaled Quantile Loss). + defaults: # the default constraints, usually overridden by a constraint. folds: 10 # the amount of fold-runs executed for each dataset. max_runtime_seconds: 3600 # default time allocated to the framework to train a model. cores: -1 # default amount of cores used for each automl task. If <= 0, will try to use all cores. max_mem_size_mb: -1 # default amount of memory assigned to each automl task. 
If <= 0, then the amount of memory is computed from os available memory. min_vol_size_mb: -1 # default minimum amount of free space required on the volume. If <= 0, skips verification. + quantile_levels: [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] # default quantile_levels for timeseries problem type job_scheduler: # configuration namespace exit_on_job_failure: # if true, the entire run will be aborted on the first job failure (mainly used for testing) : set by caller (runbenchmark.py) diff --git a/resources/frameworks.yaml b/resources/frameworks.yaml index dbaa0d1ac..da2881ce0 100644 --- a/resources/frameworks.yaml +++ b/resources/frameworks.yaml @@ -203,17 +203,15 @@ TPOT: # population_size: 25 # verbosity: 2 -#################################### -### TimeSeries AutoML frameworks ### -#################################### - -AutoGluonTS: - extends: AutoGluon - version: "stable" +FEDOT: + version: 'master' description: | - AutoGluon-TimeSeries - setup_env: - MODULE: timeseries + FEDOT is a AutoML tool that optimizes composite machine learning pipelines using evolutionary optimisation. + project: https://github.com/aimclub/FEDOT + refs: + - https://doi.org/10.1016/j.future.2021.08.022 +# params: +# _save_artifacts: ['leaderboard', 'models', 'info'] ####################################### ### Non AutoML reference frameworks ### diff --git a/resources/frameworks_2023Q2.yaml b/resources/frameworks_2023Q2.yaml index eab8170cc..1ac098b6f 100644 --- a/resources/frameworks_2023Q2.yaml +++ b/resources/frameworks_2023Q2.yaml @@ -97,8 +97,7 @@ mlr3automl: project: https://github.com/a-hanf/mlr3automl NaiveAutoML: - repo: https://github.com/pgijsbers/naiveautoml - version: '#1af07c22b64510df7700798f4c0f32c3ca76ab93' + version: '0.0.27' TPOT: version: '0.12.0' diff --git a/resources/frameworks_latest.yaml b/resources/frameworks_latest.yaml index 7a5ca78a5..44f924232 100644 --- a/resources/frameworks_latest.yaml +++ b/resources/frameworks_latest.yaml @@ -86,17 +86,8 @@ oboe: TPOT: version: 'latest' -#################################### -### TimeSeries AutoML frameworks ### -#################################### - -AutoGluonTS: - extends: AutoGluon - version: "latest" - description: | - AutoGluon-TimeSeries - setup_env: - MODULE: timeseries +FEDOT: + version: 'latest' ####################################### ### Non AutoML reference frameworks ### diff --git a/resources/frameworks_stable.yaml b/resources/frameworks_stable.yaml index 3de7da369..d6b5a1ce0 100644 --- a/resources/frameworks_stable.yaml +++ b/resources/frameworks_stable.yaml @@ -91,8 +91,8 @@ oboe: TPOT: version: 'stable' - - +FEDOT: + version: 'stable' ####################################### ### Non AutoML reference frameworks ### diff --git a/tests/unit/amlb/datasets/file/resources/m4_hourly_subset.csv b/tests/unit/amlb/datasets/file/resources/m4_hourly_subset.csv new file mode 100644 index 000000000..7ee20e07d --- /dev/null +++ b/tests/unit/amlb/datasets/file/resources/m4_hourly_subset.csv @@ -0,0 +1,301 @@ +item_id,timestamp,target +T1,2015-07-01 12:00:00,605.0 +T1,2015-07-01 13:00:00,586.0 +T1,2015-07-01 14:00:00,586.0 +T1,2015-07-01 15:00:00,559.0 +T1,2015-07-01 16:00:00,511.0 +T1,2015-07-01 17:00:00,443.0 +T1,2015-07-01 18:00:00,422.0 +T1,2015-07-01 19:00:00,395.0 +T1,2015-07-01 20:00:00,382.0 +T1,2015-07-01 21:00:00,370.0 +T1,2015-07-01 22:00:00,383.0 +T1,2015-07-01 23:00:00,397.0 +T1,2015-07-02 00:00:00,420.0 +T1,2015-07-02 01:00:00,455.0 +T1,2015-07-02 02:00:00,493.0 +T1,2015-07-02 03:00:00,554.0 +T1,2015-07-02 04:00:00,610.0 
+T1,2015-07-02 05:00:00,666.0 +T1,2015-07-02 06:00:00,715.0 +T1,2015-07-02 07:00:00,755.0 +T1,2015-07-02 08:00:00,778.0 +T1,2015-07-02 09:00:00,794.0 +T1,2015-07-02 10:00:00,806.0 +T1,2015-07-02 11:00:00,808.0 +T1,2015-07-02 12:00:00,776.0 +T1,2015-07-02 13:00:00,723.0 +T1,2015-07-02 14:00:00,709.0 +T1,2015-07-02 15:00:00,660.0 +T1,2015-07-02 16:00:00,585.0 +T1,2015-07-02 17:00:00,527.0 +T1,2015-07-02 18:00:00,462.0 +T1,2015-07-02 19:00:00,437.0 +T1,2015-07-02 20:00:00,413.0 +T1,2015-07-02 21:00:00,407.0 +T1,2015-07-02 22:00:00,404.0 +T1,2015-07-02 23:00:00,420.0 +T1,2015-07-03 00:00:00,441.0 +T1,2015-07-03 01:00:00,471.0 +T1,2015-07-03 02:00:00,526.0 +T1,2015-07-03 03:00:00,571.0 +T1,2015-07-03 04:00:00,612.0 +T1,2015-07-03 05:00:00,635.0 +T1,2015-07-03 06:00:00,613.0 +T1,2015-07-03 07:00:00,608.0 +T1,2015-07-03 08:00:00,614.0 +T1,2015-07-03 09:00:00,637.0 +T1,2015-07-03 10:00:00,669.0 +T1,2015-07-03 11:00:00,683.0 +T1,2015-07-03 12:00:00,687.0 +T1,2015-07-03 13:00:00,660.0 +T1,2015-07-03 14:00:00,661.0 +T1,2015-07-03 15:00:00,632.0 +T1,2015-07-03 16:00:00,573.0 +T1,2015-07-03 17:00:00,521.0 +T1,2015-07-03 18:00:00,481.0 +T1,2015-07-03 19:00:00,452.0 +T1,2015-07-03 20:00:00,447.0 +T1,2015-07-03 21:00:00,425.0 +T1,2015-07-03 22:00:00,427.0 +T1,2015-07-03 23:00:00,441.0 +T1,2015-07-04 00:00:00,438.0 +T1,2015-07-04 01:00:00,472.0 +T1,2015-07-04 02:00:00,528.0 +T1,2015-07-04 03:00:00,596.0 +T1,2015-07-04 04:00:00,661.0 +T1,2015-07-04 05:00:00,708.0 +T1,2015-07-04 06:00:00,754.0 +T1,2015-07-04 07:00:00,781.0 +T1,2015-07-04 08:00:00,808.0 +T1,2015-07-04 09:00:00,819.0 +T1,2015-07-04 10:00:00,820.0 +T1,2015-07-04 11:00:00,801.0 +T1,2015-07-04 12:00:00,770.0 +T1,2015-07-04 13:00:00,717.0 +T1,2015-07-04 14:00:00,697.0 +T1,2015-07-04 15:00:00,655.0 +T1,2015-07-04 16:00:00,607.0 +T1,2015-07-04 17:00:00,552.0 +T1,2015-07-04 18:00:00,512.0 +T1,2015-07-04 19:00:00,475.0 +T1,2015-07-04 20:00:00,452.0 +T1,2015-07-04 21:00:00,436.0 +T1,2015-07-04 22:00:00,429.0 +T1,2015-07-04 23:00:00,433.0 +T1,2015-07-05 00:00:00,430.0 +T1,2015-07-05 01:00:00,472.0 +T1,2015-07-05 02:00:00,536.0 +T1,2015-07-05 03:00:00,611.0 +T1,2015-07-05 04:00:00,662.0 +T1,2015-07-05 05:00:00,705.0 +T1,2015-07-05 06:00:00,707.0 +T1,2015-07-05 07:00:00,718.0 +T1,2015-07-05 08:00:00,733.0 +T1,2015-07-05 09:00:00,741.0 +T1,2015-07-05 10:00:00,737.0 +T1,2015-07-05 11:00:00,710.0 +T1,2015-07-05 12:00:00,647.0 +T1,2015-07-05 13:00:00,593.0 +T1,2015-07-05 14:00:00,564.0 +T1,2015-07-05 15:00:00,528.0 +T2,2015-07-01 12:00:00,3124.0 +T2,2015-07-01 13:00:00,2990.0 +T2,2015-07-01 14:00:00,2862.0 +T2,2015-07-01 15:00:00,2809.0 +T2,2015-07-01 16:00:00,2544.0 +T2,2015-07-01 17:00:00,2201.0 +T2,2015-07-01 18:00:00,1996.0 +T2,2015-07-01 19:00:00,1861.0 +T2,2015-07-01 20:00:00,1735.0 +T2,2015-07-01 21:00:00,1713.0 +T2,2015-07-01 22:00:00,1724.0 +T2,2015-07-01 23:00:00,1798.0 +T2,2015-07-02 00:00:00,1891.0 +T2,2015-07-02 01:00:00,2037.0 +T2,2015-07-02 02:00:00,2102.0 +T2,2015-07-02 03:00:00,2163.0 +T2,2015-07-02 04:00:00,2269.0 +T2,2015-07-02 05:00:00,2404.0 +T2,2015-07-02 06:00:00,2515.0 +T2,2015-07-02 07:00:00,2621.0 +T2,2015-07-02 08:00:00,2745.0 +T2,2015-07-02 09:00:00,2816.0 +T2,2015-07-02 10:00:00,2938.0 +T2,2015-07-02 11:00:00,3022.0 +T2,2015-07-02 12:00:00,2976.0 +T2,2015-07-02 13:00:00,2892.0 +T2,2015-07-02 14:00:00,2784.0 +T2,2015-07-02 15:00:00,2725.0 +T2,2015-07-02 16:00:00,2530.0 +T2,2015-07-02 17:00:00,2211.0 +T2,2015-07-02 18:00:00,1995.0 +T2,2015-07-02 19:00:00,1833.0 +T2,2015-07-02 20:00:00,1768.0 +T2,2015-07-02 21:00:00,1712.0 
+T2,2015-07-02 22:00:00,1707.0 +T2,2015-07-02 23:00:00,1762.0 +T2,2015-07-03 00:00:00,1880.0 +T2,2015-07-03 01:00:00,1995.0 +T2,2015-07-03 02:00:00,2134.0 +T2,2015-07-03 03:00:00,2227.0 +T2,2015-07-03 04:00:00,2376.0 +T2,2015-07-03 05:00:00,2477.0 +T2,2015-07-03 06:00:00,2597.0 +T2,2015-07-03 07:00:00,2691.0 +T2,2015-07-03 08:00:00,2751.0 +T2,2015-07-03 09:00:00,2782.0 +T2,2015-07-03 10:00:00,2810.0 +T2,2015-07-03 11:00:00,2781.0 +T2,2015-07-03 12:00:00,2693.0 +T2,2015-07-03 13:00:00,2567.0 +T2,2015-07-03 14:00:00,2490.0 +T2,2015-07-03 15:00:00,2448.0 +T2,2015-07-03 16:00:00,2277.0 +T2,2015-07-03 17:00:00,1997.0 +T2,2015-07-03 18:00:00,1785.0 +T2,2015-07-03 19:00:00,1689.0 +T2,2015-07-03 20:00:00,1562.0 +T2,2015-07-03 21:00:00,1560.0 +T2,2015-07-03 22:00:00,1505.0 +T2,2015-07-03 23:00:00,1538.0 +T2,2015-07-04 00:00:00,1641.0 +T2,2015-07-04 01:00:00,1735.0 +T2,2015-07-04 02:00:00,1950.0 +T2,2015-07-04 03:00:00,2138.0 +T2,2015-07-04 04:00:00,2303.0 +T2,2015-07-04 05:00:00,2432.0 +T2,2015-07-04 06:00:00,2528.0 +T2,2015-07-04 07:00:00,2656.0 +T2,2015-07-04 08:00:00,2740.0 +T2,2015-07-04 09:00:00,2803.0 +T2,2015-07-04 10:00:00,2855.0 +T2,2015-07-04 11:00:00,2880.0 +T2,2015-07-04 12:00:00,2778.0 +T2,2015-07-04 13:00:00,2637.0 +T2,2015-07-04 14:00:00,2479.0 +T2,2015-07-04 15:00:00,2381.0 +T2,2015-07-04 16:00:00,2228.0 +T2,2015-07-04 17:00:00,2037.0 +T2,2015-07-04 18:00:00,1758.0 +T2,2015-07-04 19:00:00,1648.0 +T2,2015-07-04 20:00:00,1560.0 +T2,2015-07-04 21:00:00,1508.0 +T2,2015-07-04 22:00:00,1486.0 +T2,2015-07-04 23:00:00,1486.0 +T2,2015-07-05 00:00:00,1515.0 +T2,2015-07-05 01:00:00,1623.0 +T2,2015-07-05 02:00:00,1919.0 +T2,2015-07-05 03:00:00,2172.0 +T2,2015-07-05 04:00:00,2416.0 +T2,2015-07-05 05:00:00,2605.0 +T2,2015-07-05 06:00:00,2755.0 +T2,2015-07-05 07:00:00,2822.0 +T2,2015-07-05 08:00:00,2917.0 +T2,2015-07-05 09:00:00,2997.0 +T2,2015-07-05 10:00:00,3060.0 +T2,2015-07-05 11:00:00,3046.0 +T2,2015-07-05 12:00:00,2942.0 +T2,2015-07-05 13:00:00,2758.0 +T2,2015-07-05 14:00:00,2487.0 +T2,2015-07-05 15:00:00,2349.0 +T3,2015-07-01 12:00:00,1828.0 +T3,2015-07-01 13:00:00,1806.0 +T3,2015-07-01 14:00:00,1897.0 +T3,2015-07-01 15:00:00,1750.0 +T3,2015-07-01 16:00:00,1679.0 +T3,2015-07-01 17:00:00,1620.0 +T3,2015-07-01 18:00:00,1463.0 +T3,2015-07-01 19:00:00,1342.0 +T3,2015-07-01 20:00:00,1192.0 +T3,2015-07-01 21:00:00,1108.0 +T3,2015-07-01 22:00:00,1058.0 +T3,2015-07-01 23:00:00,1024.0 +T3,2015-07-02 00:00:00,1031.0 +T3,2015-07-02 01:00:00,1091.0 +T3,2015-07-02 02:00:00,1208.0 +T3,2015-07-02 03:00:00,1337.0 +T3,2015-07-02 04:00:00,1435.0 +T3,2015-07-02 05:00:00,1515.0 +T3,2015-07-02 06:00:00,1593.0 +T3,2015-07-02 07:00:00,1667.0 +T3,2015-07-02 08:00:00,1753.0 +T3,2015-07-02 09:00:00,1768.0 +T3,2015-07-02 10:00:00,1823.0 +T3,2015-07-02 11:00:00,1813.0 +T3,2015-07-02 12:00:00,1842.0 +T3,2015-07-02 13:00:00,1838.0 +T3,2015-07-02 14:00:00,1800.0 +T3,2015-07-02 15:00:00,1761.0 +T3,2015-07-02 16:00:00,1670.0 +T3,2015-07-02 17:00:00,1609.0 +T3,2015-07-02 18:00:00,1467.0 +T3,2015-07-02 19:00:00,1309.0 +T3,2015-07-02 20:00:00,1189.0 +T3,2015-07-02 21:00:00,1102.0 +T3,2015-07-02 22:00:00,1054.0 +T3,2015-07-02 23:00:00,1017.0 +T3,2015-07-03 00:00:00,1014.0 +T3,2015-07-03 01:00:00,1063.0 +T3,2015-07-03 02:00:00,1187.0 +T3,2015-07-03 03:00:00,1314.0 +T3,2015-07-03 04:00:00,1424.0 +T3,2015-07-03 05:00:00,1497.0 +T3,2015-07-03 06:00:00,1586.0 +T3,2015-07-03 07:00:00,1659.0 +T3,2015-07-03 08:00:00,1722.0 +T3,2015-07-03 09:00:00,1781.0 +T3,2015-07-03 10:00:00,1805.0 +T3,2015-07-03 11:00:00,1831.0 +T3,2015-07-03 
12:00:00,1851.0 +T3,2015-07-03 13:00:00,1831.0 +T3,2015-07-03 14:00:00,1809.0 +T3,2015-07-03 15:00:00,1755.0 +T3,2015-07-03 16:00:00,1685.0 +T3,2015-07-03 17:00:00,1618.0 +T3,2015-07-03 18:00:00,1487.0 +T3,2015-07-03 19:00:00,1311.0 +T3,2015-07-03 20:00:00,1180.0 +T3,2015-07-03 21:00:00,1087.0 +T3,2015-07-03 22:00:00,1033.0 +T3,2015-07-03 23:00:00,1002.0 +T3,2015-07-04 00:00:00,991.0 +T3,2015-07-04 01:00:00,1005.0 +T3,2015-07-04 02:00:00,1071.0 +T3,2015-07-04 03:00:00,1191.0 +T3,2015-07-04 04:00:00,1307.0 +T3,2015-07-04 05:00:00,1407.0 +T3,2015-07-04 06:00:00,1495.0 +T3,2015-07-04 07:00:00,1576.0 +T3,2015-07-04 08:00:00,1635.0 +T3,2015-07-04 09:00:00,1688.0 +T3,2015-07-04 10:00:00,1711.0 +T3,2015-07-04 11:00:00,1741.0 +T3,2015-07-04 12:00:00,1768.0 +T3,2015-07-04 13:00:00,1765.0 +T3,2015-07-04 14:00:00,1738.0 +T3,2015-07-04 15:00:00,1684.0 +T3,2015-07-04 16:00:00,1605.0 +T3,2015-07-04 17:00:00,1553.0 +T3,2015-07-04 18:00:00,1433.0 +T3,2015-07-04 19:00:00,1297.0 +T3,2015-07-04 20:00:00,1177.0 +T3,2015-07-04 21:00:00,1082.0 +T3,2015-07-04 22:00:00,1028.0 +T3,2015-07-04 23:00:00,987.0 +T3,2015-07-05 00:00:00,970.0 +T3,2015-07-05 01:00:00,959.0 +T3,2015-07-05 02:00:00,993.0 +T3,2015-07-05 03:00:00,1083.0 +T3,2015-07-05 04:00:00,1215.0 +T3,2015-07-05 05:00:00,1310.0 +T3,2015-07-05 06:00:00,1415.0 +T3,2015-07-05 07:00:00,1479.0 +T3,2015-07-05 08:00:00,1525.0 +T3,2015-07-05 09:00:00,1599.0 +T3,2015-07-05 10:00:00,1623.0 +T3,2015-07-05 11:00:00,1652.0 +T3,2015-07-05 12:00:00,1671.0 +T3,2015-07-05 13:00:00,1664.0 +T3,2015-07-05 14:00:00,1637.0 +T3,2015-07-05 15:00:00,1574.0 diff --git a/tests/unit/amlb/datasets/file/resources/m4_hourly_subset_nondefault_cols.csv b/tests/unit/amlb/datasets/file/resources/m4_hourly_subset_nondefault_cols.csv new file mode 100644 index 000000000..bc2b6c24f --- /dev/null +++ b/tests/unit/amlb/datasets/file/resources/m4_hourly_subset_nondefault_cols.csv @@ -0,0 +1,301 @@ +CustomId,CustomTimestamp,CustomTarget +T1,2015-07-01 12:00:00,605.0 +T1,2015-07-01 13:00:00,586.0 +T1,2015-07-01 14:00:00,586.0 +T1,2015-07-01 15:00:00,559.0 +T1,2015-07-01 16:00:00,511.0 +T1,2015-07-01 17:00:00,443.0 +T1,2015-07-01 18:00:00,422.0 +T1,2015-07-01 19:00:00,395.0 +T1,2015-07-01 20:00:00,382.0 +T1,2015-07-01 21:00:00,370.0 +T1,2015-07-01 22:00:00,383.0 +T1,2015-07-01 23:00:00,397.0 +T1,2015-07-02 00:00:00,420.0 +T1,2015-07-02 01:00:00,455.0 +T1,2015-07-02 02:00:00,493.0 +T1,2015-07-02 03:00:00,554.0 +T1,2015-07-02 04:00:00,610.0 +T1,2015-07-02 05:00:00,666.0 +T1,2015-07-02 06:00:00,715.0 +T1,2015-07-02 07:00:00,755.0 +T1,2015-07-02 08:00:00,778.0 +T1,2015-07-02 09:00:00,794.0 +T1,2015-07-02 10:00:00,806.0 +T1,2015-07-02 11:00:00,808.0 +T1,2015-07-02 12:00:00,776.0 +T1,2015-07-02 13:00:00,723.0 +T1,2015-07-02 14:00:00,709.0 +T1,2015-07-02 15:00:00,660.0 +T1,2015-07-02 16:00:00,585.0 +T1,2015-07-02 17:00:00,527.0 +T1,2015-07-02 18:00:00,462.0 +T1,2015-07-02 19:00:00,437.0 +T1,2015-07-02 20:00:00,413.0 +T1,2015-07-02 21:00:00,407.0 +T1,2015-07-02 22:00:00,404.0 +T1,2015-07-02 23:00:00,420.0 +T1,2015-07-03 00:00:00,441.0 +T1,2015-07-03 01:00:00,471.0 +T1,2015-07-03 02:00:00,526.0 +T1,2015-07-03 03:00:00,571.0 +T1,2015-07-03 04:00:00,612.0 +T1,2015-07-03 05:00:00,635.0 +T1,2015-07-03 06:00:00,613.0 +T1,2015-07-03 07:00:00,608.0 +T1,2015-07-03 08:00:00,614.0 +T1,2015-07-03 09:00:00,637.0 +T1,2015-07-03 10:00:00,669.0 +T1,2015-07-03 11:00:00,683.0 +T1,2015-07-03 12:00:00,687.0 +T1,2015-07-03 13:00:00,660.0 +T1,2015-07-03 14:00:00,661.0 +T1,2015-07-03 15:00:00,632.0 +T1,2015-07-03 16:00:00,573.0 
+T1,2015-07-03 17:00:00,521.0 +T1,2015-07-03 18:00:00,481.0 +T1,2015-07-03 19:00:00,452.0 +T1,2015-07-03 20:00:00,447.0 +T1,2015-07-03 21:00:00,425.0 +T1,2015-07-03 22:00:00,427.0 +T1,2015-07-03 23:00:00,441.0 +T1,2015-07-04 00:00:00,438.0 +T1,2015-07-04 01:00:00,472.0 +T1,2015-07-04 02:00:00,528.0 +T1,2015-07-04 03:00:00,596.0 +T1,2015-07-04 04:00:00,661.0 +T1,2015-07-04 05:00:00,708.0 +T1,2015-07-04 06:00:00,754.0 +T1,2015-07-04 07:00:00,781.0 +T1,2015-07-04 08:00:00,808.0 +T1,2015-07-04 09:00:00,819.0 +T1,2015-07-04 10:00:00,820.0 +T1,2015-07-04 11:00:00,801.0 +T1,2015-07-04 12:00:00,770.0 +T1,2015-07-04 13:00:00,717.0 +T1,2015-07-04 14:00:00,697.0 +T1,2015-07-04 15:00:00,655.0 +T1,2015-07-04 16:00:00,607.0 +T1,2015-07-04 17:00:00,552.0 +T1,2015-07-04 18:00:00,512.0 +T1,2015-07-04 19:00:00,475.0 +T1,2015-07-04 20:00:00,452.0 +T1,2015-07-04 21:00:00,436.0 +T1,2015-07-04 22:00:00,429.0 +T1,2015-07-04 23:00:00,433.0 +T1,2015-07-05 00:00:00,430.0 +T1,2015-07-05 01:00:00,472.0 +T1,2015-07-05 02:00:00,536.0 +T1,2015-07-05 03:00:00,611.0 +T1,2015-07-05 04:00:00,662.0 +T1,2015-07-05 05:00:00,705.0 +T1,2015-07-05 06:00:00,707.0 +T1,2015-07-05 07:00:00,718.0 +T1,2015-07-05 08:00:00,733.0 +T1,2015-07-05 09:00:00,741.0 +T1,2015-07-05 10:00:00,737.0 +T1,2015-07-05 11:00:00,710.0 +T1,2015-07-05 12:00:00,647.0 +T1,2015-07-05 13:00:00,593.0 +T1,2015-07-05 14:00:00,564.0 +T1,2015-07-05 15:00:00,528.0 +T2,2015-07-01 12:00:00,3124.0 +T2,2015-07-01 13:00:00,2990.0 +T2,2015-07-01 14:00:00,2862.0 +T2,2015-07-01 15:00:00,2809.0 +T2,2015-07-01 16:00:00,2544.0 +T2,2015-07-01 17:00:00,2201.0 +T2,2015-07-01 18:00:00,1996.0 +T2,2015-07-01 19:00:00,1861.0 +T2,2015-07-01 20:00:00,1735.0 +T2,2015-07-01 21:00:00,1713.0 +T2,2015-07-01 22:00:00,1724.0 +T2,2015-07-01 23:00:00,1798.0 +T2,2015-07-02 00:00:00,1891.0 +T2,2015-07-02 01:00:00,2037.0 +T2,2015-07-02 02:00:00,2102.0 +T2,2015-07-02 03:00:00,2163.0 +T2,2015-07-02 04:00:00,2269.0 +T2,2015-07-02 05:00:00,2404.0 +T2,2015-07-02 06:00:00,2515.0 +T2,2015-07-02 07:00:00,2621.0 +T2,2015-07-02 08:00:00,2745.0 +T2,2015-07-02 09:00:00,2816.0 +T2,2015-07-02 10:00:00,2938.0 +T2,2015-07-02 11:00:00,3022.0 +T2,2015-07-02 12:00:00,2976.0 +T2,2015-07-02 13:00:00,2892.0 +T2,2015-07-02 14:00:00,2784.0 +T2,2015-07-02 15:00:00,2725.0 +T2,2015-07-02 16:00:00,2530.0 +T2,2015-07-02 17:00:00,2211.0 +T2,2015-07-02 18:00:00,1995.0 +T2,2015-07-02 19:00:00,1833.0 +T2,2015-07-02 20:00:00,1768.0 +T2,2015-07-02 21:00:00,1712.0 +T2,2015-07-02 22:00:00,1707.0 +T2,2015-07-02 23:00:00,1762.0 +T2,2015-07-03 00:00:00,1880.0 +T2,2015-07-03 01:00:00,1995.0 +T2,2015-07-03 02:00:00,2134.0 +T2,2015-07-03 03:00:00,2227.0 +T2,2015-07-03 04:00:00,2376.0 +T2,2015-07-03 05:00:00,2477.0 +T2,2015-07-03 06:00:00,2597.0 +T2,2015-07-03 07:00:00,2691.0 +T2,2015-07-03 08:00:00,2751.0 +T2,2015-07-03 09:00:00,2782.0 +T2,2015-07-03 10:00:00,2810.0 +T2,2015-07-03 11:00:00,2781.0 +T2,2015-07-03 12:00:00,2693.0 +T2,2015-07-03 13:00:00,2567.0 +T2,2015-07-03 14:00:00,2490.0 +T2,2015-07-03 15:00:00,2448.0 +T2,2015-07-03 16:00:00,2277.0 +T2,2015-07-03 17:00:00,1997.0 +T2,2015-07-03 18:00:00,1785.0 +T2,2015-07-03 19:00:00,1689.0 +T2,2015-07-03 20:00:00,1562.0 +T2,2015-07-03 21:00:00,1560.0 +T2,2015-07-03 22:00:00,1505.0 +T2,2015-07-03 23:00:00,1538.0 +T2,2015-07-04 00:00:00,1641.0 +T2,2015-07-04 01:00:00,1735.0 +T2,2015-07-04 02:00:00,1950.0 +T2,2015-07-04 03:00:00,2138.0 +T2,2015-07-04 04:00:00,2303.0 +T2,2015-07-04 05:00:00,2432.0 +T2,2015-07-04 06:00:00,2528.0 +T2,2015-07-04 07:00:00,2656.0 +T2,2015-07-04 08:00:00,2740.0 
+T2,2015-07-04 09:00:00,2803.0 +T2,2015-07-04 10:00:00,2855.0 +T2,2015-07-04 11:00:00,2880.0 +T2,2015-07-04 12:00:00,2778.0 +T2,2015-07-04 13:00:00,2637.0 +T2,2015-07-04 14:00:00,2479.0 +T2,2015-07-04 15:00:00,2381.0 +T2,2015-07-04 16:00:00,2228.0 +T2,2015-07-04 17:00:00,2037.0 +T2,2015-07-04 18:00:00,1758.0 +T2,2015-07-04 19:00:00,1648.0 +T2,2015-07-04 20:00:00,1560.0 +T2,2015-07-04 21:00:00,1508.0 +T2,2015-07-04 22:00:00,1486.0 +T2,2015-07-04 23:00:00,1486.0 +T2,2015-07-05 00:00:00,1515.0 +T2,2015-07-05 01:00:00,1623.0 +T2,2015-07-05 02:00:00,1919.0 +T2,2015-07-05 03:00:00,2172.0 +T2,2015-07-05 04:00:00,2416.0 +T2,2015-07-05 05:00:00,2605.0 +T2,2015-07-05 06:00:00,2755.0 +T2,2015-07-05 07:00:00,2822.0 +T2,2015-07-05 08:00:00,2917.0 +T2,2015-07-05 09:00:00,2997.0 +T2,2015-07-05 10:00:00,3060.0 +T2,2015-07-05 11:00:00,3046.0 +T2,2015-07-05 12:00:00,2942.0 +T2,2015-07-05 13:00:00,2758.0 +T2,2015-07-05 14:00:00,2487.0 +T2,2015-07-05 15:00:00,2349.0 +T3,2015-07-01 12:00:00,1828.0 +T3,2015-07-01 13:00:00,1806.0 +T3,2015-07-01 14:00:00,1897.0 +T3,2015-07-01 15:00:00,1750.0 +T3,2015-07-01 16:00:00,1679.0 +T3,2015-07-01 17:00:00,1620.0 +T3,2015-07-01 18:00:00,1463.0 +T3,2015-07-01 19:00:00,1342.0 +T3,2015-07-01 20:00:00,1192.0 +T3,2015-07-01 21:00:00,1108.0 +T3,2015-07-01 22:00:00,1058.0 +T3,2015-07-01 23:00:00,1024.0 +T3,2015-07-02 00:00:00,1031.0 +T3,2015-07-02 01:00:00,1091.0 +T3,2015-07-02 02:00:00,1208.0 +T3,2015-07-02 03:00:00,1337.0 +T3,2015-07-02 04:00:00,1435.0 +T3,2015-07-02 05:00:00,1515.0 +T3,2015-07-02 06:00:00,1593.0 +T3,2015-07-02 07:00:00,1667.0 +T3,2015-07-02 08:00:00,1753.0 +T3,2015-07-02 09:00:00,1768.0 +T3,2015-07-02 10:00:00,1823.0 +T3,2015-07-02 11:00:00,1813.0 +T3,2015-07-02 12:00:00,1842.0 +T3,2015-07-02 13:00:00,1838.0 +T3,2015-07-02 14:00:00,1800.0 +T3,2015-07-02 15:00:00,1761.0 +T3,2015-07-02 16:00:00,1670.0 +T3,2015-07-02 17:00:00,1609.0 +T3,2015-07-02 18:00:00,1467.0 +T3,2015-07-02 19:00:00,1309.0 +T3,2015-07-02 20:00:00,1189.0 +T3,2015-07-02 21:00:00,1102.0 +T3,2015-07-02 22:00:00,1054.0 +T3,2015-07-02 23:00:00,1017.0 +T3,2015-07-03 00:00:00,1014.0 +T3,2015-07-03 01:00:00,1063.0 +T3,2015-07-03 02:00:00,1187.0 +T3,2015-07-03 03:00:00,1314.0 +T3,2015-07-03 04:00:00,1424.0 +T3,2015-07-03 05:00:00,1497.0 +T3,2015-07-03 06:00:00,1586.0 +T3,2015-07-03 07:00:00,1659.0 +T3,2015-07-03 08:00:00,1722.0 +T3,2015-07-03 09:00:00,1781.0 +T3,2015-07-03 10:00:00,1805.0 +T3,2015-07-03 11:00:00,1831.0 +T3,2015-07-03 12:00:00,1851.0 +T3,2015-07-03 13:00:00,1831.0 +T3,2015-07-03 14:00:00,1809.0 +T3,2015-07-03 15:00:00,1755.0 +T3,2015-07-03 16:00:00,1685.0 +T3,2015-07-03 17:00:00,1618.0 +T3,2015-07-03 18:00:00,1487.0 +T3,2015-07-03 19:00:00,1311.0 +T3,2015-07-03 20:00:00,1180.0 +T3,2015-07-03 21:00:00,1087.0 +T3,2015-07-03 22:00:00,1033.0 +T3,2015-07-03 23:00:00,1002.0 +T3,2015-07-04 00:00:00,991.0 +T3,2015-07-04 01:00:00,1005.0 +T3,2015-07-04 02:00:00,1071.0 +T3,2015-07-04 03:00:00,1191.0 +T3,2015-07-04 04:00:00,1307.0 +T3,2015-07-04 05:00:00,1407.0 +T3,2015-07-04 06:00:00,1495.0 +T3,2015-07-04 07:00:00,1576.0 +T3,2015-07-04 08:00:00,1635.0 +T3,2015-07-04 09:00:00,1688.0 +T3,2015-07-04 10:00:00,1711.0 +T3,2015-07-04 11:00:00,1741.0 +T3,2015-07-04 12:00:00,1768.0 +T3,2015-07-04 13:00:00,1765.0 +T3,2015-07-04 14:00:00,1738.0 +T3,2015-07-04 15:00:00,1684.0 +T3,2015-07-04 16:00:00,1605.0 +T3,2015-07-04 17:00:00,1553.0 +T3,2015-07-04 18:00:00,1433.0 +T3,2015-07-04 19:00:00,1297.0 +T3,2015-07-04 20:00:00,1177.0 +T3,2015-07-04 21:00:00,1082.0 +T3,2015-07-04 22:00:00,1028.0 +T3,2015-07-04 
23:00:00,987.0 +T3,2015-07-05 00:00:00,970.0 +T3,2015-07-05 01:00:00,959.0 +T3,2015-07-05 02:00:00,993.0 +T3,2015-07-05 03:00:00,1083.0 +T3,2015-07-05 04:00:00,1215.0 +T3,2015-07-05 05:00:00,1310.0 +T3,2015-07-05 06:00:00,1415.0 +T3,2015-07-05 07:00:00,1479.0 +T3,2015-07-05 08:00:00,1525.0 +T3,2015-07-05 09:00:00,1599.0 +T3,2015-07-05 10:00:00,1623.0 +T3,2015-07-05 11:00:00,1652.0 +T3,2015-07-05 12:00:00,1671.0 +T3,2015-07-05 13:00:00,1664.0 +T3,2015-07-05 14:00:00,1637.0 +T3,2015-07-05 15:00:00,1574.0 diff --git a/tests/unit/amlb/datasets/file/test_file_dataloader.py b/tests/unit/amlb/datasets/file/test_file_dataloader.py index fa8151789..778cccdf7 100644 --- a/tests/unit/amlb/datasets/file/test_file_dataloader.py +++ b/tests/unit/amlb/datasets/file/test_file_dataloader.py @@ -4,6 +4,7 @@ import numpy as np import pandas as pd import pytest +import pandas.api.types as pat from amlb.resources import from_config from amlb.data import DatasetType @@ -241,14 +242,15 @@ def _assert_data_paths(dataset, definition): assert dataset.train.data_path(f) == path_from_split(s) -def _assert_X_y_types(data_split): +def _assert_X_y_types(data_split, check_encoded=True): assert isinstance(data_split.X, pd.DataFrame) assert isinstance(data_split.y, pd.DataFrame) - assert isinstance(data_split.X_enc, np.ndarray) - assert isinstance(data_split.y_enc, np.ndarray) + if check_encoded: + assert isinstance(data_split.X_enc, np.ndarray) + assert isinstance(data_split.y_enc, np.ndarray) -def _assert_data_consistency(dataset): +def _assert_data_consistency(dataset, check_encoded=True): assert len(dataset.train.X) == len(dataset.train.y) assert len(dataset.train.X.columns) == len(dataset.predictors) assert len(dataset.train.y.columns) == 1 @@ -257,11 +259,107 @@ def _assert_data_consistency(dataset): assert not any([p.is_target for p in dataset.predictors]) - assert dataset.train.X_enc.shape == dataset.train.X.shape assert dataset.test.X.dtypes.equals(dataset.train.X.dtypes) assert dataset.test.y.dtypes.equals(dataset.train.y.dtypes) - assert np.issubdtype(dataset.train.X_enc.dtype, np.floating) - assert np.issubdtype(dataset.train.y_enc.dtype, np.floating) # not ideal given that it's also for classification targets, but well… + if check_encoded: + assert dataset.train.X_enc.shape == dataset.train.X.shape + assert np.issubdtype(dataset.train.X_enc.dtype, np.floating) + assert np.issubdtype(dataset.train.y_enc.dtype, np.floating) # not ideal given that it's also for classification targets, but well… + + +@pytest.mark.use_disk +def test_load_timeseries_task_csv(file_loader): + ds_def = ns( + path=os.path.join(res, "m4_hourly_subset.csv"), + forecast_horizon_in_steps=24, + seasonality=24, + freq="H", + target="target", + type="timeseries", + ) + ds = file_loader.load(ds_def) + assert ds.type is DatasetType.timeseries + print(ds.train.X.dtypes) + _assert_data_consistency(ds, check_encoded=False) + _assert_X_y_types(ds.train, check_encoded=False) + _assert_X_y_types(ds.test, check_encoded=False) + + assert isinstance(ds.train.data, pd.DataFrame) + assert isinstance(ds.test.data, pd.DataFrame) + assert len(ds.repeated_abs_seasonal_error) == len(ds.test.data) + assert len(ds.repeated_item_id) == len(ds.test.data) + + assert pat.is_categorical_dtype(ds._dtypes[ds.id_column]) + assert pat.is_datetime64_dtype(ds._dtypes[ds.timestamp_column]) + assert pat.is_float_dtype(ds._dtypes[ds.target.name]) + + # timeseries uses different task schema - set attributes for test to work + ds_def['train'] = ds.train.path + ds_def['test'] = 
ds.test.path + _assert_data_paths(ds, ds_def) + + +@pytest.mark.parametrize("missing_key", ["freq", "forecast_horizon_in_steps", "seasonality"]) +def test_when_timeseries_task_key_is_missing_then_exception_is_raised(file_loader, missing_key): + task_kwargs = dict( + path=os.path.join(res, "m4_hourly_subset.csv"), + forecast_horizon_in_steps=24, + seasonality=24, + freq="H", + target="target", + type="timeseries", + ) + task_kwargs.pop(missing_key) + ds_def = ns.from_dict(task_kwargs) + with pytest.raises(AssertionError, match=f"Task definition for timeseries must include `{missing_key}`"): + file_loader.load(ds_def) + + +@pytest.mark.parametrize("missing_key", ["id_column", "timestamp_column"]) +def test_given_nondefault_column_names_when_key_is_missing_then_exception_is_raised(file_loader, missing_key): + task_kwargs = dict( + path=os.path.join(res, "m4_hourly_subset_nondefault_cols.csv"), + forecast_horizon_in_steps=24, + seasonality=24, + freq="H", + type="timeseries", + target="CustomTarget", + id_column="CustomId", + timestamp_column="CustomTimestamp", + ) + task_kwargs.pop(missing_key) + ds_def = ns.from_dict(task_kwargs) + with pytest.raises(ValueError, match=missing_key): + file_loader.load(ds_def) + + +def test_given_nondefault_column_names_then_timeseries_dataset_can_be_loaded(file_loader): + task_kwargs = dict( + path=os.path.join(res, "m4_hourly_subset_nondefault_cols.csv"), + forecast_horizon_in_steps=24, + seasonality=24, + freq="H", + type="timeseries", + target="CustomTarget", + id_column="CustomId", + timestamp_column="CustomTimestamp", + ) + ds_def = ns.from_dict(task_kwargs) + ds = file_loader.load(ds_def) + _assert_data_consistency(ds, check_encoded=False) + + +@pytest.mark.parametrize("forecast_horizon, fold", [(50, 2), (100, 0), (10, 9)]) +def test_if_timeseries_dataset_too_short_for_requested_fold_then_exception_is_raised(file_loader, forecast_horizon, fold): + ds_def = ns( + path=os.path.join(res, "m4_hourly_subset.csv"), + forecast_horizon_in_steps=forecast_horizon, + seasonality=24, + freq="H", + type="timeseries", + ) + with pytest.raises(ValueError, match="All time series in the dataset must have length"): + file_loader.load(ds_def, fold=fold)
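For context on the new timeseries metrics listed in resources/config.yaml (mql, wql, sql) and the default quantile_levels, the sketch below illustrates one common formulation of mql (mean quantile loss) and wql (weighted quantile loss) built on the pinball loss. It is only an illustrative sketch, not the benchmark's implementation: some libraries additionally scale the pinball loss by a factor of 2, and sql further normalizes by a seasonal-naive error term (compare repeated_abs_seasonal_error in the tests above), which is omitted here. The toy forecasts in the example are made up.

import numpy as np


def pinball_loss(y_true: np.ndarray, y_pred: np.ndarray, q: float) -> np.ndarray:
    # Per-timestep pinball (quantile) loss at level q:
    # q * (y - f) when the forecast under-predicts, (1 - q) * (f - y) otherwise.
    diff = y_true - y_pred
    return np.maximum(q * diff, (q - 1) * diff)


def mql(y_true: np.ndarray, quantile_forecasts: dict) -> float:
    # Mean quantile loss: pinball loss averaged over all levels and timesteps.
    losses = [pinball_loss(y_true, pred, q) for q, pred in quantile_forecasts.items()]
    return float(np.mean(losses))


def wql(y_true: np.ndarray, quantile_forecasts: dict) -> float:
    # Weighted quantile loss: per-level pinball loss normalized by sum(|y_true|),
    # then averaged over the quantile levels.
    per_level = [
        pinball_loss(y_true, pred, q).sum() / np.abs(y_true).sum()
        for q, pred in quantile_forecasts.items()
    ]
    return float(np.mean(per_level))


if __name__ == "__main__":
    # Toy example over a 48-step horizon using the default quantile_levels
    # from resources/config.yaml; the flat quantile forecasts are hypothetical.
    rng = np.random.default_rng(0)
    y = rng.uniform(100.0, 200.0, size=48)
    levels = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    forecasts = {q: np.full_like(y, y.mean() + (q - 0.5) * 50.0) for q in levels}
    print(f"mql={mql(y, forecasts):.3f}  wql={wql(y, forecasts):.3f}")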