Master to stable v2 (#588)
Update NAML version and fix evaluations of sparse targets
PGijsbers authored Sep 1, 2023
1 parent 4143791 commit 8aa96c2
Showing 77 changed files with 5,576 additions and 1,320 deletions.
66 changes: 66 additions & 0 deletions .github/workflows/build_deploy_docs.yml
@@ -0,0 +1,66 @@
# Simple workflow for deploying static content to GitHub Pages generated by Github
# except for added job steps "Copy Static Files" through "Build MkDocs Pages".
name: Deploy static content to Pages

on:
  # Runs on pushes targeting the default branch
  push:
    branches: ["master"]

  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
permissions:
  contents: read
  pages: write
  id-token: write

# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
concurrency:
  group: "pages"
  cancel-in-progress: false

jobs:
  # Single deploy job since we're just deploying
  deploy:
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Setup Pages
        uses: actions/configure-pages@v3
      - name: Copy Static Files
        run: |
          cp -R docs/website site/
      - name: Replace GITHUB token
        # Use different sed delimiter to avoid clashing with forward slash in URL
        run: |
          find docs/ -type f -exec sed -i "s@GITHUB@${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}\/blob\/master@g" {} \;
      - name: Replace WEBSITE token
        # Use different sed delimiter to avoid clashing with forward slash in URL
        run: |
          WEBSITE_URL="https://${GITHUB_REPOSITORY_OWNER}.github.io/automlbenchmark"
          find docs/ -type f -exec sed -i "s@WEBSITE@${WEBSITE_URL}@g" {} \;
          sed -i "s@WEBSITE@${WEBSITE_URL}@g" mkdocs.yml
      - uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Install MkDocs
        run: |
          python -m pip install mkdocs-material
      - name: Build MkDocs Pages
        run: |
          mkdocs build --site-dir site/docs
      - name: Upload artifact
        uses: actions/upload-pages-artifact@v2
        with:
          # Upload entire repository
          path: './site/'
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v2
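
The two "Replace ... token" steps rewrite the literal placeholders GITHUB and WEBSITE inside the documentation sources before MkDocs builds them, pointing doc links at the repository and the published site. A rough Python sketch of that substitution is shown below; the helper name, the file handling, and the concrete URLs are illustrative only, since the workflow itself relies on find and sed.

```python
# Illustrative sketch of the placeholder substitution performed by the sed steps above.
# Assumes every file under `root` is text; the helper and the URLs are not part of the repo.
from pathlib import Path

def replace_tokens(root: str, tokens: dict[str, str]) -> None:
    """Replace each placeholder token with its value in every file under `root`."""
    for path in Path(root).rglob("*"):
        if not path.is_file():
            continue
        text = path.read_text(encoding="utf-8", errors="ignore")
        for token, value in tokens.items():
            text = text.replace(token, value)
        path.write_text(text, encoding="utf-8")

replace_tokens("docs/", {
    "GITHUB": "https://github.com/openml/automlbenchmark/blob/master",  # mirrors ${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/blob/master
    "WEBSITE": "https://openml.github.io/automlbenchmark",              # mirrors the WEBSITE_URL assembled in the workflow
})
```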
6 changes: 4 additions & 2 deletions amlb/benchmark.py
@@ -379,7 +379,7 @@ def _is_task_enabled(task_def):

class TaskConfig:

    def __init__(self, name, openml_task_id, test_server, fold, metrics, seed,
    def __init__(self, name, openml_task_id, test_server, fold, metrics, quantile_levels, seed,
                 max_runtime_seconds, cores, max_mem_size_mb, min_vol_size_mb,
                 input_dir, output_dir, tag, command, git_info, measure_inference_time: bool = False):
        self.framework = None
@@ -404,6 +404,7 @@ def __init__(self, name, openml_task_id, test_server, fold, metrics, seed,
        self.git_info = git_info
        self.measure_inference_time = measure_inference_time
        self.ext = ns() # used if frameworks require extra config points
        self.quantile_levels = list(sorted(quantile_levels))

    def __setattr__(self, name, value):
        if name == 'metrics':
@@ -477,9 +478,10 @@ def __init__(self, benchmark: Benchmark, task_def, fold):
        self.fold = fold
        self.task_config = TaskConfig(
            name=task_def.name,
            openml_task_id=task_def.openml_task_id,
            openml_task_id=task_def["openml_task_id"],
            fold=fold,
            metrics=task_def.metric,
            quantile_levels=task_def.quantile_levels,
            seed=rget().seed(fold),
            max_runtime_seconds=task_def.max_runtime_seconds,
            cores=task_def.cores,
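The benchmark.py change threads a new quantile_levels setting from the task definition into TaskConfig (needed for probabilistic forecasting metrics) and normalizes it to a sorted list. A minimal sketch of that normalization with made-up values; the predictor call at the end is purely hypothetical.

```python
# Sketch only: how a framework integration might consume the normalized quantile levels.
quantile_levels = (0.9, 0.1, 0.5)           # as it could appear in a task definition
normalized = list(sorted(quantile_levels))  # same expression as in TaskConfig.__init__
assert normalized == [0.1, 0.5, 0.9]

# predictor.predict(test_data, quantile_levels=normalized)  # hypothetical API, for illustration
```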
134 changes: 89 additions & 45 deletions amlb/datasets/file.py
@@ -1,5 +1,6 @@
from abc import abstractmethod
import logging
import math
import os
import re
import tempfile
@@ -33,17 +34,17 @@ def __init__(self, cache_dir=None):
    def load(self, dataset, fold=0):
        dataset = dataset if isinstance(dataset, ns) else ns(path=dataset)
        log.debug("Loading dataset %s", dataset)
        target = dataset['target']
        type_ = dataset['type']
        features = dataset['features']

        if type_ and DatasetType[type_] == DatasetType.timeseries:
            return TimeSeriesDataset(path=dataset['path'], fold=fold, target=target, features=features, cache_dir=self._cache_dir, config=dataset)

        paths = self._extract_train_test_paths(dataset.path if 'path' in dataset else dataset, fold=fold, name=dataset['name'] if 'name' in dataset else None)
        assert fold < len(paths['train']), f"No training dataset available for fold {fold} among dataset files {paths['train']}"
        # seed = rget().seed(fold)
        # if len(paths['test']) == 0:
        #     log.warning("No test file in the dataset, the train set will automatically be split 90%/10% using the given seed.")
        # else:
        assert fold < len(paths['test']), f"No test dataset available for fold {fold} among dataset files {paths['test']}"

        target = dataset['target']
        type_ = dataset['type']
        features = dataset['features']
        ext = os.path.splitext(paths['train'][fold])[1].lower()
        train_path = paths['train'][fold]
        test_path = paths['test'][fold] if len(paths['test']) > 0 else None
@@ -139,40 +140,6 @@ def __repr__(self):
        return repr_def(self)


    def extend_dataset_with_timeseries_config(self, dataset, dataset_config):
        dataset = deepcopy(dataset)
        dataset_config = deepcopy(dataset_config)
        if dataset_config['id_column'] is None:
            log.warning("Warning: For timeseries task setting undefined `id_column` to `item_id`.")
            dataset_config['id_column'] = "item_id"
        if dataset_config['forecast_range_in_steps'] is None:
            log.warning("Warning: For timeseries task setting undefined `forecast_range_in_steps` to `1`.")
            dataset_config['forecast_range_in_steps'] = "1"

        dataset.timestamp_column=dataset_config['timestamp_column']
        dataset.id_column=dataset_config['id_column']
        dataset.forecast_range_in_steps=int(dataset_config['forecast_range_in_steps'])

        train_seqs_lengths = dataset.train.X.groupby(dataset.id_column).count()
        test_seqs_lengths = dataset.test.X.groupby(dataset.id_column).count()
        forecast_range_in_steps_mean_diff_train_test = int((test_seqs_lengths - train_seqs_lengths).mean())
        forecast_range_in_steps_max_min_train_test = int(min(int(test_seqs_lengths.min()), int(train_seqs_lengths.min()))) - 1
        if not dataset.forecast_range_in_steps == forecast_range_in_steps_mean_diff_train_test:
            msg = f"Warning: Forecast range {dataset.forecast_range_in_steps}, does not equal mean difference between test and train sequence lengths {forecast_range_in_steps_mean_diff_train_test}."
            log.warning(msg)
        if not (test_seqs_lengths - train_seqs_lengths).var().item() == 0.:
            msg = f"Error: Not all sequences of train and test set have same sequence length difference."
            raise ValueError(msg)
        if dataset.forecast_range_in_steps > forecast_range_in_steps_mean_diff_train_test:
            msg = f"Error: Forecast range {dataset.forecast_range_in_steps} longer than difference between test and train sequence lengths {forecast_range_in_steps_mean_diff_train_test}."
            raise ValueError(msg)
        if dataset.forecast_range_in_steps > forecast_range_in_steps_max_min_train_test:
            msg = f"Error: Forecast range {dataset.forecast_range_in_steps} longer than minimum sequence length + 1, {forecast_range_in_steps_max_min_train_test}."
            raise ValueError(msg)
        return dataset



class FileDataset(Dataset):

    def __init__(self, train: Datasplit, test: Datasplit,
@@ -350,10 +317,88 @@ def __init__(self, train_path, test_path,
        # todo: handle auto-split (if test_path is None): requires loading the training set, split, save
        super().__init__(None, None,
                         target=target, features=features, type=type)
        self._train = CsvDatasplit(self, train_path, timestamp_column=timestamp_column)
        self._test = CsvDatasplit(self, test_path, timestamp_column=timestamp_column)
        self._train = CsvDatasplit(self, train_path)
        self._test = CsvDatasplit(self, test_path)
        self._dtypes = None


class TimeSeriesDataset(FileDataset):
    def __init__(self, path, fold, target, features, cache_dir, config):
        super().__init__(None, None, target=target, features=features, type="timeseries")
        if config['forecast_horizon_in_steps'] is None:
            raise AssertionError("Task definition for timeseries must include `forecast_horizon_in_steps`")
        if config['freq'] is None:
            raise AssertionError("Task definition for timeseries must include `freq`")
        if config['seasonality'] is None:
            raise AssertionError("Task definition for timeseries must include `seasonality`")

        full_data = read_csv(path)
        if config['id_column'] is None:
            log.warning("Warning: For timeseries task, setting undefined `id_column` to `item_id`")
            config['id_column'] = 'item_id'
        if config['id_column'] not in full_data.columns:
            raise ValueError(f'The id_column with name {config["id_column"]} is missing from the dataset')
        if config['timestamp_column'] is None:
            log.warning("Warning: For timeseries task, setting undefined `timestamp_column` to `timestamp`")
            config['timestamp_column'] = 'timestamp'
        if config['timestamp_column'] not in full_data.columns:
            raise ValueError(f'The timestamp_column with name {config["timestamp_column"]} is missing from the dataset')

        self.forecast_horizon_in_steps = int(config['forecast_horizon_in_steps'])
        self.freq = pd.tseries.frequencies.to_offset(config['freq']).freqstr
        self.seasonality = int(config['seasonality'])
        self.id_column = config['id_column']
        self.timestamp_column = config['timestamp_column']

        full_data[self.timestamp_column] = pd.to_datetime(full_data[self.timestamp_column])
        if config['name'] is not None:
            file_name = config['name']
        else:
            file_name = os.path.splitext(os.path.basename(path))[0]
        save_dir = os.path.join(cache_dir, file_name, str(fold))
        train_path, test_path = self.save_train_and_test_splits(full_data, fold=fold, save_dir=save_dir)

        self._train = CsvDatasplit(self, train_path, timestamp_column=self.timestamp_column)
        self._test = CsvDatasplit(self, test_path, timestamp_column=self.timestamp_column)
        self._dtypes = None

        # Store repeated item_id & in-sample seasonal error for each time step in the forecast horizon - needed later for metrics like MASE.
        # We need to store this information here because Result object has no access to past time series values.
        self.repeated_item_id = self.test.data[self.id_column].astype("category").cat.codes.to_numpy()
        self.repeated_abs_seasonal_error = self.compute_seasonal_error()

    def save_train_and_test_splits(self, full_data, fold, save_dir):
        full_data = full_data.sort_values(by=[self.id_column, self.timestamp_column])
        shortest_ts_length = full_data.groupby(self.id_column).size().min()
        min_expected_ts_length = (fold + 1) * self.forecast_horizon_in_steps + 1
        if shortest_ts_length < min_expected_ts_length:
            raise ValueError(
                f'All time series in the dataset must have length > `(fold + 1) * forecast_horizon_in_steps` '
                f'(at least {min_expected_ts_length + 1}), but shortest time series has length {shortest_ts_length}'
            )
        # Remove the last `steps_to_remove` steps from each time series to obtain the correct fold
        if fold > 0:
            steps_to_remove = fold * self.forecast_horizon_in_steps
            full_data = full_data.groupby(self.id_column, as_index=False).nth(slice(None, -steps_to_remove))
        train_data = full_data.groupby(self.id_column, as_index=False).nth(slice(None, -self.forecast_horizon_in_steps))
        test_data = full_data.groupby(self.id_column, as_index=False).nth(slice(-self.forecast_horizon_in_steps, None))

        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        train_path = os.path.join(save_dir, "train.csv")
        test_path = os.path.join(save_dir, "test.csv")

        train_data.to_csv(train_path, index=False)
        test_data.to_csv(test_path, index=False)
        return train_path, test_path

    def compute_seasonal_error(self):
        train_data_with_index = self.train.data.set_index(self.id_column)
        seasonal_diffs = train_data_with_index[self.target.name].groupby(level=self.id_column).diff(self.seasonality).abs()
        abs_seasonal_error = seasonal_diffs.groupby(level=self.id_column).mean().fillna(1.0).values
        # Repeat seasonal error for each time step in the forecast horizon
        return np.repeat(abs_seasonal_error, self.forecast_horizon_in_steps)


class CsvDatasplit(FileDatasplit):

@@ -396,8 +441,7 @@ def load_metadata(self):
                                      else 'string' if pat.is_string_dtype(dt)
                                      else 'datetime' if pat.is_datetime64_dtype(dt)
                                      else 'object')
        features = [Feature(i, col, to_feature_type(dtypes[i]))
                    for i, col in enumerate(self._ds.columns)]
        features = [Feature(i, col, to_feature_type(dtypes[i])) for i, col in enumerate(self._ds.columns)]

        for f in features:
            col = self._ds.iloc[:, f.index]
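The new TimeSeriesDataset builds each fold by trimming the end of every series and holding out the final forecast_horizon_in_steps rows as the test window, and it precomputes the in-sample seasonal error that later serves as the MASE denominator (the Result object cannot see past values, so it is stored on the dataset). The toy walk-through below mirrors that logic with plain pandas/numpy on a single made-up series; it assumes the usual rolling-origin scheme in which fold f drops the last f * horizon steps.

```python
# Toy illustration of the fold split and seasonal-error computation (values are made up).
import numpy as np
import pandas as pd

id_col, ts_col, target = "item_id", "timestamp", "target"
horizon, seasonality, fold = 2, 1, 1

full_data = pd.DataFrame({
    id_col: ["A"] * 8,
    ts_col: pd.date_range("2023-01-01", periods=8, freq="D"),
    target: [1.0, 2.0, 4.0, 7.0, 11.0, 16.0, 22.0, 29.0],
}).sort_values([id_col, ts_col])

# Rolling-origin split: fold f drops the last f * horizon steps of every series,
# then the last `horizon` steps of what remains become the test window.
if fold > 0:
    full_data = full_data.groupby(id_col, as_index=False).nth(slice(None, -fold * horizon))
train = full_data.groupby(id_col, as_index=False).nth(slice(None, -horizon))
test = full_data.groupby(id_col, as_index=False).nth(slice(-horizon, None))
print(train[target].tolist())  # [1.0, 2.0, 4.0, 7.0]
print(test[target].tolist())   # [11.0, 16.0]

# Seasonal error (MASE denominator): mean absolute seasonal difference over the training
# part of each series, repeated once per forecast step, as in compute_seasonal_error().
seasonal_diffs = train.set_index(id_col)[target].groupby(level=id_col).diff(seasonality).abs()
abs_seasonal_error = seasonal_diffs.groupby(level=id_col).mean().fillna(1.0).values
print(np.repeat(abs_seasonal_error, horizon))  # [2. 2.]  (mean of |2-1|, |4-2|, |7-4|)
```

Note that GroupBy.nth with a slice, which both the diff and this sketch rely on, needs a reasonably recent pandas (slice support arrived around pandas 1.4).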
8 changes: 7 additions & 1 deletion amlb/datasets/openml.py
@@ -25,6 +25,12 @@
from ..utils import as_list, lazy_property, path_from_split, profile, split_path, unsparsify


# https://github.com/openml/automlbenchmark/pull/574#issuecomment-1646179921
try:
    set_openml_cache = oml.config.set_cache_directory
except AttributeError:
    set_openml_cache = oml.config.set_root_cache_directory

log = logging.getLogger(__name__)

# hack (only adding a ? to the regexp pattern) to ensure that '?' values remain quoted when we save dataplits in arff format.
@@ -39,7 +45,7 @@ class OpenmlLoader:
    def __init__(self, api_key, cache_dir=None):
        oml.config.apikey = api_key
        if cache_dir:
            oml.config.set_cache_directory(cache_dir)
            set_openml_cache(cache_dir)

        if oml.config.retry_policy != "robot":
            log.debug("Setting openml retry_policy from '%s' to 'robot'." % oml.config.retry_policy)
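Newer releases of openml-python renamed config.set_cache_directory to config.set_root_cache_directory; the try/except above simply binds whichever attribute the installed version exposes, and amlb/defaults.py below applies the same idea to cache_directory. A generic sketch of that compatibility pattern (not the amlb code itself):

```python
# Sketch: resolve a renamed library function in a version-agnostic way.
import openml

def _resolve_cache_setter():
    for name in ("set_cache_directory", "set_root_cache_directory"):
        setter = getattr(openml.config, name, None)
        if setter is not None:
            return setter
    raise AttributeError("openml.config exposes no known cache-directory setter")

set_openml_cache = _resolve_cache_setter()
set_openml_cache("/tmp/openml-cache")  # the path is illustrative
```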
14 changes: 8 additions & 6 deletions amlb/datautils.py
@@ -37,19 +37,21 @@ def read_csv(path, nrows=None, header=True, index=False, as_data_frame=True, dty
    :param header: if the columns header should be read.
    :param as_data_frame: if the result should be returned as a data frame (default) or a numpy array.
    :param dtype: data type for columns.
    :param timestamp_column: column name for timestamp, to ensure dates are correctly parsed by pandas.
    :param timestamp_column: name of the column that should be parsed as date.
    :return: a DataFrame
    """
    if dtype is not None and timestamp_column is not None and timestamp_column in dtype:
        dtype = dtype.copy() # to avoid outer context manipulation
        del dtype[timestamp_column]

    if timestamp_column is None:
        parse_dates = None
    else:
        if dtype is not None:
            dtype.pop(timestamp_column, None)
        parse_dates = [timestamp_column]
    df = pd.read_csv(path,
                     nrows=nrows,
                     header=0 if header else None,
                     index_col=0 if index else None,
                     dtype=dtype,
                     parse_dates=[timestamp_column] if timestamp_column is not None else None)
                     parse_dates=parse_dates)
    return df if as_data_frame else df.values


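The rewritten read_csv helper removes the timestamp column from dtype and asks pandas to parse it via parse_dates, so the column comes back as datetime64 rather than whatever dtype the caller declared. A standalone sketch of that interaction with a made-up CSV (plain pandas, not importing amlb):

```python
import io
import pandas as pd

csv = io.StringIO("item_id,timestamp,target\nA,2023-01-01,1.5\nA,2023-01-02,2.5\n")
dtype = {"item_id": "category", "target": float, "timestamp": str}
timestamp_column = "timestamp"

# Mirror the helper: strip the timestamp column from `dtype` so it does not conflict
# with `parse_dates`, then let pandas handle the datetime conversion.
if dtype is not None:
    dtype.pop(timestamp_column, None)
df = pd.read_csv(csv, dtype=dtype, parse_dates=[timestamp_column])
print(df.dtypes["timestamp"])  # datetime64[ns]
```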
8 changes: 7 additions & 1 deletion amlb/defaults.py
@@ -1,9 +1,15 @@
import pathlib

from openml.config import cache_directory
import openml

from amlb.utils import Namespace as ns

# https://github.com/openml/automlbenchmark/pull/574#issuecomment-1646179921
try:
    cache_directory = openml.config.cache_directory
except AttributeError:
    cache_directory = openml.config.get_cache_directory()

default_dirs = ns(
    input_dir=cache_directory,
    output_dir=str(pathlib.Path(__file__).parent.parent / "results"),
3 changes: 1 addition & 2 deletions amlb/resources.py
@@ -210,7 +210,7 @@ def _validate_task(self, task, lenient=False):
        if not lenient and len(missing) > 0:
            raise ValueError("{missing} mandatory properties as missing in task definition {taskdef}.".format(missing=missing, taskdef=task))

        for conf in ['max_runtime_seconds', 'cores', 'folds', 'max_mem_size_mb', 'min_vol_size_mb']:
        for conf in ['max_runtime_seconds', 'cores', 'folds', 'max_mem_size_mb', 'min_vol_size_mb', 'quantile_levels']:
            if task[conf] is None:
                task[conf] = self.config.benchmarks.defaults[conf]
                log.debug("Config `{config}` not set for task {name}, using default `{value}`.".format(config=conf, name=task.name, value=task[conf]))
@@ -310,4 +310,3 @@ def output_dirs(root, session=None, subdirs=None, create=False):
    TransformRule(from_key='aws.query_frequency_seconds', to_key='aws.query_interval_seconds'),
    TransformRule(from_key='aws.ec2.monitoring.cpu.query_frequency_seconds', to_key='aws.ec2.monitoring.cpu.query_interval_seconds'),
]
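
With quantile_levels added to the list above, _validate_task now also back-fills it from the benchmark defaults when a task definition leaves it unset. A minimal sketch of that defaulting pattern using plain dicts (amlb actually works on its Namespace objects; the values are illustrative):

```python
defaults = {"max_runtime_seconds": 3600, "cores": 8, "folds": 10,
            "max_mem_size_mb": -1, "min_vol_size_mb": -1, "quantile_levels": [0.1, 0.5, 0.9]}
task = {"name": "my_ts_task", "cores": 4, "quantile_levels": None}

for conf in ["max_runtime_seconds", "cores", "folds", "max_mem_size_mb", "min_vol_size_mb", "quantile_levels"]:
    if task.get(conf) is None:
        task[conf] = defaults[conf]

print(task["cores"])            # 4  (explicit value is kept)
print(task["quantile_levels"])  # [0.1, 0.5, 0.9]  (filled in from the defaults)
```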
