Master to stable v2 (#588)
Update NAML version and fix evaluations of sparse targets
PGijsbers authored Sep 1, 2023
1 parent 4143791 commit 8aa96c2
Showing 77 changed files with 5,576 additions and 1,320 deletions.
66 changes: 66 additions & 0 deletions .github/workflows/build_deploy_docs.yml
@@ -0,0 +1,66 @@
# Simple workflow for deploying static content to GitHub Pages generated by Github
# except for added job steps "Copy Static Files" through "Build MkDocs Pages".
name: Deploy static content to Pages

on:
  # Runs on pushes targeting the default branch
  push:
    branches: ["master"]

  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
permissions:
  contents: read
  pages: write
  id-token: write

# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
concurrency:
  group: "pages"
  cancel-in-progress: false

jobs:
  # Single deploy job since we're just deploying
  deploy:
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      - name: Setup Pages
        uses: actions/configure-pages@v3
      - name: Copy Static Files
        run: |
          cp -R docs/website site/
      - name: Replace GITHUB token
        # Use different sed delimiter to avoid clashing with forward slash in URL
        run: |
          find docs/ -type f -exec sed -i "s@GITHUB@${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}\/blob\/master@g" {} \;
      - name: Replace WEBSITE token
        # Use different sed delimiter to avoid clashing with forward slash in URL
        run: |
          WEBSITE_URL="https://${GITHUB_REPOSITORY_OWNER}.github.io/automlbenchmark"
          find docs/ -type f -exec sed -i "s@WEBSITE@${WEBSITE_URL}@g" {} \;
          sed -i "s@WEBSITE@${WEBSITE_URL}@g" mkdocs.yml
      - uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Install MkDocs
        run: |
          python -m pip install mkdocs-material
      - name: Build MkDocs Pages
        run: |
          mkdocs build --site-dir site/docs
      - name: Upload artifact
        uses: actions/upload-pages-artifact@v2
        with:
          # Upload entire repository
          path: './site/'
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v2
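
The two "Replace ... token" steps rewrite the literal placeholders GITHUB and WEBSITE inside the documentation sources before MkDocs builds them, pointing doc links at the repository and the published site. A rough Python sketch of that substitution is shown below; the helper name, the file handling, and the concrete URLs are illustrative only, since the workflow itself relies on find and sed.

```python
# Illustrative sketch of the placeholder substitution performed by the sed steps above.
# Assumes every file under `root` is text; the helper and the URLs are not part of the repo.
from pathlib import Path

def replace_tokens(root: str, tokens: dict[str, str]) -> None:
    """Replace each placeholder token with its value in every file under `root`."""
    for path in Path(root).rglob("*"):
        if not path.is_file():
            continue
        text = path.read_text(encoding="utf-8", errors="ignore")
        for token, value in tokens.items():
            text = text.replace(token, value)
        path.write_text(text, encoding="utf-8")

replace_tokens("docs/", {
    "GITHUB": "https://github.com/openml/automlbenchmark/blob/master",  # mirrors ${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/blob/master
    "WEBSITE": "https://openml.github.io/automlbenchmark",              # mirrors the WEBSITE_URL assembled in the workflow
})
```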
6 changes: 4 additions & 2 deletions amlb/benchmark.py
@@ -379,7 +379,7 @@ def _is_task_enabled(task_def):

class TaskConfig:

    def __init__(self, name, openml_task_id, test_server, fold, metrics, seed,
    def __init__(self, name, openml_task_id, test_server, fold, metrics, quantile_levels, seed,
                 max_runtime_seconds, cores, max_mem_size_mb, min_vol_size_mb,
                 input_dir, output_dir, tag, command, git_info, measure_inference_time: bool = False):
        self.framework = None
@@ -404,6 +404,7 @@ def __init__(self, name, openml_task_id, test_server, fold, metrics, seed,
        self.git_info = git_info
        self.measure_inference_time = measure_inference_time
        self.ext = ns() # used if frameworks require extra config points
        self.quantile_levels = list(sorted(quantile_levels))

    def __setattr__(self, name, value):
        if name == 'metrics':
@@ -477,9 +478,10 @@ def __init__(self, benchmark: Benchmark, task_def, fold):
        self.fold = fold
        self.task_config = TaskConfig(
            name=task_def.name,
            openml_task_id=task_def.openml_task_id,
            openml_task_id=task_def["openml_task_id"],
            fold=fold,
            metrics=task_def.metric,
            quantile_levels=task_def.quantile_levels,
            seed=rget().seed(fold),
            max_runtime_seconds=task_def.max_runtime_seconds,
            cores=task_def.cores,
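The benchmark.py change threads a new quantile_levels setting from the task definition into TaskConfig (needed for probabilistic forecasting metrics) and normalizes it to a sorted list. A minimal sketch of that normalization with made-up values; the predictor call at the end is purely hypothetical.

```python
# Sketch only: how a framework integration might consume the normalized quantile levels.
quantile_levels = (0.9, 0.1, 0.5)           # as it could appear in a task definition
normalized = list(sorted(quantile_levels))  # same expression as in TaskConfig.__init__
assert normalized == [0.1, 0.5, 0.9]

# predictor.predict(test_data, quantile_levels=normalized)  # hypothetical API, for illustration
```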
134 changes: 89 additions & 45 deletions amlb/datasets/file.py
@@ -1,5 +1,6 @@
from abc import abstractmethod
import logging
import math
import os
import re
import tempfile
@@ -33,17 +34,17 @@ def __init__(self, cache_dir=None):
    def load(self, dataset, fold=0):
        dataset = dataset if isinstance(dataset, ns) else ns(path=dataset)
        log.debug("Loading dataset %s", dataset)
        target = dataset['target']
        type_ = dataset['type']
        features = dataset['features']

        if type_ and DatasetType[type_] == DatasetType.timeseries:
            return TimeSeriesDataset(path=dataset['path'], fold=fold, target=target, features=features, cache_dir=self._cache_dir, config=dataset)

        paths = self._extract_train_test_paths(dataset.path if 'path' in dataset else dataset, fold=fold, name=dataset['name'] if 'name' in dataset else None)
        assert fold < len(paths['train']), f"No training dataset available for fold {fold} among dataset files {paths['train']}"
        # seed = rget().seed(fold)
        # if len(paths['test']) == 0:
        #     log.warning("No test file in the dataset, the train set will automatically be split 90%/10% using the given seed.")
        # else:
        assert fold < len(paths['test']), f"No test dataset available for fold {fold} among dataset files {paths['test']}"

        target = dataset['target']
        type_ = dataset['type']
        features = dataset['features']
        ext = os.path.splitext(paths['train'][fold])[1].lower()
        train_path = paths['train'][fold]
        test_path = paths['test'][fold] if len(paths['test']) > 0 else None
@@ -139,40 +140,6 @@ def __repr__(self):
        return repr_def(self)


    def extend_dataset_with_timeseries_config(self, dataset, dataset_config):
        dataset = deepcopy(dataset)
        dataset_config = deepcopy(dataset_config)
        if dataset_config['id_column'] is None:
            log.warning("Warning: For timeseries task setting undefined `id_column` to `item_id`.")
            dataset_config['id_column'] = "item_id"
        if dataset_config['forecast_range_in_steps'] is None:
            log.warning("Warning: For timeseries task setting undefined `forecast_range_in_steps` to `1`.")
            dataset_config['forecast_range_in_steps'] = "1"

        dataset.timestamp_column=dataset_config['timestamp_column']
        dataset.id_column=dataset_config['id_column']
        dataset.forecast_range_in_steps=int(dataset_config['forecast_range_in_steps'])

        train_seqs_lengths = dataset.train.X.groupby(dataset.id_column).count()
        test_seqs_lengths = dataset.test.X.groupby(dataset.id_column).count()
        forecast_range_in_steps_mean_diff_train_test = int((test_seqs_lengths - train_seqs_lengths).mean())
        forecast_range_in_steps_max_min_train_test = int(min(int(test_seqs_lengths.min()), int(train_seqs_lengths.min()))) - 1
        if not dataset.forecast_range_in_steps == forecast_range_in_steps_mean_diff_train_test:
            msg = f"Warning: Forecast range {dataset.forecast_range_in_steps}, does not equal mean difference between test and train sequence lengths {forecast_range_in_steps_mean_diff_train_test}."
            log.warning(msg)
        if not (test_seqs_lengths - train_seqs_lengths).var().item() == 0.:
            msg = f"Error: Not all sequences of train and test set have same sequence length difference."
            raise ValueError(msg)
        if dataset.forecast_range_in_steps > forecast_range_in_steps_mean_diff_train_test:
            msg = f"Error: Forecast range {dataset.forecast_range_in_steps} longer than difference between test and train sequence lengths {forecast_range_in_steps_mean_diff_train_test}."
            raise ValueError(msg)
        if dataset.forecast_range_in_steps > forecast_range_in_steps_max_min_train_test:
            msg = f"Error: Forecast range {dataset.forecast_range_in_steps} longer than minimum sequence length + 1, {forecast_range_in_steps_max_min_train_test}."
            raise ValueError(msg)
        return dataset



class FileDataset(Dataset):

    def __init__(self, train: Datasplit, test: Datasplit,
@@ -350,10 +317,88 @@ def __init__(self, train_path, test_path,
        # todo: handle auto-split (if test_path is None): requires loading the training set, split, save
        super().__init__(None, None,
                         target=target, features=features, type=type)
        self._train = CsvDatasplit(self, train_path, timestamp_column=timestamp_column)
        self._test = CsvDatasplit(self, test_path, timestamp_column=timestamp_column)
        self._train = CsvDatasplit(self, train_path)
        self._test = CsvDatasplit(self, test_path)
        self._dtypes = None


class TimeSeriesDataset(FileDataset):
    def __init__(self, path, fold, target, features, cache_dir, config):
        super().__init__(None, None, target=target, features=features, type="timeseries")
        if config['forecast_horizon_in_steps'] is None:
            raise AssertionError("Task definition for timeseries must include `forecast_horizon_in_steps`")
        if config['freq'] is None:
            raise AssertionError("Task definition for timeseries must include `freq`")
        if config['seasonality'] is None:
            raise AssertionError("Task definition for timeseries must include `seasonality`")

        full_data = read_csv(path)
        if config['id_column'] is None:
            log.warning("Warning: For timeseries task, setting undefined `id_column` to `item_id`")
            config['id_column'] = 'item_id'
        if config['id_column'] not in full_data.columns:
            raise ValueError(f'The id_column with name {config["id_column"]} is missing from the dataset')
        if config['timestamp_column'] is None:
            log.warning("Warning: For timeseries task, setting undefined `timestamp_column` to `timestamp`")
            config['timestamp_column'] = 'timestamp'
        if config['timestamp_column'] not in full_data.columns:
            raise ValueError(f'The timestamp_column with name {config["timestamp_column"]} is missing from the dataset')

        self.forecast_horizon_in_steps = int(config['forecast_horizon_in_steps'])
        self.freq = pd.tseries.frequencies.to_offset(config['freq']).freqstr
        self.seasonality = int(config['seasonality'])
        self.id_column = config['id_column']
        self.timestamp_column = config['timestamp_column']

        full_data[self.timestamp_column] = pd.to_datetime(full_data[self.timestamp_column])
        if config['name'] is not None:
            file_name = config['name']
        else:
            file_name = os.path.splitext(os.path.basename(path))[0]
        save_dir = os.path.join(cache_dir, file_name, str(fold))
        train_path, test_path = self.save_train_and_test_splits(full_data, fold=fold, save_dir=save_dir)

        self._train = CsvDatasplit(self, train_path, timestamp_column=self.timestamp_column)
        self._test = CsvDatasplit(self, test_path, timestamp_column=self.timestamp_column)
        self._dtypes = None

        # Store repeated item_id & in-sample seasonal error for each time step in the forecast horizon - needed later for metrics like MASE.
        # We need to store this information here because Result object has no access to past time series values.
        self.repeated_item_id = self.test.data[self.id_column].astype("category").cat.codes.to_numpy()
        self.repeated_abs_seasonal_error = self.compute_seasonal_error()

    def save_train_and_test_splits(self, full_data, fold, save_dir):
        full_data = full_data.sort_values(by=[self.id_column, self.timestamp_column])
        shortest_ts_length = full_data.groupby(self.id_column).size().min()
        min_expected_ts_length = (fold + 1) * self.forecast_horizon_in_steps + 1
        if shortest_ts_length < min_expected_ts_length:
            raise ValueError(
                f'All time series in the dataset must have length > `(fold + 1) * forecast_horizon_in_steps` '
                f'(at least {min_expected_ts_length + 1}), but shortest time series has length {shortest_ts_length}'
            )
        # Remove the last `steps_to_remove` steps from each time series to obtain the correct fold
        if fold > 0:
            steps_to_remove = fold * self.forecast_horizon_in_steps
            full_data = full_data.groupby(self.id_column, as_index=False).nth(slice(None, -steps_to_remove))
        train_data = full_data.groupby(self.id_column, as_index=False).nth(slice(None, -self.forecast_horizon_in_steps))
        test_data = full_data.groupby(self.id_column, as_index=False).nth(slice(-self.forecast_horizon_in_steps, None))

        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        train_path = os.path.join(save_dir, "train.csv")
        test_path = os.path.join(save_dir, "test.csv")

        train_data.to_csv(train_path, index=False)
        test_data.to_csv(test_path, index=False)
        return train_path, test_path

    def compute_seasonal_error(self):
        train_data_with_index = self.train.data.set_index(self.id_column)
        seasonal_diffs = train_data_with_index[self.target.name].groupby(level=self.id_column).diff(self.seasonality).abs()
        abs_seasonal_error = seasonal_diffs.groupby(level=self.id_column).mean().fillna(1.0).values
        # Repeat seasonal error for each time step in the forecast horizon
        return np.repeat(abs_seasonal_error, self.forecast_horizon_in_steps)


class CsvDatasplit(FileDatasplit):

@@ -396,8 +441,7 @@ def load_metadata(self):
                                      else 'string' if pat.is_string_dtype(dt)
                                      else 'datetime' if pat.is_datetime64_dtype(dt)
                                      else 'object')
        features = [Feature(i, col, to_feature_type(dtypes[i]))
                    for i, col in enumerate(self._ds.columns)]
        features = [Feature(i, col, to_feature_type(dtypes[i])) for i, col in enumerate(self._ds.columns)]

        for f in features:
            col = self._ds.iloc[:, f.index]
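The new TimeSeriesDataset builds each fold by trimming the end of every series and holding out the final forecast_horizon_in_steps rows as the test window, and it precomputes the in-sample seasonal error that later serves as the MASE denominator (the Result object cannot see past values, so it is stored on the dataset). The toy walk-through below mirrors that logic with plain pandas/numpy on a single made-up series; it assumes the usual rolling-origin scheme in which fold f drops the last f * horizon steps.

```python
# Toy illustration of the fold split and seasonal-error computation (values are made up).
import numpy as np
import pandas as pd

id_col, ts_col, target = "item_id", "timestamp", "target"
horizon, seasonality, fold = 2, 1, 1

full_data = pd.DataFrame({
    id_col: ["A"] * 8,
    ts_col: pd.date_range("2023-01-01", periods=8, freq="D"),
    target: [1.0, 2.0, 4.0, 7.0, 11.0, 16.0, 22.0, 29.0],
}).sort_values([id_col, ts_col])

# Rolling-origin split: fold f drops the last f * horizon steps of every series,
# then the last `horizon` steps of what remains become the test window.
if fold > 0:
    full_data = full_data.groupby(id_col, as_index=False).nth(slice(None, -fold * horizon))
train = full_data.groupby(id_col, as_index=False).nth(slice(None, -horizon))
test = full_data.groupby(id_col, as_index=False).nth(slice(-horizon, None))
print(train[target].tolist())  # [1.0, 2.0, 4.0, 7.0]
print(test[target].tolist())   # [11.0, 16.0]

# Seasonal error (MASE denominator): mean absolute seasonal difference over the training
# part of each series, repeated once per forecast step, as in compute_seasonal_error().
seasonal_diffs = train.set_index(id_col)[target].groupby(level=id_col).diff(seasonality).abs()
abs_seasonal_error = seasonal_diffs.groupby(level=id_col).mean().fillna(1.0).values
print(np.repeat(abs_seasonal_error, horizon))  # [2. 2.]  (mean of |2-1|, |4-2|, |7-4|)
```

Note that GroupBy.nth with a slice, which both the diff and this sketch rely on, needs a reasonably recent pandas (slice support arrived around pandas 1.4).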
8 changes: 7 additions & 1 deletion amlb/datasets/openml.py
@@ -25,6 +25,12 @@
from ..utils import as_list, lazy_property, path_from_split, profile, split_path, unsparsify


# https://github.com/openml/automlbenchmark/pull/574#issuecomment-1646179921
try:
    set_openml_cache = oml.config.set_cache_directory
except AttributeError:
    set_openml_cache = oml.config.set_root_cache_directory

log = logging.getLogger(__name__)

# hack (only adding a ? to the regexp pattern) to ensure that '?' values remain quoted when we save dataplits in arff format.
@@ -39,7 +45,7 @@ class OpenmlLoader:
    def __init__(self, api_key, cache_dir=None):
        oml.config.apikey = api_key
        if cache_dir:
            oml.config.set_cache_directory(cache_dir)
            set_openml_cache(cache_dir)

        if oml.config.retry_policy != "robot":
            log.debug("Setting openml retry_policy from '%s' to 'robot'." % oml.config.retry_policy)
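Newer releases of openml-python renamed config.set_cache_directory to config.set_root_cache_directory; the try/except above simply binds whichever attribute the installed version exposes, and amlb/defaults.py below applies the same idea to cache_directory. A generic sketch of that compatibility pattern (not the amlb code itself):

```python
# Sketch: resolve a renamed library function in a version-agnostic way.
import openml

def _resolve_cache_setter():
    for name in ("set_cache_directory", "set_root_cache_directory"):
        setter = getattr(openml.config, name, None)
        if setter is not None:
            return setter
    raise AttributeError("openml.config exposes no known cache-directory setter")

set_openml_cache = _resolve_cache_setter()
set_openml_cache("/tmp/openml-cache")  # the path is illustrative
```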
14 changes: 8 additions & 6 deletions amlb/datautils.py
@@ -37,19 +37,21 @@ def read_csv(path, nrows=None, header=True, index=False, as_data_frame=True, dty
    :param header: if the columns header should be read.
    :param as_data_frame: if the result should be returned as a data frame (default) or a numpy array.
    :param dtype: data type for columns.
    :param timestamp_column: column name for timestamp, to ensure dates are correctly parsed by pandas.
    :param timestamp_column: name of the column that should be parsed as date.
    :return: a DataFrame
    """
    if dtype is not None and timestamp_column is not None and timestamp_column in dtype:
        dtype = dtype.copy() # to avoid outer context manipulation
        del dtype[timestamp_column]

    if timestamp_column is None:
        parse_dates = None
    else:
        if dtype is not None:
            dtype.pop(timestamp_column, None)
        parse_dates = [timestamp_column]
    df = pd.read_csv(path,
                     nrows=nrows,
                     header=0 if header else None,
                     index_col=0 if index else None,
                     dtype=dtype,
                     parse_dates=[timestamp_column] if timestamp_column is not None else None)
                     parse_dates=parse_dates)
    return df if as_data_frame else df.values


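The rewritten read_csv helper removes the timestamp column from dtype and asks pandas to parse it via parse_dates, so the column comes back as datetime64 rather than whatever dtype the caller declared. A standalone sketch of that interaction with a made-up CSV (plain pandas, not importing amlb):

```python
import io
import pandas as pd

csv = io.StringIO("item_id,timestamp,target\nA,2023-01-01,1.5\nA,2023-01-02,2.5\n")
dtype = {"item_id": "category", "target": float, "timestamp": str}
timestamp_column = "timestamp"

# Mirror the helper: strip the timestamp column from `dtype` so it does not conflict
# with `parse_dates`, then let pandas handle the datetime conversion.
if dtype is not None:
    dtype.pop(timestamp_column, None)
df = pd.read_csv(csv, dtype=dtype, parse_dates=[timestamp_column])
print(df.dtypes["timestamp"])  # datetime64[ns]
```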
8 changes: 7 additions & 1 deletion amlb/defaults.py
@@ -1,9 +1,15 @@
import pathlib

from openml.config import cache_directory
import openml

from amlb.utils import Namespace as ns

# https://github.com/openml/automlbenchmark/pull/574#issuecomment-1646179921
try:
    cache_directory = openml.config.cache_directory
except AttributeError:
    cache_directory = openml.config.get_cache_directory()

default_dirs = ns(
    input_dir=cache_directory,
    output_dir=str(pathlib.Path(__file__).parent.parent / "results"),
3 changes: 1 addition & 2 deletions amlb/resources.py
@@ -210,7 +210,7 @@ def _validate_task(self, task, lenient=False):
        if not lenient and len(missing) > 0:
            raise ValueError("{missing} mandatory properties as missing in task definition {taskdef}.".format(missing=missing, taskdef=task))

        for conf in ['max_runtime_seconds', 'cores', 'folds', 'max_mem_size_mb', 'min_vol_size_mb']:
        for conf in ['max_runtime_seconds', 'cores', 'folds', 'max_mem_size_mb', 'min_vol_size_mb', 'quantile_levels']:
            if task[conf] is None:
                task[conf] = self.config.benchmarks.defaults[conf]
                log.debug("Config `{config}` not set for task {name}, using default `{value}`.".format(config=conf, name=task.name, value=task[conf]))
@@ -310,4 +310,3 @@ def output_dirs(root, session=None, subdirs=None, create=False):
    TransformRule(from_key='aws.query_frequency_seconds', to_key='aws.query_interval_seconds'),
    TransformRule(from_key='aws.ec2.monitoring.cpu.query_frequency_seconds', to_key='aws.ec2.monitoring.cpu.query_interval_seconds'),
]
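
With quantile_levels added to the list above, _validate_task now also back-fills it from the benchmark defaults when a task definition leaves it unset. A minimal sketch of that defaulting pattern using plain dicts (amlb actually works on its Namespace objects; the values are illustrative):

```python
defaults = {"max_runtime_seconds": 3600, "cores": 8, "folds": 10,
            "max_mem_size_mb": -1, "min_vol_size_mb": -1, "quantile_levels": [0.1, 0.5, 0.9]}
task = {"name": "my_ts_task", "cores": 4, "quantile_levels": None}

for conf in ["max_runtime_seconds", "cores", "folds", "max_mem_size_mb", "min_vol_size_mb", "quantile_levels"]:
    if task.get(conf) is None:
        task[conf] = defaults[conf]

print(task["cores"])            # 4  (explicit value is kept)
print(task["quantile_levels"])  # [0.1, 0.5, 0.9]  (filled in from the defaults)
```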
