diff --git a/.github/workflows/dist.yml b/.github/workflows/dist.yml index 82f9aa432..3bfd0693c 100644 --- a/.github/workflows/dist.yml +++ b/.github/workflows/dist.yml @@ -1,33 +1,62 @@ name: dist-check -on: [push, pull_request] +on: + # Manually triggerable in github + workflow_dispatch: + + # When a push occurs on either of these branches + push: + branches: + - master + - development + + # When a push occurs on a PR that targets these branches + pull_request: + branches: + - master + - development + + schedule: + # Every day at 7AM UTC + - cron: '0 07 * * *' jobs: + dist: runs-on: ubuntu-latest + steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 with: submodules: recursive - name: Setup Python uses: actions/setup-python@v2 with: python-version: 3.8 + - name: Build dist run: | python setup.py sdist + - name: Twine check run: | pip install twine last_dist=$(ls -t dist/autoPyTorch-*.tar.gz | head -n 1) twine_output=`twine check "$last_dist"` if [[ "$twine_output" != "Checking $last_dist: PASSED" ]]; then echo $twine_output && exit 1;fi + - name: Install dist run: | last_dist=$(ls -t dist/autoPyTorch-*.tar.gz | head -n 1) pip install $last_dist + - name: PEP 561 Compliance run: | pip install mypy - cd .. # required to use the installed version of autosklearn - if ! python -c "import autoPyTorch"; then exit 1; fi \ No newline at end of file + + cd .. # required to use the installed version of autoPyTorch + + # Note this doesn't perform mypy checks, those are handled in pre-commit.yaml + # This only checks if autoPyTorch exports type information + if ! mypy -c "import autoPyTorch"; then exit 1; fi diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml new file mode 100644 index 000000000..b8c5d916e --- /dev/null +++ b/.github/workflows/docker-publish.yml @@ -0,0 +1,80 @@ +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +name: Publish Docker image + +on: + push: + # Push to `master` or `development` + branches: + - master + - development + - add_docker-publish + workflow_dispatch: + +jobs: + push_to_registries: + name: Push Docker image to multiple registries + runs-on: ubuntu-latest + permissions: + packages: write + contents: read + steps: + - name: Check out the repo + uses: actions/checkout@v2 + + - name: Extract branch name + shell: bash + run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})" + id: extract_branch + + - name: Log in to Docker Hub + uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + + - name: Log in to the Container registry + uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38 + with: + images: | + automlorg/autopytorch + ghcr.io/${{ github.repository }} + + - name: Build and push Docker images + uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc + with: + context: . 
+ push: true + tags: ${{ steps.extract_branch.outputs.branch }} + + - name: Docker Login + run: docker login ghcr.io -u $GITHUB_ACTOR -p $GITHUB_TOKEN + env: + GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} + + - name: Pull Docker image + run: docker pull ghcr.io/$GITHUB_REPOSITORY/autoPyTorch:$BRANCH + env: + BRANCH: ${{ steps.extract_branch.outputs.branch }} + + - name: Run image + run: docker run -i -d --name unittester -v $GITHUB_WORKSPACE:/workspace -w /workspace ghcr.io/$GITHUB_REPOSITORY/autoPyTorch:$BRANCH + env: + BRANCH: ${{ steps.extract_branch.outputs.branch }} + + - name: Auto-PyTorch loaded + run: docker exec -i unittester python3 -c 'import autoPyTorch; print(f"Auto-PyTorch imported from {autoPyTorch.__file__}")' + + - name: Run unit testing + run: docker exec -i unittester python3 -m pytest -v test \ No newline at end of file diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index f6a87c91b..cd665ecf9 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -1,29 +1,51 @@ name: Docs -on: [pull_request, push] + +on: + # Allow to manually trigger through github API + # Wont trigger the push to github pages where the documentation is located + workflow_dispatch: + + # Triggers with push to these branches + push: + branches: + - master + - development + + # Triggers with push to a pr aimed at these branches + pull_request: + branches: + - master + - development jobs: build-and-deploy: runs-on: ubuntu-latest + steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v2 with: submodules: recursive - name: Setup Python uses: actions/setup-python@v2 with: python-version: 3.8 + - name: Install dependencies run: | pip install -e .[docs,examples] + - name: Make docs run: | cd docs make html + - name: Pull latest gh-pages if: (contains(github.ref, 'develop') || contains(github.ref, 'master')) && github.event_name == 'push' run: | cd .. 
git clone https://github.com/automl/Auto-PyTorch.git --branch gh-pages --single-branch gh-pages + - name: Copy new doc into gh-pages if: (contains(github.ref, 'develop') || contains(github.ref, 'master')) && github.event_name == 'push' run: | @@ -31,6 +53,7 @@ jobs: cd ../gh-pages rm -rf $branch_name cp -r ../Auto-PyTorch/docs/build/html $branch_name + - name: Push to gh-pages if: (contains(github.ref, 'develop') || contains(github.ref, 'master')) && github.event_name == 'push' run: | diff --git a/.github/workflows/long_regression_test.yml b/.github/workflows/long_regression_test.yml index e7ccb5ea0..3007b22de 100644 --- a/.github/workflows/long_regression_test.yml +++ b/.github/workflows/long_regression_test.yml @@ -7,15 +7,15 @@ on: #- cron: '0 07 * * 2' - cron: '0 07 * * *' - jobs: - ubuntu: + ubuntu: runs-on: ubuntu-latest + strategy: + fail-fast: false matrix: python-version: [3.8] - fail-fast: false steps: - uses: actions/checkout@v2 @@ -26,10 +26,12 @@ jobs: uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} + - name: Install test dependencies run: | python -m pip install --upgrade pip pip install -e .[test] + - name: Run tests run: | python -m pytest --durations=200 cicd/test_preselected_configs.py -vs diff --git a/.github/workflows/pre-commit.yaml b/.github/workflows/pre-commit.yaml index 5e192375a..d9fd438c5 100644 --- a/.github/workflows/pre-commit.yaml +++ b/.github/workflows/pre-commit.yaml @@ -1,22 +1,44 @@ name: pre-commit -on: [push, pull_request] +on: + # Allow to manually trigger through github API + workflow_dispatch: + + # Triggers with push to these branches + push: + branches: + - master + - development + + # Triggers with push to a pr aimed at these branches + pull_request: + branches: + - master + - development jobs: + run-all-files: runs-on: ubuntu-latest + steps: - - uses: actions/checkout@v2 - with: - submodules: recursive + - name: Checkout + uses: actions/checkout@v2 + - name: Setup Python 3.7 uses: actions/setup-python@v2 with: python-version: 3.7 + + - name: Init Submodules + run: | + git submodule update --init --recursive + - name: Install pre-commit run: | pip install pre-commit pre-commit install + - name: Run pre-commit run: | pre-commit run --all-files diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index fed77c484..5a5cce20e 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -1,42 +1,118 @@ name: Tests -on: [push, pull_request] +on: + # Allow to manually trigger through github API + workflow_dispatch: + + # Triggers with push to these branches + push: + branches: + - master + - development + + # Triggers with push to pr targeting these branches + pull_request: + branches: + - master + - development + + schedule: + # Every day at 7AM UTC + - cron: '0 07 * * *' + +env: + + # Arguments used for pytest + pytest-args: >- + --forked + --durations=20 + --timeout=600 + --timeout-method=signal + -v + + # Arguments used for code-cov which is later used to annotate PR's on github + code-cov-args: >- + --cov=autoPyTorch + --cov-report=xml + --cov-config=.coveragerc jobs: - ubuntu: + tests: + + name: ${{ matrix.os }}-${{ matrix.python-version }}-${{ matrix.kind }} + runs-on: ${{ matrix.os }} - runs-on: ubuntu-latest strategy: + fail-fast: false matrix: - python-version: [3.7, 3.8, 3.9] + os: [windows-latest, macos-latest, ubuntu-latest] + python-version: ['3.7', '3.8', '3.9', '3.10'] + kind: ['source', 'dist'] + + exclude: + # Exclude all configurations *-*-dist, include one later + 
- kind: 'dist' + + # Exclude Windows as bash commands won't work in the Windows runner + - os: windows-latest + + # Exclude macOS as there are permission errors when using conda as we do + - os: macos-latest + + # Exclude Python 3.10 as torch does not support Python 3.10 yet + - python-version: '3.10' + include: - - python-version: 3.8 + # Add the tag code-cov to ubuntu-3.7-source + - os: ubuntu-latest + python-version: 3.7 + kind: 'source' code-cov: true - fail-fast: false - max-parallel: 2 + + # Include one config with dist, ubuntu-3.7-dist + - os: ubuntu-latest + python-version: 3.7 + kind: 'dist' steps: - - uses: actions/checkout@v2 - with: - submodules: recursive + - name: Checkout + uses: actions/checkout@v2 + + - name: Setup Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - - name: Install test dependencies + + - name: Source install + if: matrix.kind == 'source' run: | + git submodule update --init --recursive python -m pip install --upgrade pip pip install -e .[test] + + - name: Dist install + if: matrix.kind == 'dist' + run: | + git submodule update --init --recursive + + python setup.py sdist + last_dist=$(ls -t dist/autoPyTorch-*.tar.gz | head -n 1) + pip install $last_dist[test] + - name: Store repository status id: status-before run: | echo "::set-output name=BEFORE::$(git status --porcelain -b)" + - name: Run tests run: | if [ ${{ matrix.code-cov }} ]; then - codecov='--cov=autoPyTorch --cov-report=xml --cov-config=.coveragerc'; + python -m pytest ${{ env.pytest-args }} ${{ env.code-cov-args }} test + else + python -m pytest ${{ env.pytest-args }} test fi - python -m pytest --forked --durations=20 --timeout=600 --timeout-method=signal -v $codecov test + - name: Check for files left behind by test if: ${{ always() }} run: | @@ -48,6 +124,7 @@ jobs: echo "Not all generated files have been deleted!" 
exit 1 fi + - name: Upload coverage if: matrix.code-cov && always() uses: codecov/codecov-action@v1 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index d76014c44..c9b2e7615 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -8,7 +8,7 @@ on: workflow_dispatch: jobs: - build-n-publish: + publish: runs-on: "ubuntu-latest" steps: @@ -50,4 +50,4 @@ jobs: uses: pypa/gh-action-pypi-publish@master with: user: __token__ - password: ${{ secrets.PYPI_TOKEN }} + password: ${{ secrets.pypi_token }} diff --git a/.github/workflows/scheduled_test.yml b/.github/workflows/scheduled_test.yml deleted file mode 100644 index ce9615b0c..000000000 --- a/.github/workflows/scheduled_test.yml +++ /dev/null @@ -1,35 +0,0 @@ -name: Tests - -on: - schedule: - # Every Monday at 7AM UTC - - cron: '0 07 * * 1' - - -jobs: - ubuntu: - - runs-on: ubuntu-latest - strategy: - matrix: - python-version: [3.8] - fail-fast: false - max-parallel: 2 - - steps: - - uses: actions/checkout@v2 - with: - ref: master - submodules: recursive - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install test dependencies - run: | - git submodule update --init --recursive - python -m pip install --upgrade pip - pip install -e .[test] - - name: Run tests - run: | - python -m pytest --forked --durations=20 --timeout=600 --timeout-method=signal -v test \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in index 2f6b9ae8b..4096cc1b6 100755 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ include requirements.txt +include autoPyTorch/py.typed include autoPyTorch/utils/logging.yaml include autoPyTorch/configs/default_pipeline_options.json include autoPyTorch/configs/greedy_portfolio.json diff --git a/README.md b/README.md index 92f63c387..7a8ca03c1 100755 --- a/README.md +++ b/README.md @@ -6,8 +6,10 @@ While early AutoML frameworks focused on optimizing traditional ML pipelines and Auto-PyTorch is mainly developed to support tabular data (classification, regression). The newest features in Auto-PyTorch for tabular data are described in the paper ["Auto-PyTorch Tabular: Multi-Fidelity MetaLearning for Efficient and Robust AutoDL"](https://arxiv.org/abs/2006.13799) (see below for bibtex ref). + Also, find the documentation [here](https://automl.github.io/Auto-PyTorch/master). + ***From v0.1.0, AutoPyTorch has been updated to further improve usability, robustness and efficiency by using SMAC as the underlying optimization package as well as changing the code structure. Therefore, moving from v0.0.2 to v0.1.0 will break compatibility. In case you would like to use the old API, you can find it at [`master_old`](https://github.com/automl/Auto-PyTorch/tree/master-old).*** @@ -23,7 +25,6 @@ The current version only supports the *greedy portfolio* as described in the pap This portfolio is used to warm-start the optimization of SMAC. In other words, we evaluate the portfolio on a provided data as initial configurations. Then API starts the following procedures: - 1. **Validate input data**: Process each data type, e.g. encoding categorical data, so that Auto-Pytorch can handled. 2. **Create dataset**: Create a dataset that can be handled in this API with a choice of cross validation or holdout splits. 3. 
**Evaluate baselines** *1: Train each algorithm in the predefined pool with a fixed hyperparameter configuration and dummy model from `sklearn.dummy` that represents the worst possible performance. diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index a997c505b..a048e2054 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -11,7 +11,7 @@ import typing import unittest.mock import warnings -from abc import abstractmethod +from abc import ABC, abstractmethod from typing import Any, Callable, Dict, List, Optional, Tuple, Union from ConfigSpace.configuration_space import Configuration, ConfigurationSpace @@ -21,15 +21,17 @@ import joblib +import matplotlib.pyplot as plt + import numpy as np import pandas as pd -from smac.runhistory.runhistory import DataOrigin, RunHistory +from smac.runhistory.runhistory import DataOrigin, RunHistory, RunInfo, RunValue from smac.stats.stats import Stats from smac.tae import StatusType -from autoPyTorch.api.results_manager import ResultsManager, SearchResults +from autoPyTorch import metrics from autoPyTorch.automl_common.common.utils.backend import Backend, create from autoPyTorch.constants import ( REGRESSION_TASKS, @@ -38,11 +40,17 @@ ) from autoPyTorch.data.base_validator import BaseInputValidator from autoPyTorch.datasets.base_dataset import BaseDataset, BaseDatasetPropertiesType -from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes +from autoPyTorch.datasets.resampling_strategy import ( + CrossValTypes, + HoldoutValTypes, + NoResamplingStrategyTypes, + ResamplingStrategies, +) from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilderManager from autoPyTorch.ensemble.singlebest_ensemble import SingleBest from autoPyTorch.evaluation.abstract_evaluator import fit_and_suppress_warnings from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash +from autoPyTorch.evaluation.utils import DisableFileOutputParameters from autoPyTorch.optimizer.smbo import AutoMLSMBO from autoPyTorch.pipeline.base_pipeline import BasePipeline from autoPyTorch.pipeline.components.setup.traditional_ml.traditional_learner import get_available_traditional_learners @@ -58,6 +66,8 @@ ) from autoPyTorch.utils.parallel import preload_modules from autoPyTorch.utils.pipeline import get_configuration_space, get_dataset_requirements +from autoPyTorch.utils.results_manager import MetricResults, ResultsManager, SearchResults +from autoPyTorch.utils.results_visualizer import ColorLabelSettings, PlotSettingParams, ResultsVisualizer from autoPyTorch.utils.single_thread_client import SingleThreadedClient from autoPyTorch.utils.stopwatch import StopWatch @@ -100,7 +110,7 @@ def send_warnings_to_log( return prediction -class BaseTask: +class BaseTask(ABC): """ Base class for the tasks that serve as API to the pipelines. @@ -130,13 +140,23 @@ class BaseTask: delete_tmp_folder_after_terminate (bool): Determines whether to delete the temporary directory, when finished - include_components (Optional[Dict]): - If None, all possible components are used. - Otherwise specifies set of components to use. - exclude_components (Optional[Dict]): - If None, all possible components are used. - Otherwise specifies set of components not to use. - Incompatible with include components + include_components (Optional[Dict[str, Any]]): + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. 
Only these components will be present in the + search space. + exclude_components (Optional[Dict[str, Any]]): + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. + resampling_strategy resampling_strategy (RESAMPLING_STRATEGIES), + (default=HoldoutValTypes.holdout_validation): + strategy to split the training data. + resampling_strategy_args (Optional[Dict[str, Any]]): arguments + required for the chosen resampling strategy. If None, uses + the default values provided in DEFAULT_RESAMPLING_PARAMETERS + in ```datasets/resampling_strategy.py```. search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): Search space updates that can be used to modify the search space of particular components or choice modules of the pipeline @@ -155,14 +175,18 @@ def __init__( output_directory: Optional[str] = None, delete_tmp_folder_after_terminate: bool = True, delete_output_folder_after_terminate: bool = True, - include_components: Optional[Dict] = None, - exclude_components: Optional[Dict] = None, + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, backend: Optional[Backend] = None, - resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, + resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, task_type: Optional[str] = None ) -> None: + + if isinstance(resampling_strategy, NoResamplingStrategyTypes) and ensemble_size != 0: + raise ValueError("`NoResamplingStrategy` cannot be used for ensemble construction") + self.seed = seed self.n_jobs = n_jobs self.n_threads = n_threads @@ -219,7 +243,7 @@ def __init__( if self.n_jobs == 1: self._multiprocessing_context = 'fork' - self.InputValidator: Optional[BaseInputValidator] = None + self.input_validator: Optional[BaseInputValidator] = None self.search_space_updates = search_space_updates if search_space_updates is not None: @@ -229,19 +253,132 @@ def __init__( " HyperparameterSearchSpaceUpdates got {}".format(type(self.search_space_updates))) @abstractmethod - def build_pipeline(self, dataset_properties: Dict[str, Any]) -> BasePipeline: + def build_pipeline( + self, + dataset_properties: Dict[str, BaseDatasetPropertiesType], + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + ) -> BasePipeline: """ Build pipeline according to current task and for the passed dataset properties Args: - dataset_properties (Dict[str,Any]) + dataset_properties (Dict[str, Any]): + Characteristics of the dataset to guide the pipeline + choices of components + include_components (Optional[Dict[str, Any]]): + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. + exclude_components (Optional[Dict[str, Any]]): + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. 
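To make the include/exclude dictionaries described above concrete, here is a minimal, hedged sketch. The node and component names used (`network_backbone`, `encoder`, `MLPBackbone`, `ResNetBackbone`, `OneHotEncoder`) are illustrative assumptions and should be checked against the components registered in the installed autoPyTorch version.

```python
# Hypothetical sketch: restrict the search space with include/exclude dictionaries.
# Keys are pipeline node names, values are iterables of component names; the exact
# names below are assumptions and may differ between autoPyTorch versions.
include_components = {
    'network_backbone': ['MLPBackbone', 'ResNetBackbone'],  # search only these backbones
}
exclude_components = {
    'encoder': ['OneHotEncoder'],  # search every encoder except this one
}
```

As the earlier docstring already noted, including and excluding components for the same node is contradictory, so a node should appear in at most one of the two dictionaries.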
+ search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): + Search space updates that can be used to modify the search + space of particular components or choice modules of the pipeline Returns: + BasePipeline + + """ + raise NotImplementedError("Function called on BaseTask, this can only be called by " + "specific task which is a child of the BaseTask") + @abstractmethod + def _get_dataset_input_validator( + self, + X_train: Union[List, pd.DataFrame, np.ndarray], + y_train: Union[List, pd.DataFrame, np.ndarray], + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + resampling_strategy: Optional[ResamplingStrategies] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + dataset_name: Optional[str] = None, + ) -> Tuple[BaseDataset, BaseInputValidator]: + """ + Returns an object of a child class of `BaseDataset` and + an object of a child class of `BaseInputValidator` according + to the current task. + + Args: + X_train (Union[List, pd.DataFrame, np.ndarray]): + Training feature set. + y_train (Union[List, pd.DataFrame, np.ndarray]): + Training target set. + X_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing feature set + y_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing target set + resampling_strategy (Optional[RESAMPLING_STRATEGIES]): + Strategy to split the training data. if None, uses + HoldoutValTypes.holdout_validation. + resampling_strategy_args (Optional[Dict[str, Any]]): + arguments required for the chosen resampling strategy. If None, uses + the default values provided in DEFAULT_RESAMPLING_PARAMETERS + in ```datasets/resampling_strategy.py```. + dataset_name (Optional[str]): + name of the dataset, used as experiment name. + + Returns: + BaseDataset: + the dataset object + BaseInputValidator: + fitted input validator """ raise NotImplementedError + def get_dataset( + self, + X_train: Union[List, pd.DataFrame, np.ndarray], + y_train: Union[List, pd.DataFrame, np.ndarray], + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + resampling_strategy: Optional[ResamplingStrategies] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + dataset_name: Optional[str] = None, + ) -> BaseDataset: + """ + Returns an object of a child class of `BaseDataset` according to the current task. + + Args: + X_train (Union[List, pd.DataFrame, np.ndarray]): + Training feature set. + y_train (Union[List, pd.DataFrame, np.ndarray]): + Training target set. + X_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing feature set + y_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing target set + resampling_strategy (Optional[RESAMPLING_STRATEGIES]): + Strategy to split the training data. if None, uses + HoldoutValTypes.holdout_validation. + resampling_strategy_args (Optional[Dict[str, Any]]): + arguments required for the chosen resampling strategy. If None, uses + the default values provided in DEFAULT_RESAMPLING_PARAMETERS + in ```datasets/resampling_strategy.py```. + dataset_name (Optional[str]): + name of the dataset, used as experiment name. 
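As a usage sketch for the `get_dataset` helper documented above (the toy data, the default resampling behaviour, and the printed attribute are illustrative assumptions rather than a prescribed workflow):

```python
# Hedged sketch: build a BaseDataset child object from raw arrays via get_dataset.
import numpy as np

from autoPyTorch.api.tabular_classification import TabularClassificationTask

X_train = np.random.rand(100, 4)
y_train = np.random.randint(0, 2, size=100)

api = TabularClassificationTask()
# With resampling_strategy=None, the task's own strategy (holdout by default) is used.
dataset = api.get_dataset(X_train=X_train, y_train=y_train)
print(dataset.dataset_name)
```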
+ + Returns: + BaseDataset: + the dataset object + """ + dataset, _ = self._get_dataset_input_validator( + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + dataset_name=dataset_name) + + return dataset + @property def run_history(self) -> RunHistory: return self._results_manager.run_history @@ -553,13 +690,14 @@ def _do_dummy_prediction(self) -> None: backend=self._backend, seed=self.seed, metric=self._metric, + multi_objectives=["cost"], logger_port=self._logger_port, cost_for_crash=get_cost_of_crash(self._metric), abort_on_first_run_crash=False, initial_num_run=num_run, stats=stats, memory_limit=memory_limit, - disable_file_output=True if len(self._disable_file_output) > 0 else False, + disable_file_output=self._disable_file_output, all_supported_metrics=self._all_supported_metrics ) @@ -636,6 +774,7 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs: pynisher_context=self._multiprocessing_context, backend=self._backend, seed=self.seed, + multi_objectives=["cost"], metric=self._metric, logger_port=self._logger_port, cost_for_crash=get_cost_of_crash(self._metric), @@ -643,7 +782,7 @@ def _do_traditional_prediction(self, time_left: int, func_eval_time_limit_secs: initial_num_run=self._backend.get_next_num_run(), stats=stats, memory_limit=memory_limit, - disable_file_output=True if len(self._disable_file_output) > 0 else False, + disable_file_output=self._disable_file_output, all_supported_metrics=self._all_supported_metrics ) dask_futures.append([ @@ -739,7 +878,7 @@ def _search( tae_func: Optional[Callable] = None, all_supported_metrics: bool = True, precision: int = 32, - disable_file_output: List = [], + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, load_models: bool = True, portfolio_selection: Optional[str] = None, dask_client: Optional[dask.distributed.Client] = None @@ -840,10 +979,10 @@ def _search( precision (int: default=32): Numeric precision used when loading ensemble data. Can be either '16', '32' or '64'. - disable_file_output (Union[bool, List]): - If True, disable model and prediction output. - Can also be used as a list to pass more fine-grained - information on what to save. Allowed elements in the list are: + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): + Used as a list to pass more fine-grained + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + `y_optimization`: do not save the predictions for the optimization set, @@ -856,6 +995,9 @@ def _search( pipelines fit on each fold. + `y_test`: do not save the predictions for the test set. + + `all`: + do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. load_models (bool: default=True): Whether to load the models after fitting AutoPyTorch. 
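A short, hedged sketch of how the `disable_file_output` list described above might be passed; the member names are taken from the allowed elements listed in the docstring, while combining them in a plain Python list this way is an assumption:

```python
# Hedged sketch: skip writing optimization-set and test-set predictions to disk.
from autoPyTorch.evaluation.utils import DisableFileOutputParameters

disable_file_output = [
    DisableFileOutputParameters.y_optimization,  # no predictions kept for ensemble building
    DisableFileOutputParameters.y_test,          # no test-set predictions
]
# later: api.search(..., disable_file_output=disable_file_output)
```

Note that, as the warning added in this diff states, disabling `y_optimization` while requesting an ensemble means no ensemble can be built.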
portfolio_selection (Optional[str]): @@ -897,7 +1039,14 @@ def _search( self._backend.setup_logger(port=self._logger_port) self._all_supported_metrics = all_supported_metrics - self._disable_file_output = disable_file_output + self._disable_file_output = disable_file_output if disable_file_output is not None else [] + if ( + DisableFileOutputParameters.y_optimization in self._disable_file_output + and self.ensemble_size > 1 + ): + self._logger.warning(f"No ensemble will be created when {DisableFileOutputParameters.y_optimization}" + f" is in disable_file_output") + self._memory_limit = memory_limit self._time_for_task = total_walltime_limit # Save start time to backend @@ -1219,10 +1368,30 @@ def refit( return self - def fit(self, - dataset: BaseDataset, - pipeline_config: Optional[Configuration] = None, - split_id: int = 0) -> BasePipeline: + def fit_pipeline( + self, + configuration: Configuration, + *, + dataset: Optional[BaseDataset] = None, + X_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_train: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + dataset_name: Optional[str] = None, + resampling_strategy: Optional[Union[HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes]] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + run_time_limit_secs: int = 60, + memory_limit: Optional[int] = None, + eval_metric: Optional[str] = None, + all_supported_metrics: bool = False, + budget_type: Optional[str] = None, + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + budget: Optional[float] = None, + pipeline_options: Optional[Dict] = None, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, + ) -> Tuple[Optional[BasePipeline], RunInfo, RunValue, BaseDataset]: """ Fit a pipeline on the given task for the budget. A pipeline configuration can be specified if None, @@ -1233,24 +1402,130 @@ def fit(self, methods. Args: - dataset (Dataset): - The argument that will provide the dataset splits. It can either - be a dictionary with the splits, or the dataset object which can - generate the splits based on different restrictions. - split_id (int: default=0): - split id to fit on. - pipeline_config (Optional[Configuration]): - configuration to fit the pipeline with. If None, - uses default + configuration (Configuration): + configuration to fit the pipeline with. + dataset (BaseDataset): + An object of the appropriate child class of `BaseDataset`, + that will be used to fit the pipeline + X_train, y_train, X_test, y_test: Union[np.ndarray, List, pd.DataFrame] + A pair of features (X_train) and targets (y_train) used to fit a + pipeline. Additionally, a holdout of this pairs (X_test, y_test) can + be provided to track the generalization performance of each stage. + dataset_name (Optional[str]): + Name of the dataset, if None, random value is used. + resampling_strategy (Optional[RESAMPLING_STRATEGIES]): + Strategy to split the training data. if None, uses + HoldoutValTypes.holdout_validation. + resampling_strategy_args (Optional[Dict[str, Any]]): + Arguments required for the chosen resampling strategy. If None, uses + the default values provided in DEFAULT_RESAMPLING_PARAMETERS + in ```datasets/resampling_strategy.py```. 
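For the resampling arguments described above, a hedged illustration; the argument keys (`val_share`, `num_splits`) are assumptions about what `DEFAULT_RESAMPLING_PARAMETERS` accepts and should be verified against `datasets/resampling_strategy.py`:

```python
# Illustrative sketch: pick a resampling strategy plus its arguments.
from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes

# Default-style holdout split (the 'val_share' key is an assumption).
holdout_strategy = HoldoutValTypes.holdout_validation
holdout_args = {'val_share': 0.25}

# 5-fold cross validation (the 'num_splits' key is an assumption).
cv_strategy = CrossValTypes.k_fold_cross_validation
cv_args = {'num_splits': 5}
```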
+ dataset_name (Optional[str]): + name of the dataset, used as experiment name. + run_time_limit_secs (int: default=60): + Time limit for a single call to the machine learning model. + Model fitting will be terminated if the machine learning algorithm + runs over the time limit. Set this value high enough so that + typical machine learning algorithms can be fit on the training + data. + memory_limit (Optional[int]): + Memory limit in MB for the machine learning algorithm. autopytorch + will stop fitting the machine learning algorithm if it tries + to allocate more than memory_limit MB. If None is provided, + no memory limit is set. In case of multi-processing, memory_limit + will be per job. This memory limit also applies to the ensemble + creation process. + eval_metric (Optional[str]): + Name of the metric that is used to evaluate a pipeline. + all_supported_metrics (bool: default=True): + if True, all metrics supporting current task will be calculated + for each pipeline and results will be available via cv_results + budget_type (str): + Type of budget to be used when fitting the pipeline. + It can be one of: + + + `epochs`: The training of each pipeline will be terminated after + a number of epochs have passed. This number of epochs is determined by the + budget argument of this method. + + `runtime`: The training of each pipeline will be terminated after + a number of seconds have passed. This number of seconds is determined by the + budget argument of this method. The overall fitting time of a pipeline is + controlled by func_eval_time_limit_secs. 'runtime' only controls the allocated + time to train a pipeline, but it does not consider the overall time it takes + to create a pipeline (data loading and preprocessing, other i/o operations, etc.). + include_components (Optional[Dict[str, Any]]): + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. + exclude_components (Optional[Dict[str, Any]]): + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. + search_space_updates(Optional[HyperparameterSearchSpaceUpdates]): + Updates to be made to the hyperparameter search space of the pipeline + budget (Optional[float]): + Budget to fit a single run of the pipeline. If not + provided, uses the default in the pipeline config + pipeline_options (Optional[Dict]): + Valid config options include "device", + "torch_num_threads", "early_stopping", "use_tensorboard_logger", + "metrics_during_training" + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): + Used as a list to pass more fine-grained + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + + + `y_optimization`: + do not save the predictions for the optimization set, + which would later on be used to build an ensemble. Note that SMAC + optimizes a metric evaluated on the optimization set. + + `pipeline`: + do not save any individual pipeline files + + `pipelines`: + In case of cross validation, disables saving the joint model of the + pipelines fit on each fold. + + `y_test`: + do not save the predictions for the test set. + + `all`: + do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. 
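Pulling the arguments above together, a minimal, hedged sketch of calling the new `fit_pipeline` method; sampling a `Configuration` via `get_search_space(...).sample_configuration()` and the chosen budget values are assumptions made for illustration:

```python
# Hedged sketch: fit a single configuration outside the SMAC loop with fit_pipeline.
import numpy as np

from autoPyTorch.api.tabular_classification import TabularClassificationTask

X_train = np.random.rand(200, 5)
y_train = np.random.randint(0, 2, size=200)

api = TabularClassificationTask()
dataset = api.get_dataset(X_train=X_train, y_train=y_train)

# Assumption: a Configuration can be sampled from the task's search space.
configuration = api.get_search_space(dataset).sample_configuration()

pipeline, run_info, run_value, dataset = api.fit_pipeline(
    configuration,
    dataset=dataset,
    run_time_limit_secs=60,
    budget_type='epochs',
    budget=5.0,
)
print(run_value.status, run_value.cost)
```

If the run does not finish successfully, or pipeline file output is disabled, the returned pipeline is `None`, as `_get_fitted_pipeline` below describes.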
Returns: - BasePipeline: + (BasePipeline): fitted pipeline + (RunInfo): + Run information + (RunValue): + Result of fitting the pipeline + (BaseDataset): + Dataset created from the given tensors """ - self.dataset_name = dataset.dataset_name - if self._logger is None: - self._logger = self._get_logger(str(self.dataset_name)) + if dataset is None: + if ( + X_train is not None + and y_train is not None + ): + raise ValueError("No dataset provided, must provide X_train, y_train tensors") + dataset = self.get_dataset(X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + dataset_name=dataset_name + ) + + # dataset_name is created inside the constructor of BaseDataset + # we expect it to be not None. This is for mypy + assert dataset.dataset_name is not None + + # TAE expects each configuration to have a config_id. + # For fitting a pipeline as it is not part of the + # search process, it makes sense to set it to 0 + configuration.__setattr__('config_id', 0) # get dataset properties dataset_requirements = get_dataset_requirements( @@ -1261,21 +1536,116 @@ def fit(self, dataset_properties = dataset.get_dataset_properties(dataset_requirements) self._backend.save_datamanager(dataset) - # build pipeline - pipeline = self.build_pipeline(dataset_properties) - if pipeline_config is not None: - pipeline.set_hyperparameters(pipeline_config) + if self._logger is None: + self._logger = self._get_logger(dataset.dataset_name) + + include_components = self.include_components if include_components is None else include_components + exclude_components = self.exclude_components if exclude_components is None else exclude_components + search_space_updates = self.search_space_updates if search_space_updates is None else search_space_updates + + scenario_mock = unittest.mock.Mock() + scenario_mock.wallclock_limit = run_time_limit_secs + # This stats object is a hack - maybe the SMAC stats object should + # already be generated here! 
+ stats = Stats(scenario_mock) + + if memory_limit is None and getattr(self, '_memory_limit', None) is not None: + memory_limit = self._memory_limit + + metric = get_metrics(dataset_properties=dataset_properties, + names=[eval_metric] if eval_metric is not None else None, + all_supported_metrics=False).pop() + + pipeline_options = self.pipeline_options.copy().update(pipeline_options) if pipeline_options is not None \ + else self.pipeline_options.copy() + + assert pipeline_options is not None + + if budget_type is not None: + pipeline_options.update({'budget_type': budget_type}) + else: + budget_type = pipeline_options['budget_type'] + + budget = budget if budget is not None else pipeline_options[budget_type] + + if disable_file_output is None: + disable_file_output = getattr(self, '_disable_file_output', []) + + stats.start_timing() + + tae = ExecuteTaFuncWithQueue( + backend=self._backend, + seed=self.seed, + metric=metric, + multi_objectives=["cost"], + logger_port=self._logger_port, + cost_for_crash=get_cost_of_crash(metric), + abort_on_first_run_crash=False, + initial_num_run=self._backend.get_next_num_run(), + stats=stats, + memory_limit=memory_limit, + disable_file_output=disable_file_output, + all_supported_metrics=all_supported_metrics, + budget_type=budget_type, + include=include_components, + exclude=exclude_components, + search_space_updates=search_space_updates, + pipeline_config=pipeline_options, + pynisher_context=self._multiprocessing_context + ) - # initialise fit dictionary - X = self._get_fit_dictionary( - dataset_properties=dataset_properties, - dataset=dataset, - split_id=split_id) + run_info, run_value = tae.run_wrapper( + RunInfo(config=configuration, + budget=budget, + seed=self.seed, + cutoff=run_time_limit_secs, + capped=False, + instance_specific=None, + instance=None) + ) - fit_and_suppress_warnings(self._logger, pipeline, X, y=None) + fitted_pipeline = self._get_fitted_pipeline( + dataset_name=dataset.dataset_name, + pipeline_idx=run_info.config.config_id + tae.initial_num_run, + run_info=run_info, + run_value=run_value, + disable_file_output=disable_file_output + ) self._clean_logger() - return pipeline + + return fitted_pipeline, run_info, run_value, dataset + + def _get_fitted_pipeline( + self, + dataset_name: str, + pipeline_idx: int, + run_info: RunInfo, + run_value: RunValue, + disable_file_output: List[Union[str, DisableFileOutputParameters]] + ) -> Optional[BasePipeline]: + + if self._logger is None: + self._logger = self._get_logger(str(dataset_name)) + + if run_value.status != StatusType.SUCCESS: + warnings.warn(f"Fitting pipeline failed with status: {run_value.status}" + f", additional_info: {run_value.additional_info}") + return None + elif any(disable_file_output for c in ['all', 'pipeline']): + self._logger.warning("File output is disabled. 
No pipeline can returned") + return None + + if self.resampling_strategy in CrossValTypes: + load_function = self._backend.load_cv_model_by_seed_and_id_and_budget + else: + load_function = self._backend.load_model_by_seed_and_id_and_budget + + return load_function( # type: ignore[no-any-return] + seed=self.seed, + idx=pipeline_idx, + budget=float(run_info.budget), + ) def predict( self, @@ -1306,7 +1676,7 @@ def predict( # Mypy assert assert self.ensemble_ is not None, "Load models should error out if no ensemble" - if isinstance(self.resampling_strategy, HoldoutValTypes): + if isinstance(self.resampling_strategy, (HoldoutValTypes, NoResamplingStrategyTypes)): models = self.models_ elif isinstance(self.resampling_strategy, CrossValTypes): models = self.cv_models_ @@ -1479,3 +1849,59 @@ def sprint_statistics(self) -> str: scoring_functions=self._scoring_functions, metric=self._metric ) + + def plot_perf_over_time( + self, + metric_name: str, + ax: Optional[plt.Axes] = None, + plot_setting_params: PlotSettingParams = PlotSettingParams(), + color_label_settings: ColorLabelSettings = ColorLabelSettings(), + *args: Any, + **kwargs: Any + ) -> None: + """ + Visualize the performance over time using matplotlib. + The plot related arguments are based on matplotlib. + Please refer to the matplotlib documentation for more details. + + Args: + metric_name (str): + The name of metric to visualize. + The names are available in + * autoPyTorch.metrics.CLASSIFICATION_METRICS + * autoPyTorch.metrics.REGRESSION_METRICS + ax (Optional[plt.Axes]): + axis to plot (subplots of matplotlib). + If None, it will be created automatically. + plot_setting_params (PlotSettingParams): + Parameters for the plot. + color_label_settings (ColorLabelSettings): + The settings of a pair of color and label for each plot. + args, kwargs (Any): + Arguments for the ax.plot. + + Note: + You might need to run `export DISPLAY=:0.0` if you are using non-GUI based environment. + """ + + if not hasattr(metrics, metric_name): + raise ValueError( + f'metric_name must be in {list(metrics.CLASSIFICATION_METRICS.keys())} ' + f'or {list(metrics.REGRESSION_METRICS.keys())}, but got {metric_name}' + ) + if len(self.ensemble_performance_history) == 0: + raise RuntimeError('Visualization is available only after ensembles are evaluated.') + + results = MetricResults( + metric=getattr(metrics, metric_name), + run_history=self.run_history, + ensemble_performance_history=self.ensemble_performance_history + ) + + colors, labels = color_label_settings.extract_dicts(results) + + ResultsVisualizer().plot_perf_over_time( # type: ignore + results=results, plot_setting_params=plot_setting_params, + colors=colors, labels=labels, ax=ax, + *args, **kwargs + ) diff --git a/autoPyTorch/api/results_manager.py b/autoPyTorch/api/results_manager.py deleted file mode 100644 index e52d21613..000000000 --- a/autoPyTorch/api/results_manager.py +++ /dev/null @@ -1,326 +0,0 @@ -import io -from typing import Any, Dict, List, Optional, Tuple, Union - -from ConfigSpace.configuration_space import Configuration - -import numpy as np - -import scipy - -from smac.runhistory.runhistory import RunHistory, RunValue -from smac.tae import StatusType -from smac.utils.io.traj_logging import TrajEntry - -from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric - - -# TODO remove StatusType.RUNNING at some point in the future when the new SMAC 0.13.2 -# is the new minimum required version! 
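Referring back to the `plot_perf_over_time` method added above, a hedged usage sketch; it assumes `api` has already finished `search()` (so an ensemble history exists), and the `PlotSettingParams` fields shown (`xscale`, `xlabel`, `ylabel`) are assumptions about that settings object:

```python
# Hedged sketch for the plot_perf_over_time API added above.
import matplotlib.pyplot as plt

from autoPyTorch.utils.results_visualizer import PlotSettingParams

params = PlotSettingParams(xscale='log', xlabel='wall-clock time [s]', ylabel='accuracy')
_, ax = plt.subplots()
api.plot_perf_over_time(metric_name='accuracy', ax=ax, plot_setting_params=params)
plt.savefig('perf_over_time.png')
```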
-STATUS2MSG = { - StatusType.SUCCESS: 'Success', - StatusType.DONOTADVANCE: 'Success (but did not advance to higher budget)', - StatusType.TIMEOUT: 'Timeout', - StatusType.CRASHED: 'Crash', - StatusType.ABORT: 'Abort', - StatusType.MEMOUT: 'Memory out' -} - - -def cost2metric(cost: float, metric: autoPyTorchMetric) -> float: - """ - Revert cost metric evaluated in SMAC to the original metric. - - The conversion is defined in: - autoPyTorch/pipeline/components/training/metrics/utils.py::calculate_loss - cost = metric._optimum - metric._sign * original_metric_value - ==> original_metric_value = metric._sign * (metric._optimum - cost) - """ - return metric._sign * (metric._optimum - cost) - - -def _extract_metrics_info( - run_value: RunValue, - scoring_functions: List[autoPyTorchMetric] -) -> Dict[str, float]: - """ - Extract the metric information given a run_value - and a list of metrics of interest. - - Args: - run_value (RunValue): - The information for each config evaluation. - scoring_functions (List[autoPyTorchMetric]): - The list of metrics to retrieve the info. - """ - - if run_value.status not in (StatusType.SUCCESS, StatusType.DONOTADVANCE): - # Additional info for metrics is not available in this case. - return {metric.name: np.nan for metric in scoring_functions} - - cost_info = run_value.additional_info['opt_loss'] - avail_metrics = cost_info.keys() - - return { - metric.name: cost2metric(cost=cost_info[metric.name], metric=metric) - if metric.name in avail_metrics else np.nan - for metric in scoring_functions - } - - -class SearchResults: - def __init__( - self, - metric: autoPyTorchMetric, - scoring_functions: List[autoPyTorchMetric], - run_history: RunHistory - ): - self.metric_dict: Dict[str, List[float]] = { - metric.name: [] - for metric in scoring_functions - } - self._opt_scores: List[float] = [] - self._fit_times: List[float] = [] - self.configs: List[Configuration] = [] - self.status_types: List[str] = [] - self.budgets: List[float] = [] - self.config_ids: List[int] = [] - self.is_traditionals: List[bool] = [] - self.additional_infos: List[Optional[Dict[str, Any]]] = [] - self.rank_test_scores: np.ndarray = np.array([]) - self._scoring_functions = scoring_functions - self._metric = metric - - self._extract_results_from_run_history(run_history) - - @property - def opt_scores(self) -> np.ndarray: - return np.asarray(self._opt_scores) - - @property - def fit_times(self) -> np.ndarray: - return np.asarray(self._fit_times) - - def update( - self, - config: Configuration, - status: str, - budget: float, - fit_time: float, - config_id: int, - is_traditional: bool, - additional_info: Dict[str, Any], - score: float, - metric_info: Dict[str, float] - ) -> None: - - self.status_types.append(status) - self.configs.append(config) - self.budgets.append(budget) - self.config_ids.append(config_id) - self.is_traditionals.append(is_traditional) - self.additional_infos.append(additional_info) - self._fit_times.append(fit_time) - self._opt_scores.append(score) - - for metric_name, val in metric_info.items(): - self.metric_dict[metric_name].append(val) - - def clear(self) -> None: - self._opt_scores = [] - self._fit_times = [] - self.configs = [] - self.status_types = [] - self.budgets = [] - self.config_ids = [] - self.additional_infos = [] - self.is_traditionals = [] - self.rank_test_scores = np.array([]) - - def _extract_results_from_run_history(self, run_history: RunHistory) -> None: - """ - Extract the information to match this class format. 
- - Args: - run_history (RunHistory): - The history of config evals from SMAC. - """ - - self.clear() # Delete cache before the extraction - - for run_key, run_value in run_history.data.items(): - config_id = run_key.config_id - config = run_history.ids_config[config_id] - - status_msg = STATUS2MSG.get(run_value.status, None) - if run_value.status in (StatusType.STOP, StatusType.RUNNING): - continue - elif status_msg is None: - raise ValueError(f'Unexpected run status: {run_value.status}') - - is_traditional = False # If run is not successful, unsure ==> not True ==> False - if run_value.additional_info is not None: - is_traditional = run_value.additional_info['configuration_origin'] == 'traditional' - - self.update( - status=status_msg, - config=config, - budget=run_key.budget, - fit_time=run_value.time, - score=cost2metric(cost=run_value.cost, metric=self._metric), - metric_info=_extract_metrics_info(run_value=run_value, scoring_functions=self._scoring_functions), - is_traditional=is_traditional, - additional_info=run_value.additional_info, - config_id=config_id - ) - - self.rank_test_scores = scipy.stats.rankdata( - -1 * self._metric._sign * self.opt_scores, # rank order - method='min' - ) - - -class ResultsManager: - def __init__(self, *args: Any, **kwargs: Any): - """ - Attributes: - run_history (RunHistory): - A `SMAC Runshistory `_ - object that holds information about the runs of the target algorithm made during search - ensemble_performance_history (List[Dict[str, Any]]): - The list of ensemble performance in the optimization. - The list includes the `timestamp`, `result on train set`, and `result on test set` - trajectory (List[TrajEntry]): - A list of all incumbent configurations during search - """ - self.run_history: RunHistory = RunHistory() - self.ensemble_performance_history: List[Dict[str, Any]] = [] - self.trajectory: List[TrajEntry] = [] - - def _check_run_history(self) -> None: - if self.run_history is None: - raise RuntimeError("No Run History found, search has not been called.") - - if self.run_history.empty(): - raise RuntimeError("Run History is empty. Something went wrong, " - "SMAC was not able to fit any model?") - - def get_incumbent_results( - self, - metric: autoPyTorchMetric, - include_traditional: bool = False - ) -> Tuple[Configuration, Dict[str, Union[int, str, float]]]: - """ - Get Incumbent config and the corresponding results - - Args: - metric (autoPyTorchMetric): - A metric that is evaluated when searching with fit AutoPytorch. - include_traditional (bool): - Whether to include results from tradtional pipelines - - Returns: - Configuration (CS.ConfigurationSpace): - The incumbent configuration - Dict[str, Union[int, str, float]]: - Additional information about the run of the incumbent configuration. 
- """ - self._check_run_history() - - results = SearchResults(metric=metric, scoring_functions=[], run_history=self.run_history) - - if not include_traditional: - non_traditional = ~np.array(results.is_traditionals) - scores = results.opt_scores[non_traditional] - indices = np.arange(len(results.configs))[non_traditional] - else: - scores = results.opt_scores - indices = np.arange(len(results.configs)) - - incumbent_idx = indices[np.nanargmax(metric._sign * scores)] - incumbent_config = results.configs[incumbent_idx] - incumbent_results = results.additional_infos[incumbent_idx] - - assert incumbent_results is not None # mypy check - return incumbent_config, incumbent_results - - def get_search_results( - self, - scoring_functions: List[autoPyTorchMetric], - metric: autoPyTorchMetric - ) -> SearchResults: - """ - This attribute is populated with data from `self.run_history` - and contains information about the configurations, and their - corresponding metric results, status of run, parameters and - the budget - - Args: - scoring_functions (List[autoPyTorchMetric]): - Metrics to show in the results. - metric (autoPyTorchMetric): - A metric that is evaluated when searching with fit AutoPytorch. - - Returns: - SearchResults: - An instance that contains the results from search - """ - self._check_run_history() - return SearchResults(metric=metric, scoring_functions=scoring_functions, run_history=self.run_history) - - def sprint_statistics( - self, - dataset_name: str, - scoring_functions: List[autoPyTorchMetric], - metric: autoPyTorchMetric - ) -> str: - """ - Prints statistics about the SMAC search. - - These statistics include: - - 1. Optimisation Metric - 2. Best Optimisation score achieved by individual pipelines - 3. Total number of target algorithm runs - 4. Total number of successful target algorithm runs - 5. Total number of crashed target algorithm runs - 6. Total number of target algorithm runs that exceeded the time limit - 7. Total number of successful target algorithm runs that exceeded the memory limit - - Args: - dataset_name (str): - The dataset name that was used in the run. - scoring_functions (List[autoPyTorchMetric]): - Metrics to show in the results. - metric (autoPyTorchMetric): - A metric that is evaluated when searching with fit AutoPytorch. 
- - Returns: - (str): - Formatted string with statistics - """ - search_results = self.get_search_results(scoring_functions, metric) - success_msgs = (STATUS2MSG[StatusType.SUCCESS], STATUS2MSG[StatusType.DONOTADVANCE]) - sio = io.StringIO() - sio.write("autoPyTorch results:\n") - sio.write(f"\tDataset name: {dataset_name}\n") - sio.write(f"\tOptimisation Metric: {metric}\n") - - num_runs = len(search_results.status_types) - num_success = sum([s in success_msgs for s in search_results.status_types]) - num_crash = sum([s == STATUS2MSG[StatusType.CRASHED] for s in search_results.status_types]) - num_timeout = sum([s == STATUS2MSG[StatusType.TIMEOUT] for s in search_results.status_types]) - num_memout = sum([s == STATUS2MSG[StatusType.MEMOUT] for s in search_results.status_types]) - - if num_success > 0: - best_score = metric._sign * np.nanmax(metric._sign * search_results.opt_scores) - sio.write(f"\tBest validation score: {best_score}\n") - - sio.write(f"\tNumber of target algorithm runs: {num_runs}\n") - sio.write(f"\tNumber of successful target algorithm runs: {num_success}\n") - sio.write(f"\tNumber of crashed target algorithm runs: {num_crash}\n") - sio.write(f"\tNumber of target algorithms that exceeded the time " - f"limit: {num_timeout}\n") - sio.write(f"\tNumber of target algorithms that exceeded the memory " - f"limit: {num_memout}\n") - - return sio.getvalue() diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index d83f1dc01..684c22a7b 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -1,6 +1,4 @@ -import os -import uuid -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union import numpy as np @@ -13,11 +11,16 @@ TASK_TYPES_TO_STRING, ) from autoPyTorch.data.tabular_validator import TabularInputValidator +from autoPyTorch.data.utils import ( + get_dataset_compression_mapping +) +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.datasets.resampling_strategy import ( - CrossValTypes, HoldoutValTypes, + ResamplingStrategies, ) from autoPyTorch.datasets.tabular_dataset import TabularDataset +from autoPyTorch.evaluation.utils import DisableFileOutputParameters from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -54,15 +57,25 @@ class TabularClassificationTask(BaseTask): delete_tmp_folder_after_terminate (bool): Determines whether to delete the temporary directory, when finished - include_components (Optional[Dict]): - If None, all possible components are used. - Otherwise specifies set of components to use. - exclude_components (Optional[Dict]): - If None, all possible components are used. - Otherwise specifies set of components not to use. - Incompatible with include components. + include_components (Optional[Dict[str, Any]]): + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. + exclude_components (Optional[Dict[str, Any]]): + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. 
+ resampling_strategy resampling_strategy (RESAMPLING_STRATEGIES), + (default=HoldoutValTypes.holdout_validation): + strategy to split the training data. + resampling_strategy_args (Optional[Dict[str, Any]]): arguments + required for the chosen resampling strategy. If None, uses + the default values provided in DEFAULT_RESAMPLING_PARAMETERS + in ```datasets/resampling_strategy.py```. search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): - search space updates that can be used to modify the search + Search space updates that can be used to modify the search space of particular components or choice modules of the pipeline """ def __init__( @@ -78,9 +91,9 @@ def __init__( output_directory: Optional[str] = None, delete_tmp_folder_after_terminate: bool = True, delete_output_folder_after_terminate: bool = True, - include_components: Optional[Dict] = None, - exclude_components: Optional[Dict] = None, - resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, + resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, backend: Optional[Backend] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None @@ -106,18 +119,111 @@ def __init__( task_type=TASK_TYPES_TO_STRING[TABULAR_CLASSIFICATION], ) - def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularClassificationPipeline: + def build_pipeline( + self, + dataset_properties: Dict[str, BaseDatasetPropertiesType], + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + ) -> TabularClassificationPipeline: """ - Build pipeline according to current task and for the passed dataset properties + Build pipeline according to current task + and for the passed dataset properties Args: - dataset_properties (Dict[str,Any]) + dataset_properties (Dict[str, Any]): + Characteristics of the dataset to guide the pipeline + choices of components + include_components (Optional[Dict[str, Any]]): + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. + exclude_components (Optional[Dict[str, Any]]): + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. 
+ search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): + Search space updates that can be used to modify the search + space of particular components or choice modules of the pipeline + + Returns: + TabularClassificationPipeline + + """ + return TabularClassificationPipeline(dataset_properties=dataset_properties, + include=include_components, + exclude=exclude_components, + search_space_updates=search_space_updates) + + def _get_dataset_input_validator( + self, + X_train: Union[List, pd.DataFrame, np.ndarray], + y_train: Union[List, pd.DataFrame, np.ndarray], + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + resampling_strategy: Optional[ResamplingStrategies] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + dataset_name: Optional[str] = None, + dataset_compression: Optional[Mapping[str, Any]] = None, + ) -> Tuple[TabularDataset, TabularInputValidator]: + """ + Returns an object of `TabularDataset` and an object of + `TabularInputValidator` according to the current task. + Args: + X_train (Union[List, pd.DataFrame, np.ndarray]): + Training feature set. + y_train (Union[List, pd.DataFrame, np.ndarray]): + Training target set. + X_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing feature set + y_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing target set + resampling_strategy (Optional[RESAMPLING_STRATEGIES]): + Strategy to split the training data. if None, uses + HoldoutValTypes.holdout_validation. + resampling_strategy_args (Optional[Dict[str, Any]]): + arguments required for the chosen resampling strategy. If None, uses + the default values provided in DEFAULT_RESAMPLING_PARAMETERS + in ```datasets/resampling_strategy.py```. + dataset_name (Optional[str]): + name of the dataset, used as experiment name. Returns: - TabularClassificationPipeline: - Pipeline compatible with the given dataset properties. + TabularDataset: + the dataset object. + TabularInputValidator: + the input validator fitted on the data. 
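build_pipeline now threads the include/exclude dictionaries and search space updates straight into the pipeline. A sketch of preparing an updates object, assuming an append(node_name, hyperparameter, value_range, default_value) interface on HyperparameterSearchSpaceUpdates; the exact signature is not part of this patch:

.. code-block:: python

    from autoPyTorch.utils.hyperparameter_search_space_update import (
        HyperparameterSearchSpaceUpdates,
    )

    # Narrow the batch size searched by the data loader node. The keyword
    # names below (node_name/hyperparameter/value_range/default_value) are
    # an assumption about the updates API, not something defined in this diff.
    updates = HyperparameterSearchSpaceUpdates()
    updates.append(
        node_name='data_loader',
        hyperparameter='batch_size',
        value_range=[32, 256],
        default_value=64,
    )

    # The object can then be handed to the estimator constructor, or passed
    # directly as build_pipeline(..., search_space_updates=updates).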
""" - return TabularClassificationPipeline(dataset_properties=dataset_properties) + + resampling_strategy = resampling_strategy if resampling_strategy is not None else self.resampling_strategy + resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \ + self.resampling_strategy_args + + # Create a validator object to make sure that the data provided by + # the user matches the autopytorch requirements + input_validator = TabularInputValidator( + is_classification=True, + logger_port=self._logger_port, + dataset_compression=dataset_compression + ) + + # Fit a input validator to check the provided data + # Also, an encoder is fit to both train and test data, + # to prevent unseen categories during inference + input_validator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) + + dataset = TabularDataset( + X=X_train, Y=y_train, + X_test=X_test, Y_test=y_test, + validator=input_validator, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + dataset_name=dataset_name + ) + + return dataset, input_validator def search( self, @@ -133,14 +239,15 @@ def search( total_walltime_limit: int = 100, func_eval_time_limit_secs: Optional[int] = None, enable_traditional_pipeline: bool = True, - memory_limit: Optional[int] = 4096, + memory_limit: int = 4096, smac_scenario_args: Optional[Dict[str, Any]] = None, get_smac_object_callback: Optional[Callable] = None, all_supported_metrics: bool = True, precision: int = 32, - disable_file_output: List = [], + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, load_models: bool = True, portfolio_selection: Optional[str] = None, + dataset_compression: Union[Mapping[str, Any], bool] = False, ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. @@ -209,7 +316,7 @@ def search( feature by turning this flag to False. All machine learning algorithms that are fitted during search() are considered for ensemble building. - memory_limit (Optional[int]: default=4096): + memory_limit (int: default=4096): Memory limit in MB for the machine learning algorithm. Autopytorch will stop fitting the machine learning algorithm if it tries to allocate more than memory_limit MB. If None @@ -237,10 +344,10 @@ def search( precision (int: default=32): Numeric precision used when loading ensemble data. Can be either '16', '32' or '64'. - disable_file_output (Union[bool, List]): - If True, disable model and prediction output. - Can also be used as a list to pass more fine-grained - information on what to save. Allowed elements in the list are: + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): + Used as a list to pass more fine-grained + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + `y_optimization`: do not save the predictions for the optimization set, @@ -253,6 +360,9 @@ def search( pipelines fit on each fold. + `y_test`: do not save the predictions for the test set. + + `all`: + do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. load_models (bool: default=True): Whether to load the models after fitting AutoPyTorch. portfolio_selection (Optional[str]): @@ -264,37 +374,52 @@ def search( Additionally, the keyword 'greedy' is supported, which would use the default portfolio from `AutoPyTorch Tabular `_. 
+ dataset_compression: Union[bool, Mapping[str, Any]] = True + We compress datasets so that they fit into some predefined amount of memory. + **NOTE** + + Default configuration when left as ``True``: + .. code-block:: python + { + "memory_allocation": 0.1, + "methods": ["precision"] + } + You can also pass your own configuration with the same keys and choosing + from the available ``"methods"``. + The available options are described here: + **memory_allocation** + By default, we attempt to fit the dataset into ``0.1 * memory_limit``. This + float value can be set with ``"memory_allocation": 0.1``. We also allow for + specifying absolute memory in MB, e.g. 10MB is ``"memory_allocation": 10``. + The memory used by the dataset is checked after each reduction method is + performed. If the dataset fits into the allocated memory, any further methods + listed in ``"methods"`` will not be performed. + + **methods** + We currently provide the following methods for reducing the dataset size. + These can be provided in a list and are performed in the order as given. + * ``"precision"`` - We reduce floating point precision as follows: + * ``np.float128 -> np.float64`` + * ``np.float96 -> np.float64`` + * ``np.float64 -> np.float32`` + * pandas dataframes are reduced using the downcast option of `pd.to_numeric` + to the lowest possible precision. Returns: self """ - if dataset_name is None: - dataset_name = str(uuid.uuid1(clock_seq=os.getpid())) - - # we have to create a logger for at this point for the validator - self._logger = self._get_logger(dataset_name) + self._dataset_compression = get_dataset_compression_mapping(memory_limit, dataset_compression) - # Create a validator object to make sure that the data provided by - # the user matches the autopytorch requirements - self.InputValidator = TabularInputValidator( - is_classification=True, - logger_port=self._logger_port, - ) - - # Fit a input validator to check the provided data - # Also, an encoder is fit to both train and test data, - # to prevent unseen categories during inference - self.InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) - - self.dataset = TabularDataset( - X=X_train, Y=y_train, - X_test=X_test, Y_test=y_test, - validator=self.InputValidator, - dataset_name=dataset_name, + self.dataset, self.input_validator = self._get_dataset_input_validator( + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, resampling_strategy=self.resampling_strategy, resampling_strategy_args=self.resampling_strategy_args, - ) + dataset_name=dataset_name, + dataset_compression=self._dataset_compression) return self._search( dataset=self.dataset, @@ -331,28 +456,28 @@ def predict( Returns: Array with estimator predictions. """ - if self.InputValidator is None or not self.InputValidator._is_fitted: + if self.input_validator is None or not self.input_validator._is_fitted: raise ValueError("predict() is only supported after calling search. 
Kindly call first " - "the estimator fit() method.") + "the estimator search() method.") - X_test = self.InputValidator.feature_validator.transform(X_test) + X_test = self.input_validator.feature_validator.transform(X_test) predicted_probabilities = super().predict(X_test, batch_size=batch_size, n_jobs=n_jobs) - if self.InputValidator.target_validator.is_single_column_target(): + if self.input_validator.target_validator.is_single_column_target(): predicted_indexes = np.argmax(predicted_probabilities, axis=1) else: predicted_indexes = (predicted_probabilities > 0.5).astype(int) # Allow to predict in the original domain -- that is, the user is not interested # in our encoded values - return self.InputValidator.target_validator.inverse_transform(predicted_indexes) + return self.input_validator.target_validator.inverse_transform(predicted_indexes) def predict_proba(self, X_test: Union[np.ndarray, pd.DataFrame, List], batch_size: Optional[int] = None, n_jobs: int = 1) -> np.ndarray: - if self.InputValidator is None or not self.InputValidator._is_fitted: + if self.input_validator is None or not self.input_validator._is_fitted: raise ValueError("predict() is only supported after calling search. Kindly call first " - "the estimator fit() method.") - X_test = self.InputValidator.feature_validator.transform(X_test) + "the estimator search() method.") + X_test = self.input_validator.feature_validator.transform(X_test) return super().predict(X_test, batch_size=batch_size, n_jobs=n_jobs) diff --git a/autoPyTorch/api/tabular_regression.py b/autoPyTorch/api/tabular_regression.py index a68990732..d766bad68 100644 --- a/autoPyTorch/api/tabular_regression.py +++ b/autoPyTorch/api/tabular_regression.py @@ -1,6 +1,4 @@ -import os -import uuid -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union import numpy as np @@ -13,11 +11,16 @@ TASK_TYPES_TO_STRING ) from autoPyTorch.data.tabular_validator import TabularInputValidator +from autoPyTorch.data.utils import ( + get_dataset_compression_mapping +) +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.datasets.resampling_strategy import ( - CrossValTypes, HoldoutValTypes, + ResamplingStrategies, ) from autoPyTorch.datasets.tabular_dataset import TabularDataset +from autoPyTorch.evaluation.utils import DisableFileOutputParameters from autoPyTorch.pipeline.tabular_regression import TabularRegressionPipeline from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -54,15 +57,25 @@ class TabularRegressionTask(BaseTask): delete_tmp_folder_after_terminate (bool): Determines whether to delete the temporary directory, when finished - include_components (Optional[Dict]): - If None, all possible components are used. - Otherwise specifies set of components to use. - exclude_components (Optional[Dict]): - If None, all possible components are used. - Otherwise specifies set of components not to use. - Incompatible with include components. + include_components (Optional[Dict[str, Any]]): + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. + exclude_components (Optional[Dict[str, Any]]): + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. 
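The corrected error messages above now point users to search() rather than fit(); an end-to-end sketch of that flow for the classification task, with the data, metric name, and limits being purely illustrative:

.. code-block:: python

    import numpy as np

    from autoPyTorch.api.tabular_classification import TabularClassificationTask

    # Illustrative data; any List/DataFrame/ndarray accepted by the validators works.
    X = np.random.rand(200, 4)
    y = np.random.randint(0, 2, size=200)

    api = TabularClassificationTask()
    api.search(
        X_train=X, y_train=y,
        optimize_metric='accuracy',       # illustrative metric name
        total_walltime_limit=300,
        func_eval_time_limit_secs=50,
        memory_limit=4096,
    )

    # Only valid after search(): class probabilities from predict_proba(),
    # labels (inverse-transformed to the original domain) from predict().
    probabilities = api.predict_proba(X)
    labels = api.predict(X)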
All except these components will be present in + the search space. + resampling_strategy resampling_strategy (RESAMPLING_STRATEGIES), + (default=HoldoutValTypes.holdout_validation): + strategy to split the training data. + resampling_strategy_args (Optional[Dict[str, Any]]): arguments + required for the chosen resampling strategy. If None, uses + the default values provided in DEFAULT_RESAMPLING_PARAMETERS + in ```datasets/resampling_strategy.py```. search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): - search space updates that can be used to modify the search + Search space updates that can be used to modify the search space of particular components or choice modules of the pipeline """ @@ -79,9 +92,9 @@ def __init__( output_directory: Optional[str] = None, delete_tmp_folder_after_terminate: bool = True, delete_output_folder_after_terminate: bool = True, - include_components: Optional[Dict] = None, - exclude_components: Optional[Dict] = None, - resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, + resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, backend: Optional[Backend] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None @@ -107,18 +120,111 @@ def __init__( task_type=TASK_TYPES_TO_STRING[TABULAR_REGRESSION], ) - def build_pipeline(self, dataset_properties: Dict[str, Any]) -> TabularRegressionPipeline: + def build_pipeline( + self, + dataset_properties: Dict[str, BaseDatasetPropertiesType], + include_components: Optional[Dict[str, Any]] = None, + exclude_components: Optional[Dict[str, Any]] = None, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + ) -> TabularRegressionPipeline: """ - Build pipeline according to current task and for the passed dataset properties + Build pipeline according to current task + and for the passed dataset properties Args: - dataset_properties (Dict[str,Any]) + dataset_properties (Dict[str, Any]): + Characteristics of the dataset to guide the pipeline + choices of components + include_components (Optional[Dict[str, Any]]): + Dictionary containing components to include. Key is the node + name and Value is an Iterable of the names of the components + to include. Only these components will be present in the + search space. + exclude_components (Optional[Dict[str, Any]]): + Dictionary containing components to exclude. Key is the node + name and Value is an Iterable of the names of the components + to exclude. All except these components will be present in + the search space. + search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): + Search space updates that can be used to modify the search + space of particular components or choice modules of the pipeline Returns: TabularRegressionPipeline: - Pipeline compatible with the given dataset properties. 
+ """ - return TabularRegressionPipeline(dataset_properties=dataset_properties) + return TabularRegressionPipeline(dataset_properties=dataset_properties, + include=include_components, + exclude=exclude_components, + search_space_updates=search_space_updates) + + def _get_dataset_input_validator( + self, + X_train: Union[List, pd.DataFrame, np.ndarray], + y_train: Union[List, pd.DataFrame, np.ndarray], + X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None, + resampling_strategy: Optional[ResamplingStrategies] = None, + resampling_strategy_args: Optional[Dict[str, Any]] = None, + dataset_name: Optional[str] = None, + dataset_compression: Optional[Mapping[str, Any]] = None, + ) -> Tuple[TabularDataset, TabularInputValidator]: + """ + Returns an object of `TabularDataset` and an object of + `TabularInputValidator` according to the current task. + + Args: + X_train (Union[List, pd.DataFrame, np.ndarray]): + Training feature set. + y_train (Union[List, pd.DataFrame, np.ndarray]): + Training target set. + X_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing feature set + y_test (Optional[Union[List, pd.DataFrame, np.ndarray]]): + Testing target set + resampling_strategy (Optional[RESAMPLING_STRATEGIES]): + Strategy to split the training data. if None, uses + HoldoutValTypes.holdout_validation. + resampling_strategy_args (Optional[Dict[str, Any]]): + arguments required for the chosen resampling strategy. If None, uses + the default values provided in DEFAULT_RESAMPLING_PARAMETERS + in ```datasets/resampling_strategy.py```. + dataset_name (Optional[str]): + name of the dataset, used as experiment name. + Returns: + TabularDataset: + the dataset object. + TabularInputValidator: + the input validator fitted on the data. 
+ """ + + resampling_strategy = resampling_strategy if resampling_strategy is not None else self.resampling_strategy + resampling_strategy_args = resampling_strategy_args if resampling_strategy_args is not None else \ + self.resampling_strategy_args + + # Create a validator object to make sure that the data provided by + # the user matches the autopytorch requirements + input_validator = TabularInputValidator( + is_classification=False, + logger_port=self._logger_port, + dataset_compression=dataset_compression + ) + + # Fit a input validator to check the provided data + # Also, an encoder is fit to both train and test data, + # to prevent unseen categories during inference + input_validator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) + + dataset = TabularDataset( + X=X_train, Y=y_train, + X_test=X_test, Y_test=y_test, + validator=input_validator, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args, + dataset_name=dataset_name + ) + + return dataset, input_validator def search( self, @@ -134,14 +240,15 @@ def search( total_walltime_limit: int = 100, func_eval_time_limit_secs: Optional[int] = None, enable_traditional_pipeline: bool = True, - memory_limit: Optional[int] = 4096, + memory_limit: int = 4096, smac_scenario_args: Optional[Dict[str, Any]] = None, get_smac_object_callback: Optional[Callable] = None, all_supported_metrics: bool = True, precision: int = 32, - disable_file_output: List = [], + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, load_models: bool = True, portfolio_selection: Optional[str] = None, + dataset_compression: Union[Mapping[str, Any], bool] = False, ) -> 'BaseTask': """ Search for the best pipeline configuration for the given dataset. @@ -155,8 +262,8 @@ def search( A pair of features (X_train) and targets (y_train) used to fit a pipeline. Additionally, a holdout of this pairs (X_test, y_test) can be provided to track the generalization performance of each stage. - optimize_metric (str): name of the metric that is used to - evaluate a pipeline. + optimize_metric (str): + Name of the metric that is used to evaluate a pipeline. budget_type (str): Type of budget to be used when fitting the pipeline. It can be one of: @@ -210,7 +317,7 @@ def search( feature by turning this flag to False. All machine learning algorithms that are fitted during search() are considered for ensemble building. - memory_limit (Optional[int]: default=4096): + memory_limit (int: default=4096): Memory limit in MB for the machine learning algorithm. Autopytorch will stop fitting the machine learning algorithm if it tries to allocate more than memory_limit MB. If None @@ -238,10 +345,10 @@ def search( precision (int: default=32): Numeric precision used when loading ensemble data. Can be either '16', '32' or '64'. - disable_file_output (Union[bool, List]): - If True, disable model and prediction output. - Can also be used as a list to pass more fine-grained - information on what to save. Allowed elements in the list are: + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): + Used as a list to pass more fine-grained + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + `y_optimization`: do not save the predictions for the optimization set, @@ -254,6 +361,9 @@ def search( pipelines fit on each fold. + `y_test`: do not save the predictions for the test set. + + `all`: + do not save any of the above. 
+ For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. load_models (bool: default=True): Whether to load the models after fitting AutoPyTorch. portfolio_selection (Optional[str]): @@ -265,37 +375,53 @@ def search( Additionally, the keyword 'greedy' is supported, which would use the default portfolio from `AutoPyTorch Tabular `_. + dataset_compression: Union[bool, Mapping[str, Any]] = True + We compress datasets so that they fit into some predefined amount of memory. + **NOTE** + + Default configuration when left as ``True``: + .. code-block:: python + { + "memory_allocation": 0.1, + "methods": ["precision"] + } + You can also pass your own configuration with the same keys and choosing + from the available ``"methods"``. + The available options are described here: + **memory_allocation** + By default, we attempt to fit the dataset into ``0.1 * memory_limit``. This + float value can be set with ``"memory_allocation": 0.1``. We also allow for + specifying absolute memory in MB, e.g. 10MB is ``"memory_allocation": 10``. + The memory used by the dataset is checked after each reduction method is + performed. If the dataset fits into the allocated memory, any further methods + listed in ``"methods"`` will not be performed. + + **methods** + We currently provide the following methods for reducing the dataset size. + These can be provided in a list and are performed in the order as given. + * ``"precision"`` - We reduce floating point precision as follows: + * ``np.float128 -> np.float64`` + * ``np.float96 -> np.float64`` + * ``np.float64 -> np.float32`` + * pandas dataframes are reduced using the downcast option of `pd.to_numeric` + to the lowest possible precision. Returns: self """ - if dataset_name is None: - dataset_name = str(uuid.uuid1(clock_seq=os.getpid())) - # we have to create a logger for at this point for the validator - self._logger = self._get_logger(dataset_name) + self._dataset_compression = get_dataset_compression_mapping(memory_limit, dataset_compression) - # Create a validator object to make sure that the data provided by - # the user matches the autopytorch requirements - self.InputValidator = TabularInputValidator( - is_classification=False, - logger_port=self._logger_port, - ) - - # Fit a input validator to check the provided data - # Also, an encoder is fit to both train and test data, - # to prevent unseen categories during inference - self.InputValidator.fit(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test) - - self.dataset = TabularDataset( - X=X_train, Y=y_train, - X_test=X_test, Y_test=y_test, - validator=self.InputValidator, - dataset_name=dataset_name, + self.dataset, self.input_validator = self._get_dataset_input_validator( + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, resampling_strategy=self.resampling_strategy, resampling_strategy_args=self.resampling_strategy_args, - ) + dataset_name=dataset_name, + dataset_compression=self._dataset_compression) return self._search( dataset=self.dataset, @@ -322,14 +448,14 @@ def predict( batch_size: Optional[int] = None, n_jobs: int = 1 ) -> np.ndarray: - if self.InputValidator is None or not self.InputValidator._is_fitted: + if self.input_validator is None or not self.input_validator._is_fitted: raise ValueError("predict() is only supported after calling search. 
Kindly call first " - "the estimator fit() method.") + "the estimator search() method.") - X_test = self.InputValidator.feature_validator.transform(X_test) + X_test = self.input_validator.feature_validator.transform(X_test) predicted_values = super().predict(X_test, batch_size=batch_size, n_jobs=n_jobs) # Allow to predict in the original domain -- that is, the user is not interested # in our encoded values - return self.InputValidator.target_validator.inverse_transform(predicted_values) + return self.input_validator.target_validator.inverse_transform(predicted_values) diff --git a/autoPyTorch/configs/greedy_portfolio.json b/autoPyTorch/configs/greedy_portfolio.json index a8e640a4e..bdcb45401 100644 --- a/autoPyTorch/configs/greedy_portfolio.json +++ b/autoPyTorch/configs/greedy_portfolio.json @@ -1,7 +1,7 @@ [{"data_loader:batch_size": 60, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedMLPBackbone", @@ -31,8 +31,8 @@ "network_backbone:ShapedMLPBackbone:max_dropout": 0.023271935735825866}, {"data_loader:batch_size": 255, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -65,8 +65,8 @@ "network_backbone:ShapedResNetBackbone:max_dropout": 0.7662454727603789}, {"data_loader:batch_size": 165, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -96,8 +96,8 @@ "network_head:fully_connected:units_layer_1": 128}, {"data_loader:batch_size": 299, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -128,8 +128,8 @@ "network_head:fully_connected:units_layer_1": 128}, {"data_loader:batch_size": 183, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -162,8 +162,8 @@ "network_backbone:ShapedResNetBackbone:max_dropout": 0.27204101593048097}, {"data_loader:batch_size": 21, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedMLPBackbone", @@ -191,8 +191,8 @@ "network_head:fully_connected:units_layer_1": 128}, {"data_loader:batch_size": 159, "encoder:__choice__": 
"OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "TruncatedSVD", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedMLPBackbone", @@ -221,8 +221,8 @@ "network_head:fully_connected:units_layer_1": 128}, {"data_loader:batch_size": 442, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "TruncatedSVD", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -254,8 +254,8 @@ "network_head:fully_connected:units_layer_1": 128}, {"data_loader:batch_size": 140, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "TruncatedSVD", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -287,8 +287,8 @@ "network_head:fully_connected:units_layer_1": 128}, {"data_loader:batch_size": 48, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedMLPBackbone", @@ -315,8 +315,8 @@ "network_head:fully_connected:units_layer_1": 128}, {"data_loader:batch_size": 168, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -348,8 +348,8 @@ "network_backbone:ShapedResNetBackbone:max_dropout": 0.8992826006547855}, {"data_loader:batch_size": 21, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedMLPBackbone", @@ -377,8 +377,8 @@ "network_head:fully_connected:units_layer_1": 128}, {"data_loader:batch_size": 163, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -410,8 +410,8 @@ "network_backbone:ShapedResNetBackbone:max_dropout": 0.6341848343636569}, {"data_loader:batch_size": 150, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", @@ -444,8 +444,8 @@ "network_backbone:ShapedResNetBackbone:max_dropout": 0.7133813761319248}, {"data_loader:batch_size": 151, "encoder:__choice__": "OneHotEncoder", + 
"coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "TruncatedSVD", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedMLPBackbone", @@ -474,8 +474,8 @@ "network_head:fully_connected:units_layer_1": 128}, {"data_loader:batch_size": 42, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "TruncatedSVD", - "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", "network_backbone:__choice__": "ShapedResNetBackbone", diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py index 6ef7cae6b..11c6cf577 100644 --- a/autoPyTorch/data/base_feature_validator.py +++ b/autoPyTorch/data/base_feature_validator.py @@ -5,25 +5,14 @@ import pandas as pd -import scipy.sparse +from scipy.sparse import spmatrix from sklearn.base import BaseEstimator from autoPyTorch.utils.logging_ import PicklableClientLogger -SUPPORTED_FEAT_TYPES = Union[ - List, - pd.DataFrame, - np.ndarray, - scipy.sparse.bsr_matrix, - scipy.sparse.coo_matrix, - scipy.sparse.csc_matrix, - scipy.sparse.csr_matrix, - scipy.sparse.dia_matrix, - scipy.sparse.dok_matrix, - scipy.sparse.lil_matrix, -] +SupportedFeatTypes = Union[List, pd.DataFrame, np.ndarray, spmatrix] class BaseFeatureValidator(BaseEstimator): @@ -68,8 +57,8 @@ def __init__( def fit( self, - X_train: SUPPORTED_FEAT_TYPES, - X_test: Optional[SUPPORTED_FEAT_TYPES] = None, + X_train: SupportedFeatTypes, + X_test: Optional[SupportedFeatTypes] = None, ) -> BaseEstimator: """ Validates and fit a categorical encoder (if needed) to the features. 
@@ -77,10 +66,10 @@ def fit( CSR sparse data types are also supported Args: - X_train (SUPPORTED_FEAT_TYPES): + X_train (SupportedFeatTypes): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding - X_test (Optional[SUPPORTED_FEAT_TYPES]): + X_test (Optional[SupportedFeatTypes]): A hold out set of data used for checking """ @@ -109,11 +98,11 @@ def fit( def _fit( self, - X: SUPPORTED_FEAT_TYPES, + X: SupportedFeatTypes, ) -> BaseEstimator: """ Args: - X (SUPPORTED_FEAT_TYPES): + X (SupportedFeatTypes): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding Returns: @@ -124,11 +113,11 @@ def _fit( def transform( self, - X: SUPPORTED_FEAT_TYPES, + X: SupportedFeatTypes, ) -> np.ndarray: """ Args: - X_train (SUPPORTED_FEAT_TYPES): + X_train (SupportedFeatTypes): A set of features, whose categorical features are going to be transformed diff --git a/autoPyTorch/data/base_target_validator.py b/autoPyTorch/data/base_target_validator.py index 393f3d85b..530675fbd 100644 --- a/autoPyTorch/data/base_target_validator.py +++ b/autoPyTorch/data/base_target_validator.py @@ -5,26 +5,14 @@ import pandas as pd -import scipy.sparse +from scipy.sparse import spmatrix from sklearn.base import BaseEstimator from autoPyTorch.utils.logging_ import PicklableClientLogger -SUPPORTED_TARGET_TYPES = Union[ - List, - pd.Series, - pd.DataFrame, - np.ndarray, - scipy.sparse.bsr_matrix, - scipy.sparse.coo_matrix, - scipy.sparse.csc_matrix, - scipy.sparse.csr_matrix, - scipy.sparse.dia_matrix, - scipy.sparse.dok_matrix, - scipy.sparse.lil_matrix, -] +SupportedTargetTypes = Union[List, pd.Series, pd.DataFrame, np.ndarray, spmatrix] class BaseTargetValidator(BaseEstimator): @@ -69,17 +57,17 @@ def __init__(self, def fit( self, - y_train: SUPPORTED_TARGET_TYPES, - y_test: Optional[SUPPORTED_TARGET_TYPES] = None, + y_train: SupportedTargetTypes, + y_test: Optional[SupportedTargetTypes] = None, ) -> BaseEstimator: """ Validates and fit a categorical encoder (if needed) to the targets The supported data types are List, numpy arrays and pandas DataFrames. Args: - y_train (SUPPORTED_TARGET_TYPES) + y_train (SupportedTargetTypes) A set of targets set aside for training - y_test (Union[SUPPORTED_TARGET_TYPES]) + y_test (Union[SupportedTargetTypes]) A hold out set of data used of the targets. It is also used to fit the categories of the encoder. """ @@ -128,26 +116,26 @@ def fit( def _fit( self, - y_train: SUPPORTED_TARGET_TYPES, - y_test: Optional[SUPPORTED_TARGET_TYPES] = None, + y_train: SupportedTargetTypes, + y_test: Optional[SupportedTargetTypes] = None, ) -> BaseEstimator: """ Args: - y_train (SUPPORTED_TARGET_TYPES) + y_train (SupportedTargetTypes) The labels of the current task. 
They are going to be encoded in case of classification - y_test (Optional[SUPPORTED_TARGET_TYPES]) + y_test (Optional[SupportedTargetTypes]) A holdout set of labels """ raise NotImplementedError() def transform( self, - y: Union[SUPPORTED_TARGET_TYPES], + y: Union[SupportedTargetTypes], ) -> np.ndarray: """ Args: - y (SUPPORTED_TARGET_TYPES) + y (SupportedTargetTypes) A set of targets that are going to be encoded if the current task is classification Returns: @@ -158,7 +146,7 @@ def transform( def inverse_transform( self, - y: SUPPORTED_TARGET_TYPES, + y: SupportedTargetTypes, ) -> np.ndarray: """ Revert any encoding transformation done on a target array diff --git a/autoPyTorch/data/base_validator.py b/autoPyTorch/data/base_validator.py index 13bb421c7..bebddff49 100644 --- a/autoPyTorch/data/base_validator.py +++ b/autoPyTorch/data/base_validator.py @@ -7,8 +7,8 @@ from sklearn.base import BaseEstimator from sklearn.exceptions import NotFittedError -from autoPyTorch.data.base_feature_validator import SUPPORTED_FEAT_TYPES -from autoPyTorch.data.base_target_validator import SUPPORTED_TARGET_TYPES +from autoPyTorch.data.base_feature_validator import SupportedFeatTypes +from autoPyTorch.data.base_target_validator import SupportedTargetTypes class BaseInputValidator(BaseEstimator): @@ -40,10 +40,10 @@ def __init__( def fit( self, - X_train: SUPPORTED_FEAT_TYPES, - y_train: SUPPORTED_TARGET_TYPES, - X_test: Optional[SUPPORTED_FEAT_TYPES] = None, - y_test: Optional[SUPPORTED_TARGET_TYPES] = None, + X_train: SupportedFeatTypes, + y_train: SupportedTargetTypes, + X_test: Optional[SupportedFeatTypes] = None, + y_test: Optional[SupportedTargetTypes] = None, ) -> BaseEstimator: """ Validates and fit a categorical encoder (if needed) to the features, and @@ -59,15 +59,15 @@ def fit( + If performing a classification task, the data is going to be encoded Args: - X_train (SUPPORTED_FEAT_TYPES): + X_train (SupportedFeatTypes): A set of features that are going to be validated (type and dimensionality checks). If this data contains categorical columns, an encoder is going to be instantiated and trained with this data. - y_train (SUPPORTED_TARGET_TYPES): + y_train (SupportedTargetTypes): A set of targets that are going to be encoded if the task is for classification - X_test (Optional[SUPPORTED_FEAT_TYPES]): + X_test (Optional[SupportedFeatTypes]): A hold out set of features used for checking - y_test (SUPPORTED_TARGET_TYPES): + y_test (SupportedTargetTypes): A hold out set of targets used for checking. Additionally, if the current task is a classification task, this y_test categories are also going to be used to fit a pre-processing encoding (to prevent errors on unseen classes). 
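The long scipy.sparse unions collapse to a single spmatrix entry because every concrete sparse format (csr, csc, coo, bsr, dia, dok, lil) subclasses it, so SupportedFeatTypes and SupportedTargetTypes accept exactly the same inputs as before:

.. code-block:: python

    import numpy as np
    from scipy.sparse import coo_matrix, csr_matrix, spmatrix

    dense = np.arange(6).reshape(3, 2)

    # Every concrete sparse class derives from spmatrix, so a
    # Union[..., spmatrix] annotation covers all of them at once.
    assert isinstance(csr_matrix(dense), spmatrix)
    assert isinstance(coo_matrix(dense), spmatrix)
    assert not isinstance(dense, spmatrix)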
@@ -96,16 +96,16 @@ def fit( def transform( self, - X: SUPPORTED_FEAT_TYPES, - y: Optional[SUPPORTED_TARGET_TYPES] = None, + X: SupportedFeatTypes, + y: Optional[SupportedTargetTypes] = None, ) -> Tuple[np.ndarray, Optional[np.ndarray]]: """ Transform the given target or features to a numpy array Args: - X (SUPPORTED_FEAT_TYPES): + X (SupportedFeatTypes): A set of features to transform - y (Optional[SUPPORTED_TARGET_TYPES]): + y (Optional[SupportedTargetTypes]): A set of targets to transform Returns: diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 27ed18cfc..ffb04feb1 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -1,12 +1,13 @@ import functools -from typing import Dict, List, Optional, Tuple, cast +from logging import Logger +from typing import Any, Dict, List, Mapping, Optional, Set, Tuple, Union, cast import numpy as np import pandas as pd from pandas.api.types import is_numeric_dtype -import scipy.sparse +from scipy.sparse import issparse, spmatrix import sklearn.utils from sklearn import preprocessing @@ -16,7 +17,15 @@ from sklearn.impute import SimpleImputer from sklearn.pipeline import make_pipeline -from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SUPPORTED_FEAT_TYPES +from autoPyTorch.data.base_feature_validator import BaseFeatureValidator, SupportedFeatTypes +from autoPyTorch.data.utils import ( + DatasetCompressionInputType, + DatasetDTypeContainerType, + has_object_columns, + reduce_dataset_size_if_too_large +) +from autoPyTorch.utils.common import ispandas +from autoPyTorch.utils.logging_ import PicklableClientLogger def _create_column_transformer( @@ -92,6 +101,16 @@ class TabularFeatureValidator(BaseFeatureValidator): categorical_columns (List[int]): List of indices of categorical columns """ + def __init__( + self, + logger: Optional[Union[PicklableClientLogger, Logger]] = None, + dataset_compression: Optional[Mapping[str, Any]] = None, + ) -> None: + super().__init__(logger) + self._dataset_compression = dataset_compression + self._reduced_dtype: Optional[DatasetDTypeContainerType] = None + self.all_nan_columns: Optional[Set[str]] = None + @staticmethod def _comparator(cmp1: str, cmp2: str) -> int: """Order so that categorical columns come left and numerical columns come right @@ -115,9 +134,44 @@ def _comparator(cmp1: str, cmp2: str) -> int: idx1, idx2 = choices.index(cmp1), choices.index(cmp2) return idx1 - idx2 + def _convert_all_nan_columns_to_numeric(self, X: pd.DataFrame, fit: bool = False) -> pd.DataFrame: + """ + Convert columns whose values were all nan in the training dataset to numeric. + + Args: + X (pd.DataFrame): + The data to transform. + fit (bool): + Whether this call is the fit to X or the transform using pre-fitted transformer. 
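_comparator, kept unchanged above, returns a signed difference of indices so that, combined with functools.cmp_to_key, categorical columns sort before numerical ones. A standalone mirror of that logic:

.. code-block:: python

    from functools import cmp_to_key

    # Standalone mirror of TabularFeatureValidator._comparator: 'categorical'
    # sorts before 'numerical' because its index in `choices` is smaller.
    choices = ['categorical', 'numerical']

    def comparator(cmp1: str, cmp2: str) -> int:
        return choices.index(cmp1) - choices.index(cmp2)

    feat_types = ['numerical', 'categorical', 'numerical', 'categorical']
    print(sorted(feat_types, key=cmp_to_key(comparator)))
    # ['categorical', 'categorical', 'numerical', 'numerical']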
+ """ + if not fit and self.all_nan_columns is None: + raise ValueError('_fit must be called before calling transform') + + if fit: + all_nan_columns = X.columns[X.isna().all()] + else: + assert self.all_nan_columns is not None + all_nan_columns = list(self.all_nan_columns) + + for col in all_nan_columns: + X[col] = np.nan + X[col] = pd.to_numeric(X[col]) + if fit and len(self.dtypes): + self.dtypes[list(X.columns).index(col)] = X[col].dtype + + if has_object_columns(X.dtypes.values): + X = self.infer_objects(X) + + if fit: + # TODO: Check how to integrate below + # self.dtypes = [dt.name for dt in X.dtypes] + self.all_nan_columns = set(all_nan_columns) + + return X + def _fit( self, - X: SUPPORTED_FEAT_TYPES, + X: SupportedFeatTypes, ) -> BaseEstimator: """ In case input data is a pandas DataFrame, this utility encodes the user provided @@ -125,7 +179,7 @@ def _fit( will be able to use Args: - X (SUPPORTED_FEAT_TYPES): + X (SupportedFeatTypes): A set of features that are going to be validated (type and dimensionality checks) and an encoder fitted in the case the data needs encoding @@ -139,24 +193,9 @@ def _fit( if isinstance(X, np.ndarray): X = self.numpy_array_to_pandas(X) - if hasattr(X, "iloc") and not scipy.sparse.issparse(X): + if ispandas(X) and not issparse(X): X = cast(pd.DataFrame, X) - # Treat a column with all instances a NaN as numerical - # This will prevent doing encoding to a categorical column made completely - # out of nan values -- which will trigger a fail, as encoding is not supported - # with nan values. - # Columns that are completely made of NaN values are provided to the pipeline - # so that later stages decide how to handle them - if np.any(pd.isnull(X)): - for column in X.columns: - if X[column].isna().all(): - X[column] = pd.to_numeric(X[column]) - # Also note this change in self.dtypes - if len(self.dtypes) != 0: - self.dtypes[list(X.columns).index(column)] = X[column].dtype - - if not X.select_dtypes(include='object').empty: - X = self.infer_objects(X) + X = self._convert_all_nan_columns_to_numeric(X, fit=True) self.transformed_columns, self.feat_type = self._get_columns_to_encode(X) @@ -204,14 +243,14 @@ def _fit( def transform( self, - X: SUPPORTED_FEAT_TYPES, - ) -> np.ndarray: + X: SupportedFeatTypes, + ) -> Union[np.ndarray, spmatrix, pd.DataFrame]: """ Validates and fit a categorical encoder (if needed) to the features. The supported data types are List, numpy arrays and pandas DataFrames. 
Args: - X_train (SUPPORTED_FEAT_TYPES): + X_train (SupportedFeatTypes): A set of features, whose categorical features are going to be transformed @@ -229,21 +268,14 @@ def transform( if isinstance(X, np.ndarray): X = self.numpy_array_to_pandas(X) - if hasattr(X, "iloc") and not scipy.sparse.issparse(X): - if np.any(pd.isnull(X)): - for column in X.columns: - if X[column].isna().all(): - X[column] = pd.to_numeric(X[column]) - - # Also remove the object dtype for new data - if not X.select_dtypes(include='object').empty: - X = self.infer_objects(X) + if ispandas(X) and not issparse(X): + X = self._convert_all_nan_columns_to_numeric(X) # Check the data here so we catch problems on new test data self._check_data(X) # Pandas related transformations - if hasattr(X, "iloc") and self.column_transformer is not None: + if ispandas(X) and self.column_transformer is not None: if np.any(pd.isnull(X)): # After above check it means that if there is a NaN # the whole column must be NaN @@ -256,7 +288,7 @@ def transform( # Sparse related transformations # Not all sparse format support index sorting - if scipy.sparse.issparse(X) and hasattr(X, 'sort_indices'): + if issparse(X) and hasattr(X, 'sort_indices'): X.sort_indices() try: @@ -272,22 +304,53 @@ def transform( "Please try to manually cast it to a supported " "numerical or categorical values.") raise e + + X = self._compress_dataset(X) + return X + # TODO: modify once we have added subsampling as well. + def _compress_dataset(self, X: DatasetCompressionInputType) -> DatasetCompressionInputType: + """ + Compress the dataset. This function ensures that + the testing data is converted to the same dtype as + the training data. + + + Args: + X (DatasetCompressionInputType): + Dataset + + Returns: + DatasetCompressionInputType: + Compressed dataset. 
+ """ + is_dataframe = ispandas(X) + is_reducible_type = isinstance(X, np.ndarray) or issparse(X) or is_dataframe + if not is_reducible_type or self._dataset_compression is None: + return X + elif self._reduced_dtype is not None: + X = X.astype(self._reduced_dtype) + return X + else: + X = reduce_dataset_size_if_too_large(X, **self._dataset_compression) + self._reduced_dtype = dict(X.dtypes) if is_dataframe else X.dtype + return X + def _check_data( self, - X: SUPPORTED_FEAT_TYPES, + X: SupportedFeatTypes, ) -> None: """ Feature dimensionality and data type checks Args: - X (SUPPORTED_FEAT_TYPES): + X (SupportedFeatTypes): A set of features that are going to be validated (type and dimensionality checks) and an encoder fitted in the case the data needs encoding """ - if not isinstance(X, (np.ndarray, pd.DataFrame)) and not scipy.sparse.issparse(X): + if not isinstance(X, (np.ndarray, pd.DataFrame)) and not issparse(X): raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames," " scipy sparse and Python Lists, yet, the provided input is" " of type {}".format(type(X)) @@ -316,12 +379,12 @@ def _check_data( ) # Then for Pandas, we do not support Nan in categorical columns - if hasattr(X, "iloc"): + if ispandas(X): # If entered here, we have a pandas dataframe X = cast(pd.DataFrame, X) # Handle objects if possible - if not X.select_dtypes(include='object').empty: + if has_object_columns(X.dtypes.values): X = self.infer_objects(X) # Define the column to be encoded here as the feature validator is fitted once @@ -429,8 +492,8 @@ def _get_columns_to_encode( def list_to_dataframe( self, - X_train: SUPPORTED_FEAT_TYPES, - X_test: Optional[SUPPORTED_FEAT_TYPES] = None, + X_train: SupportedFeatTypes, + X_test: Optional[SupportedFeatTypes] = None, ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame]]: """ Converts a list to a pandas DataFrame. In this process, column types are inferred. 
@@ -438,10 +501,10 @@ def list_to_dataframe( If test data is provided, we proactively match it to train data Args: - X_train (SUPPORTED_FEAT_TYPES): + X_train (SupportedFeatTypes): A set of features that are going to be validated (type and dimensionality checks) and a encoder fitted in the case the data needs encoding - X_test (Optional[SUPPORTED_FEAT_TYPES]): + X_test (Optional[SupportedFeatTypes]): A hold out set of data used for checking Returns: diff --git a/autoPyTorch/data/tabular_target_validator.py b/autoPyTorch/data/tabular_target_validator.py index c37dc81c3..22cabb999 100644 --- a/autoPyTorch/data/tabular_target_validator.py +++ b/autoPyTorch/data/tabular_target_validator.py @@ -5,7 +5,7 @@ import pandas as pd from pandas.api.types import is_numeric_dtype -import scipy.sparse +from scipy.sparse import issparse, spmatrix import sklearn.utils from sklearn import preprocessing @@ -13,14 +13,43 @@ from sklearn.exceptions import NotFittedError from sklearn.utils.multiclass import type_of_target -from autoPyTorch.data.base_target_validator import BaseTargetValidator, SUPPORTED_TARGET_TYPES +from autoPyTorch.data.base_target_validator import BaseTargetValidator, SupportedTargetTypes +from autoPyTorch.utils.common import ispandas + + +ArrayType = Union[np.ndarray, spmatrix] + + +def _check_and_to_array(y: SupportedTargetTypes) -> ArrayType: + """ sklearn check array will make sure we have the correct numerical features for the array """ + return sklearn.utils.check_array(y, force_all_finite=True, accept_sparse='csr', ensure_2d=False) + + +def _modify_regression_target(y: ArrayType) -> ArrayType: + # Regression targets must have numbers after a decimal point. + # Ref: https://github.com/scikit-learn/scikit-learn/issues/8952 + y_min = np.abs(y).min() + offset = max(y_min, 1e-13) * 1e-13 # Sufficiently small number + if y_min > 1e12: + raise ValueError( + "The minimum value for the target labels of regression tasks must be smaller than " + f"1e12 to avoid errors caused by an overflow, but got {y_min}" + ) + + # Since it is all integer, we can just add a random small number + if isinstance(y, np.ndarray): + y = y.astype(dtype=np.float64) + offset + else: + y.data = y.data.astype(dtype=np.float64) + offset + + return y class TabularTargetValidator(BaseTargetValidator): def _fit( self, - y_train: SUPPORTED_TARGET_TYPES, - y_test: Optional[SUPPORTED_TARGET_TYPES] = None, + y_train: SupportedTargetTypes, + y_test: Optional[SupportedTargetTypes] = None, ) -> BaseEstimator: """ If dealing with classification, this utility encodes the targets. @@ -29,10 +58,10 @@ def _fit( errors Args: - y_train (SUPPORTED_TARGET_TYPES) + y_train (SupportedTargetTypes) The labels of the current task. 
They are going to be encoded in case of classification - y_test (Optional[SUPPORTED_TARGET_TYPES]) + y_test (Optional[SupportedTargetTypes]) A holdout set of labels """ if not self.is_classification or self.type_of_target == 'multilabel-indicator': @@ -42,7 +71,7 @@ def _fit( return self if y_test is not None: - if hasattr(y_train, "iloc"): + if ispandas(y_train): y_train = pd.concat([y_train, y_test], ignore_index=True, sort=False) elif isinstance(y_train, list): y_train = y_train + y_test @@ -71,7 +100,7 @@ def _fit( if ndim > 1: self.encoder.fit(y_train) else: - if hasattr(y_train, 'iloc'): + if ispandas(y_train): y_train = cast(pd.DataFrame, y_train) self.encoder.fit(y_train.to_numpy().reshape(-1, 1)) else: @@ -94,16 +123,31 @@ def _fit( return self - def transform( - self, - y: Union[SUPPORTED_TARGET_TYPES], - ) -> np.ndarray: + def _transform_by_encoder(self, y: SupportedTargetTypes) -> np.ndarray: + if self.encoder is None: + return _check_and_to_array(y) + + # remove ravel warning from pandas Series + shape = np.shape(y) + if len(shape) > 1: + y = self.encoder.transform(y) + elif ispandas(y): + # The Ordinal encoder expects a 2 dimensional input. + # The targets are 1 dimensional, so reshape to match the expected shape + y = cast(pd.DataFrame, y) + y = self.encoder.transform(y.to_numpy().reshape(-1, 1)).reshape(-1) + else: + y = self.encoder.transform(np.array(y).reshape(-1, 1)).reshape(-1) + + return _check_and_to_array(y) + + def transform(self, y: SupportedTargetTypes) -> np.ndarray: """ Validates and fit a categorical encoder (if needed) to the features. The supported data types are List, numpy arrays and pandas DataFrames. Args: - y (SUPPORTED_TARGET_TYPES) + y (SupportedTargetTypes) A set of targets that are going to be encoded if the current task is classification @@ -116,47 +160,23 @@ def transform( # Check the data here so we catch problems on new test data self._check_data(y) + y = self._transform_by_encoder(y) - if self.encoder is not None: - # remove ravel warning from pandas Series - shape = np.shape(y) - if len(shape) > 1: - y = self.encoder.transform(y) - else: - # The Ordinal encoder expects a 2 dimensional input. 
- # The targets are 1 dimensional, so reshape to match the expected shape - if hasattr(y, 'iloc'): - y = cast(pd.DataFrame, y) - y = self.encoder.transform(y.to_numpy().reshape(-1, 1)).reshape(-1) - else: - y = self.encoder.transform(np.array(y).reshape(-1, 1)).reshape(-1) - - # sklearn check array will make sure we have the - # correct numerical features for the array - # Also, a numpy array will be created - y = sklearn.utils.check_array( - y, - force_all_finite=True, - accept_sparse='csr', - ensure_2d=False, - ) - - # When translating a dataframe to numpy, make sure we - # honor the ravel requirement + # When translating a dataframe to numpy, make sure we honor the ravel requirement if y.ndim == 2 and y.shape[1] == 1: y = np.ravel(y) + if not self.is_classification and "continuous" not in type_of_target(y): + y = _modify_regression_target(y) + return y - def inverse_transform( - self, - y: SUPPORTED_TARGET_TYPES, - ) -> np.ndarray: + def inverse_transform(self, y: SupportedTargetTypes) -> np.ndarray: """ Revert any encoding transformation done on a target array Args: - y (Union[np.ndarray, pd.DataFrame, pd.Series]): + y (SupportedTargetTypes): Target array to be transformed back to original form before encoding Returns: np.ndarray: @@ -172,7 +192,7 @@ def inverse_transform( y = self.encoder.inverse_transform(y) else: # The targets should be a flattened array, hence reshape with -1 - if hasattr(y, 'iloc'): + if ispandas(y): y = cast(pd.DataFrame, y) y = self.encoder.inverse_transform(y.to_numpy().reshape(-1, 1)).reshape(-1) else: @@ -185,21 +205,18 @@ def inverse_transform( y = y.astype(self.dtype) return y - def _check_data( - self, - y: SUPPORTED_TARGET_TYPES, - ) -> None: + def _check_data(self, y: SupportedTargetTypes) -> None: """ Perform dimensionality and data type checks on the targets Args: - y (Union[np.ndarray, pd.DataFrame, pd.Series]): + y (SupportedTargetTypes): A set of features whose dimensionality and data type is going to be checked """ if not isinstance(y, (np.ndarray, pd.DataFrame, List, pd.Series)) \ - and not scipy.sparse.issparse(y): # type: ignore[misc] + and not issparse(y): # type: ignore[misc] raise ValueError("AutoPyTorch only supports Numpy arrays, Pandas DataFrames," " pd.Series, sparse data and Python Lists as targets, yet, " "the provided input is of type {}".format( @@ -208,8 +225,8 @@ def _check_data( # Sparse data muss be numerical # Type ignore on attribute because sparse targets have a dtype - if scipy.sparse.issparse(y) and not np.issubdtype(y.dtype.type, # type: ignore[union-attr] - np.number): + if issparse(y) and not np.issubdtype(y.dtype.type, # type: ignore[union-attr] + np.number): raise ValueError("When providing a sparse matrix as targets, the only supported " "values are numerical. Please consider using a dense" " instead." 
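_modify_regression_target nudges integer-valued regression targets by a tiny offset so that sklearn's type_of_target reports them as continuous (the scikit-learn issue referenced above); the arithmetic in isolation:

.. code-block:: python

    import numpy as np
    from sklearn.utils.multiclass import type_of_target

    y = np.array([1, 2, 3])
    print(type_of_target(y))              # 'multiclass' -- looks categorical

    # Offset as computed above: max(|y|.min(), 1e-13) * 1e-13
    offset = max(np.abs(y).min(), 1e-13) * 1e-13
    y = y.astype(np.float64) + offset
    print(type_of_target(y))              # 'continuous'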
@@ -228,10 +245,10 @@ def _check_data( # No Nan is supported has_nan_values = False - if hasattr(y, 'iloc'): + if ispandas(y): has_nan_values = cast(pd.DataFrame, y).isnull().values.any() - if scipy.sparse.issparse(y): - y = cast(scipy.sparse.spmatrix, y) + if issparse(y): + y = cast(spmatrix, y) has_nan_values = not np.array_equal(y.data, y.data) else: # List and array like values are considered here diff --git a/autoPyTorch/data/tabular_validator.py b/autoPyTorch/data/tabular_validator.py index 677b55d4b..4db415f93 100644 --- a/autoPyTorch/data/tabular_validator.py +++ b/autoPyTorch/data/tabular_validator.py @@ -1,6 +1,6 @@ # -*- encoding: utf-8 -*- import logging -from typing import Optional, Union +from typing import Any, Mapping, Optional, Union from autoPyTorch.data.base_validator import BaseInputValidator from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator @@ -32,9 +32,11 @@ def __init__( self, is_classification: bool = False, logger_port: Optional[int] = None, + dataset_compression: Optional[Mapping[str, Any]] = None, ) -> None: self.is_classification = is_classification self.logger_port = logger_port + self.dataset_compression = dataset_compression if self.logger_port is not None: self.logger: Union[logging.Logger, PicklableClientLogger] = get_named_client_logger( name='Validation', @@ -43,7 +45,9 @@ def __init__( else: self.logger = logging.getLogger('Validation') - self.feature_validator = TabularFeatureValidator(logger=self.logger) + self.feature_validator = TabularFeatureValidator( + dataset_compression=self.dataset_compression, + logger=self.logger) self.target_validator = TabularTargetValidator( is_classification=self.is_classification, logger=self.logger diff --git a/autoPyTorch/data/utils.py b/autoPyTorch/data/utils.py new file mode 100644 index 000000000..54499c973 --- /dev/null +++ b/autoPyTorch/data/utils.py @@ -0,0 +1,353 @@ +# Implementation used from https://github.com/automl/auto-sklearn/blob/development/autosklearn/util/data.py +import warnings +from math import floor +from typing import ( + Any, + Dict, + Iterator, + List, + Mapping, + Optional, + Sequence, + Tuple, + Type, + Union, + cast +) + +import numpy as np + +import pandas as pd + +from scipy.sparse import issparse, spmatrix + +from autoPyTorch.utils.common import ispandas + + +# TODO: TypedDict with python 3.8 +# +# When upgrading to python 3.8 as minimum version, this should be a TypedDict +# so that mypy can identify the fields types +DatasetCompressionSpec = Dict[str, Union[int, float, List[str]]] +DatasetDTypeContainerType = Union[Type, Dict[str, Type]] +DatasetCompressionInputType = Union[np.ndarray, spmatrix, pd.DataFrame] + +# Default specification for arg `dataset_compression` +default_dataset_compression_arg: DatasetCompressionSpec = { + "memory_allocation": 0.1, + "methods": ["precision"] +} + + +def has_object_columns(feature_types: pd.Series) -> bool: + """ + Indicate whether on a Series of dtypes for a Pandas DataFrame + there exists one or more object columns. + Args: + feature_types (pd.Series): The feature types for a DataFrame. + Returns: + bool: + True if the DataFrame dtypes contain an object column, False + otherwise. + """ + return np.dtype('O') in feature_types + + +def get_dataset_compression_mapping( + memory_limit: int, + dataset_compression: Union[bool, Mapping[str, Any]] +) -> Optional[DatasetCompressionSpec]: + """ + Internal function to get value for `BaseTask._dataset_compression` + based on the value of `dataset_compression` passed. 
+ + If True, it returns the default_dataset_compression_arg. In case + of a mapping, it is validated and returned as a `DatasetCompressionSpec`. + + If False, it returns None. + + Args: + memory_limit (int): + memory limit of the current search. + dataset_compression (Union[bool, Mapping[str, Any]]): + mapping passed to the `search` function. + + Returns: + Optional[DatasetCompressionSpec]: + Validated data compression spec or None. + """ + dataset_compression_mapping: Optional[Mapping[str, Any]] = None + + if not isinstance(dataset_compression, bool): + dataset_compression_mapping = dataset_compression + elif dataset_compression: + dataset_compression_mapping = default_dataset_compression_arg + + if dataset_compression_mapping is not None: + dataset_compression_mapping = validate_dataset_compression_arg( + dataset_compression_mapping, memory_limit=memory_limit) + + return dataset_compression_mapping + + +def validate_dataset_compression_arg( + dataset_compression: Mapping[str, Any], + memory_limit: int +) -> DatasetCompressionSpec: + """Validate and return a correct dataset_compression argument + + The returned value can be safely used with `reduce_dataset_size_if_too_large`. + + Args: + dataset_compression: Mapping[str, Any] + The argumnents to validate + + Returns: + DatasetCompressionSpec + The validated and correct dataset compression spec + """ + if not isinstance(dataset_compression, Mapping): + raise ValueError( + f"Unknown type for `dataset_compression` {type(dataset_compression)}" + f"\ndataset_compression = {dataset_compression}" + ) + + # Fill with defaults if they don't exist + dataset_compression = { + **default_dataset_compression_arg, + **dataset_compression + } + + # Must contain known keys + if set(dataset_compression.keys()) != set(default_dataset_compression_arg.keys()): + raise ValueError( + f"Unknown key in dataset_compression, {list(dataset_compression.keys())}." 
+ f"\nPossible keys are {list(default_dataset_compression_arg.keys())}" + ) + + memory_allocation = dataset_compression["memory_allocation"] + + # "memory_allocation" must be float or int + if not (isinstance(memory_allocation, float) or isinstance(memory_allocation, int)): + raise ValueError( + "key 'memory_allocation' must be an `int` or `float`" + f"\ntype = {memory_allocation}" + f"\ndataset_compression = {dataset_compression}" + ) + + # "memory_allocation" if absolute, should be > 0 and < memory_limit + if isinstance(memory_allocation, int) and not (0 < memory_allocation < memory_limit): + raise ValueError( + f"key 'memory_allocation' if int must be in (0, memory_limit={memory_limit})" + f"\nmemory_allocation = {memory_allocation}" + f"\ndataset_compression = {dataset_compression}" + ) + + # "memory_allocation" must be in (0,1) if float + if isinstance(memory_allocation, float): + if not (0.0 < memory_allocation < 1.0): + raise ValueError( + "key 'memory_allocation' if float must be in (0, 1)" + f"\nmemory_allocation = {memory_allocation}" + f"\ndataset_compression = {dataset_compression}" + ) + # convert to int so we can directly use + dataset_compression["memory_allocation"] = floor(memory_allocation * memory_limit) + + # "methods" must be non-empty sequence + if ( + not isinstance(dataset_compression["methods"], Sequence) + or len(dataset_compression["methods"]) <= 0 + ): + raise ValueError( + "key 'methods' must be a non-empty list" + f"\nmethods = {dataset_compression['methods']}" + f"\ndataset_compression = {dataset_compression}" + ) + + # "methods" must contain known methods + if any( + method not in cast(Sequence, default_dataset_compression_arg["methods"]) # mypy + for method in dataset_compression["methods"] + ): + raise ValueError( + f"key 'methods' can only contain {default_dataset_compression_arg['methods']}" + f"\nmethods = {dataset_compression['methods']}" + f"\ndataset_compression = {dataset_compression}" + ) + + return cast(DatasetCompressionSpec, dataset_compression) + + +class _DtypeReductionMapping(Mapping): + """ + Unfortuantly, mappings compare by hash(item) and not the __eq__ operator + between the key and the item. + + Hence we wrap the dict in a Mapping class and implement our own __getitem__ + such that we do use __eq__ between keys and query items. + + >>> np.float32 == dtype('float32') # True, they are considered equal + >>> + >>> mydict = { np.float32: 'hello' } + >>> + >>> # Equal by __eq__ but dict operations fail + >>> np.dtype('float32') in mydict # False + >>> mydict[dtype('float32')] # KeyError + + This mapping class fixes that supporting the `in` operator as well as `__getitem__` + + >>> reduction_mapping = _DtypeReductionMapping() + >>> + >>> reduction_mapping[np.dtype('float64')] # np.float32 + >>> np.dtype('float32') in reduction_mapping # True + """ + + # Information about dtype support + _mapping: Dict[type, type] = { + np.float32: np.float32, + np.float64: np.float32, + np.int32: np.int32, + np.int64: np.int32 + } + + # In spite of the names, np.float96 and np.float128 + # provide only as much precision as np.longdouble, + # that is, 80 bits on most x86 machines and 64 bits + # in standard Windows builds. 
+ _mapping.update({getattr(np, s): np.float64 for s in ['float96', 'float128'] if hasattr(np, s)}) + + @classmethod + def __getitem__(cls, item: type) -> type: + for k, v in cls._mapping.items(): + if k == item: + return v + raise KeyError(item) + + @classmethod + def __iter__(cls) -> Iterator[type]: + return iter(cls._mapping.keys()) + + @classmethod + def __len__(cls) -> int: + return len(cls._mapping) + + +reduction_mapping = _DtypeReductionMapping() +supported_precision_reductions = list(reduction_mapping) + + +def reduce_precision( + X: DatasetCompressionInputType +) -> Tuple[DatasetCompressionInputType, DatasetDTypeContainerType, DatasetDTypeContainerType]: + """ Reduce the precision of a dataset containing floats or ints + + Note: + For dataframe, the column's precision is reduced using pd.to_numeric. + + Args: + X (DatasetCompressionInputType): + The data to reduce precision of. + + Returns: + Tuple[DatasetCompressionInputType, DatasetDTypeContainerType, DatasetDTypeContainerType] + Returns the reduced data X along with the dtypes it and the dtypes it was reduced to. + """ + reduced_dtypes: Optional[DatasetDTypeContainerType] = None + if isinstance(X, np.ndarray) or issparse(X): + dtypes = X.dtype + if X.dtype not in supported_precision_reductions: + raise ValueError(f"X.dtype = {X.dtype} not equal to any supported" + f" {supported_precision_reductions}") + reduced_dtypes = reduction_mapping[X.dtype] + X = X.astype(reduced_dtypes) + + elif ispandas(X): + dtypes = dict(X.dtypes) + + col_names = X.dtypes.index + + float_cols = col_names[[dt.name.startswith("float") for dt in X.dtypes.values]] + int_cols = col_names[[dt.name.startswith("int") for dt in X.dtypes.values]] + X[int_cols] = X[int_cols].apply(lambda column: pd.to_numeric(column, downcast='integer')) + X[float_cols] = X[float_cols].apply(lambda column: pd.to_numeric(column, downcast='float')) + + reduced_dtypes = dict(X.dtypes) + else: + raise ValueError(f"Unrecognised data type of X, expected data type to " + f"be in (np.ndarray, spmatrix, pd.DataFrame), but got :{type(X)}") + + return X, reduced_dtypes, dtypes + + +def megabytes(arr: DatasetCompressionInputType) -> float: + + if isinstance(arr, np.ndarray): + memory_in_bytes = arr.nbytes + elif issparse(arr): + memory_in_bytes = arr.data.nbytes + elif ispandas(arr): + memory_in_bytes = arr.memory_usage(index=True, deep=True).sum() + else: + raise ValueError(f"Unrecognised data type of X, expected data type to " + f"be in (np.ndarray, spmatrix, pd.DataFrame) but got :{type(arr)}") + + return float(memory_in_bytes / (2**20)) + + +def reduce_dataset_size_if_too_large( + X: DatasetCompressionInputType, + memory_allocation: int, + methods: List[str] = ['precision'], +) -> DatasetCompressionInputType: + f""" Reduces the size of the dataset if it's too close to the memory limit. + + Follows the order of the operations passed in and retains the type of its + input. + + Precision reduction will only work on the following data types: + - {supported_precision_reductions} + + Precision reduction will only perform one level of precision reduction. + Technically, you could supply multiple rounds of precision reduction, i.e. + to reduce np.float128 to np.float32 you could use `methods = ['precision'] * 2`. + + However, if that's the use case, it'd be advised to simply use the function + `autoPyTorch.data.utils.reduce_precision`. + + Args: + X: DatasetCompressionInputType + The features of the dataset. 
+ + methods: List[str] = ['precision'] + A list of operations that are permitted to be performed to reduce + the size of the dataset. + + **precision** + + Reduce the precision of float types + + memory_allocation: int + The amount of memory to allocate to the dataset. It should specify an + absolute amount. + + Returns: + DatasetCompressionInputType + The reduced X if reductions were needed + """ + + for method in methods: + + if method == 'precision': + # If the dataset is too big for the allocated memory, + # we then try to reduce the precision if it's a high precision dataset + if megabytes(X) > memory_allocation: + X, reduced_dtypes, dtypes = reduce_precision(X) + warnings.warn( + f'Dataset too large for allocated memory {memory_allocation}MB, ' + f'reduced the precision from {dtypes} to {reduced_dtypes}', + ) + else: + raise ValueError(f"Unknown operation `{method}`") + + return X diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py index a3838007a..7761d07c2 100644 --- a/autoPyTorch/datasets/base_dataset.py +++ b/autoPyTorch/datasets/base_dataset.py @@ -21,9 +21,13 @@ DEFAULT_RESAMPLING_PARAMETERS, HoldOutFunc, HoldOutFuncs, - HoldoutValTypes + HoldoutValTypes, + NoResamplingFunc, + NoResamplingFuncs, + NoResamplingStrategyTypes, + ResamplingStrategies ) -from autoPyTorch.utils.common import FitRequirement +from autoPyTorch.utils.common import FitRequirement, ispandas BaseDatasetInputType = Union[Tuple[np.ndarray, np.ndarray], Dataset] BaseDatasetPropertiesType = Union[int, float, str, List, bool] @@ -45,6 +49,36 @@ def type_check(train_tensors: BaseDatasetInputType, check_valid_data(val_tensors[i]) +def _get_output_properties(train_tensors: BaseDatasetInputType) -> Tuple[int, str]: + """ + Return the dimension of output given a target_labels and output_type. + + Args: + train_tensors (BaseDatasetInputType): + Training data. + + Returns: + output_dim (int): + The dimension of outputs. + output_type (str): + The output type according to sklearn specification. + """ + if isinstance(train_tensors, Dataset): + target_labels = np.array([sample[-1] for sample in train_tensors]) + else: + target_labels = np.array(train_tensors[1]) + + output_type: str = type_of_target(target_labels) + if STRING_TO_OUTPUT_TYPES[output_type] in CLASSIFICATION_OUTPUTS: + output_dim = len(np.unique(target_labels)) + elif target_labels.ndim > 1: + output_dim = target_labels.shape[-1] + else: + output_dim = 1 + + return output_dim, output_type + + class TransformSubset(Subset): """Wrapper of BaseDataset for splitted datasets @@ -78,7 +112,7 @@ def __init__( dataset_name: Optional[str] = None, val_tensors: Optional[BaseDatasetInputType] = None, test_tensors: Optional[BaseDatasetInputType] = None, - resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, + resampling_strategy: ResamplingStrategies = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, shuffle: Optional[bool] = True, seed: Optional[int] = 42, @@ -95,8 +129,7 @@ def __init__( validation data test_tensors (An optional tuple of objects that have a __len__ and a __getitem__ attribute): test data - resampling_strategy (Union[CrossValTypes, HoldoutValTypes]), - (default=HoldoutValTypes.holdout_validation): + resampling_strategy (RESAMPLING_STRATEGIES: default=HoldoutValTypes.holdout_validation): strategy to split the training data. resampling_strategy_args (Optional[Dict[str, Any]]): arguments required for the chosen resampling strategy. 
If None, uses @@ -109,16 +142,18 @@ def __init__( val_transforms (Optional[torchvision.transforms.Compose]): Additional Transforms to be applied to the validation/test data """ - self.dataset_name = dataset_name - if self.dataset_name is None: + if dataset_name is None: self.dataset_name = str(uuid.uuid1(clock_seq=os.getpid())) + else: + self.dataset_name = dataset_name if not hasattr(train_tensors[0], 'shape'): type_check(train_tensors, val_tensors) self.train_tensors, self.val_tensors, self.test_tensors = train_tensors, val_tensors, test_tensors self.cross_validators: Dict[str, CrossValFunc] = {} self.holdout_validators: Dict[str, HoldOutFunc] = {} + self.no_resampling_validators: Dict[str, NoResamplingFunc] = {} self.random_state = np.random.RandomState(seed=seed) self.shuffle = shuffle self.resampling_strategy = resampling_strategy @@ -127,15 +162,7 @@ def __init__( self.issparse: bool = issparse(self.train_tensors[0]) self.input_shape: Tuple[int] = self.train_tensors[0].shape[1:] if len(self.train_tensors) == 2 and self.train_tensors[1] is not None: - self.output_type: str = type_of_target(self.train_tensors[1]) - - if ( - self.output_type in STRING_TO_OUTPUT_TYPES - and STRING_TO_OUTPUT_TYPES[self.output_type] in CLASSIFICATION_OUTPUTS - ): - self.output_shape = len(np.unique(self.train_tensors[1])) - else: - self.output_shape = self.train_tensors[1].shape[-1] if self.train_tensors[1].ndim > 1 else 1 + self.output_shape, self.output_type = _get_output_properties(self.train_tensors) # TODO: Look for a criteria to define small enough to preprocess self.is_small_preprocess = True @@ -143,6 +170,8 @@ def __init__( # Make sure cross validation splits are created once self.cross_validators = CrossValFuncs.get_cross_validators(*CrossValTypes) self.holdout_validators = HoldOutFuncs.get_holdout_validators(*HoldoutValTypes) + self.no_resampling_validators = NoResamplingFuncs.get_no_resampling_validators(*NoResamplingStrategyTypes) + self.splits = self.get_splits_from_resampling_strategy() # We also need to be able to transform the data, be it for pre-processing @@ -191,7 +220,7 @@ def __getitem__(self, index: int, train: bool = True) -> Tuple[np.ndarray, ...]: A transformed single point prediction """ - X = self.train_tensors[0].iloc[[index]] if hasattr(self.train_tensors[0], 'loc') \ + X = self.train_tensors[0].iloc[[index]] if ispandas(self.train_tensors[0]) \ else self.train_tensors[0][index] if self.train_transform is not None and train: @@ -210,7 +239,7 @@ def __len__(self) -> int: def _get_indices(self) -> np.ndarray: return self.random_state.permutation(len(self)) if self.shuffle else np.arange(len(self)) - def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int]]]: + def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], Optional[List[int]]]]: """ Creates a set of splits based on a resampling strategy provided @@ -241,6 +270,9 @@ def get_splits_from_resampling_strategy(self) -> List[Tuple[List[int], List[int] num_splits=cast(int, num_splits), ) ) + elif isinstance(self.resampling_strategy, NoResamplingStrategyTypes): + splits.append((self.no_resampling_validators[self.resampling_strategy.name](self.random_state, + self._get_indices()), None)) else: raise ValueError(f"Unsupported resampling strategy={self.resampling_strategy}") return splits @@ -312,7 +344,7 @@ def create_holdout_val_split( self.random_state, val_share, self._get_indices(), **kwargs) return train, val - def get_dataset_for_training(self, split_id: int) -> Tuple[Dataset, Dataset]: 
+ def get_dataset(self, split_id: int, train: bool) -> Dataset: """ The above split methods employ the Subset to internally subsample the whole dataset. @@ -320,14 +352,21 @@ def get_dataset_for_training(self, split_id: int) -> Tuple[Dataset, Dataset]: to provide training data to fit a pipeline Args: - split (int): The desired subset of the dataset to split and use + split_id (int): which split id to get from the splits + train (bool): whether the dataset is required for training or evaluating. Returns: Dataset: the reduced dataset to be used for testing """ # Subset creates a dataset. Splits is a (train_indices, test_indices) tuple - return (TransformSubset(self, self.splits[split_id][0], train=True), - TransformSubset(self, self.splits[split_id][1], train=False)) + if split_id >= len(self.splits): # old version: split_id > len(self.splits) + raise IndexError(f"self.splits index out of range, got split_id={split_id}" + f" (>= num_splits={len(self.splits)})") + indices = self.splits[split_id][int(not train)] # 0: for training, 1: for evaluation + if indices is None: + raise ValueError("Specified fold (or subset) does not exist") + + return TransformSubset(self, indices, train=train) def replace_data(self, X_train: BaseDatasetInputType, X_test: Optional[BaseDatasetInputType]) -> 'BaseDataset': diff --git a/autoPyTorch/datasets/image_dataset.py b/autoPyTorch/datasets/image_dataset.py index 9da55ebc0..74b79db15 100644 --- a/autoPyTorch/datasets/image_dataset.py +++ b/autoPyTorch/datasets/image_dataset.py @@ -24,6 +24,7 @@ from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, HoldoutValTypes, + NoResamplingStrategyTypes ) IMAGE_DATASET_INPUT = Union[Dataset, Tuple[Union[np.ndarray, List[str]], np.ndarray]] @@ -39,7 +40,7 @@ class ImageDataset(BaseDataset): validation data test (Union[Dataset, Tuple[Union[np.ndarray, List[str]], np.ndarray]]): testing data - resampling_strategy (Union[CrossValTypes, HoldoutValTypes]), + resampling_strategy (Union[CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes]), (default=HoldoutValTypes.holdout_validation): strategy to split the training data. resampling_strategy_args (Optional[Dict[str, Any]]): arguments @@ -57,7 +58,9 @@ def __init__(self, train: IMAGE_DATASET_INPUT, val: Optional[IMAGE_DATASET_INPUT] = None, test: Optional[IMAGE_DATASET_INPUT] = None, - resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, + resampling_strategy: Union[CrossValTypes, + HoldoutValTypes, + NoResamplingStrategyTypes] = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, shuffle: Optional[bool] = True, seed: Optional[int] = 42, diff --git a/autoPyTorch/datasets/resampling_strategy.py b/autoPyTorch/datasets/resampling_strategy.py index 86e0ec733..78447a04e 100644 --- a/autoPyTorch/datasets/resampling_strategy.py +++ b/autoPyTorch/datasets/resampling_strategy.py @@ -16,6 +16,13 @@ # Use callback protocol as workaround, since callable with function fields count 'self' as argument +class NoResamplingFunc(Protocol): + def __call__(self, + random_state: np.random.RandomState, + indices: np.ndarray) -> np.ndarray: + ... 
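Editor's note: the NoResamplingFunc protocol added above only requires a callable that takes a random state and an index array and returns an index array. A minimal conforming callable (not part of the diff) simply hands the indices back untouched, which is what the NoResamplingFuncs.no_resampling staticmethod introduced further down in this file does.

    import numpy as np

    def keep_all_indices(random_state: np.random.RandomState,
                         indices: np.ndarray) -> np.ndarray:
        # No split is performed: every index stays on the training side,
        # so the pipeline is fit on the whole dataset.
        return indices

    rng = np.random.RandomState(42)
    assert np.array_equal(keep_all_indices(rng, np.arange(10)), np.arange(10))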
+ + class CrossValFunc(Protocol): def __call__(self, random_state: np.random.RandomState, @@ -76,10 +83,20 @@ def is_stratified(self) -> bool: return getattr(self, self.name) in stratified +class NoResamplingStrategyTypes(IntEnum): + no_resampling = 8 + + def is_stratified(self) -> bool: + return False + + # TODO: replace it with another way -RESAMPLING_STRATEGIES = [CrossValTypes, HoldoutValTypes] +ResamplingStrategies = Union[CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes] -DEFAULT_RESAMPLING_PARAMETERS: Dict[Union[HoldoutValTypes, CrossValTypes], Dict[str, Any]] = { +DEFAULT_RESAMPLING_PARAMETERS: Dict[ + ResamplingStrategies, + Dict[str, Any] +] = { HoldoutValTypes.holdout_validation: { 'val_share': 0.33, }, @@ -98,6 +115,7 @@ def is_stratified(self) -> bool: CrossValTypes.time_series_cross_validation: { 'num_splits': 5, }, + NoResamplingStrategyTypes.no_resampling: {} } @@ -225,3 +243,30 @@ def get_cross_validators(cls, *cross_val_types: CrossValTypes) -> Dict[str, Cros for cross_val_type in cross_val_types } return cross_validators + + +class NoResamplingFuncs(): + @classmethod + def get_no_resampling_validators(cls, *no_resampling_types: NoResamplingStrategyTypes + ) -> Dict[str, NoResamplingFunc]: + no_resampling_strategies: Dict[str, NoResamplingFunc] = { + no_resampling_type.name: getattr(cls, no_resampling_type.name) + for no_resampling_type in no_resampling_types + } + return no_resampling_strategies + + @staticmethod + def no_resampling(random_state: np.random.RandomState, + indices: np.ndarray) -> np.ndarray: + """ + Returns the indices without performing + any operation on them. To be used for + fitting on the whole dataset. + This strategy is not compatible with + HPO search. + Args: + indices: array of indices + Returns: + np.ndarray: array of indices + """ + return indices diff --git a/autoPyTorch/datasets/tabular_dataset.py b/autoPyTorch/datasets/tabular_dataset.py index c2e229868..6cabfe525 100644 --- a/autoPyTorch/datasets/tabular_dataset.py +++ b/autoPyTorch/datasets/tabular_dataset.py @@ -21,6 +21,7 @@ from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, HoldoutValTypes, + NoResamplingStrategyTypes ) @@ -32,11 +33,11 @@ class TabularDataset(BaseDataset): Y (Union[np.ndarray, pd.Series]): training data targets. X_test (Optional[Union[np.ndarray, pd.DataFrame]]): input testing data. Y_test (Optional[Union[np.ndarray, pd.DataFrame]]): testing data targets - resampling_strategy (Union[CrossValTypes, HoldoutValTypes]), + resampling_strategy (Union[CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes]), (default=HoldoutValTypes.holdout_validation): strategy to split the training data. - resampling_strategy_args (Optional[Dict[str, Any]]): arguments - required for the chosen resampling strategy. If None, uses + resampling_strategy_args (Optional[Dict[str, Any]]): + arguments required for the chosen resampling strategy. If None, uses the default values provided in DEFAULT_RESAMPLING_PARAMETERS in ```datasets/resampling_strategy.py```. 
shuffle: Whether to shuffle the data before performing splits @@ -55,7 +56,9 @@ def __init__(self, Y: Union[np.ndarray, pd.Series], X_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, Y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None, - resampling_strategy: Union[CrossValTypes, HoldoutValTypes] = HoldoutValTypes.holdout_validation, + resampling_strategy: Union[CrossValTypes, + HoldoutValTypes, + NoResamplingStrategyTypes] = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, shuffle: Optional[bool] = True, seed: Optional[int] = 42, @@ -86,6 +89,7 @@ def __init__(self, seed=seed, train_transforms=train_transforms, dataset_name=dataset_name, val_transforms=val_transforms) + self.issigned = bool(np.any((X.data if self.issparse else X) < 0)) if self.output_type is not None: if STRING_TO_OUTPUT_TYPES[self.output_type] in CLASSIFICATION_OUTPUTS: self.task_type = TASK_TYPES_TO_STRING[TABULAR_CLASSIFICATION] @@ -124,6 +128,7 @@ def get_required_dataset_info(self) -> Dict[str, BaseDatasetPropertiesType]: info.update({ 'numerical_columns': self.numerical_columns, 'categorical_columns': self.categorical_columns, - 'task_type': self.task_type + 'task_type': self.task_type, + 'issigned': self.issigned }) return info diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py index 027c7211a..2af333d11 100644 --- a/autoPyTorch/evaluation/abstract_evaluator.py +++ b/autoPyTorch/evaluation/abstract_evaluator.py @@ -33,8 +33,9 @@ ) from autoPyTorch.datasets.base_dataset import BaseDataset, BaseDatasetPropertiesType from autoPyTorch.evaluation.utils import ( + DisableFileOutputParameters, VotingRegressorWrapper, - convert_multioutput_multiclass_to_multilabel + convert_multioutput_multiclass_to_multilabel, ) from autoPyTorch.pipeline.base_pipeline import BasePipeline from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric @@ -375,10 +376,25 @@ class AbstractEvaluator(object): An optional dictionary to include components of the pipeline steps. exclude (Optional[Dict[str, Any]]): An optional dictionary to exclude components of the pipeline steps. - disable_file_output (Union[bool, List[str]]): - By default, the model, it's predictions and other metadata is stored on disk - for each finished configuration. This argument allows the user to skip - saving certain file type, for example the model, from being written to disk. + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): + Used as a list to pass more fine-grained + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + + + `y_optimization`: + do not save the predictions for the optimization set, + which would later on be used to build an ensemble. Note that SMAC + optimizes a metric evaluated on the optimization set. + + `pipeline`: + do not save any individual pipeline files + + `pipelines`: + In case of cross validation, disables saving the joint model of the + pipelines fit on each fold. + + `y_test`: + do not save the predictions for the test set. + + `all`: + do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. init_params (Optional[Dict[str, Any]]): Optional argument that is passed to each pipeline step. It is the equivalent of kwargs for the pipeline steps. 
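Editor's note: since disable_file_output is now a list rather than a bool, a caller passes the members documented above (or their string values). A hedged usage sketch (not part of the diff), assuming the argument is forwarded unchanged to the evaluator:

    from autoPyTorch.evaluation.utils import DisableFileOutputParameters

    # Skip writing test-set predictions and the per-fold joint model; keep the rest.
    disable_file_output = [
        DisableFileOutputParameters.y_test,
        DisableFileOutputParameters.pipelines,
    ]

    # Plain strings matching the member names are accepted as well and are
    # validated by DisableFileOutputParameters.check_compatibility (added in
    # autoPyTorch/evaluation/utils.py later in this diff).
    DisableFileOutputParameters.check_compatibility(['y_test', 'pipelines'])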
@@ -404,7 +420,7 @@ def __init__(self, backend: Backend, num_run: Optional[int] = None, include: Optional[Dict[str, Any]] = None, exclude: Optional[Dict[str, Any]] = None, - disable_file_output: Union[bool, List[str]] = False, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, init_params: Optional[Dict[str, Any]] = None, logger_port: Optional[int] = None, all_supported_metrics: bool = True, @@ -417,43 +433,24 @@ def __init__(self, backend: Backend, self.backend: Backend = backend self.queue = queue - self.datamanager: BaseDataset = self.backend.load_datamanager() - - assert self.datamanager.task_type is not None, \ - "Expected dataset {} to have task_type got None".format(self.datamanager.__class__.__name__) - self.task_type = STRING_TO_TASK_TYPES[self.datamanager.task_type] - self.output_type = STRING_TO_OUTPUT_TYPES[self.datamanager.output_type] - self.issparse = self.datamanager.issparse - self.include = include self.exclude = exclude self.search_space_updates = search_space_updates - self.X_train, self.y_train = self.datamanager.train_tensors - - if self.datamanager.val_tensors is not None: - self.X_valid, self.y_valid = self.datamanager.val_tensors - else: - self.X_valid, self.y_valid = None, None - - if self.datamanager.test_tensors is not None: - self.X_test, self.y_test = self.datamanager.test_tensors - else: - self.X_test, self.y_test = None, None - self.metric = metric self.seed = seed + self._init_datamanager_info() + # Flag to save target for ensemble self.output_y_hat_optimization = output_y_hat_optimization - if isinstance(disable_file_output, bool): - self.disable_file_output: bool = disable_file_output - elif isinstance(disable_file_output, List): - self.disabled_file_outputs: List[str] = disable_file_output - else: - raise ValueError('disable_file_output should be either a bool or a list') + disable_file_output = disable_file_output if disable_file_output is not None else [] + # check compatibility of disable file output + DisableFileOutputParameters.check_compatibility(disable_file_output) + + self.disable_file_output = disable_file_output self.pipeline_class: Optional[Union[BaseEstimator, BasePipeline]] = None if self.task_type in REGRESSION_TASKS: @@ -482,12 +479,6 @@ def __init__(self, backend: Backend, else: raise ValueError('task {} not available'.format(self.task_type)) self.predict_function = self._predict_proba - self.dataset_properties = self.datamanager.get_dataset_properties( - get_dataset_requirements(info=self.datamanager.get_required_dataset_info(), - include=self.include, - exclude=self.exclude, - search_space_updates=self.search_space_updates - )) self.additional_metrics: Optional[List[autoPyTorchMetric]] = None metrics_dict: Optional[Dict[str, List[str]]] = None @@ -527,6 +518,53 @@ def __init__(self, backend: Backend, self.logger.debug("Fit dictionary in Abstract evaluator: {}".format(dict_repr(self.fit_dictionary))) self.logger.debug("Search space updates :{}".format(self.search_space_updates)) + def _init_datamanager_info( + self, + ) -> None: + """ + Initialises instance attributes that come from the datamanager. + For example, + X_train, y_train, etc. 
+ """ + + datamanager: BaseDataset = self.backend.load_datamanager() + + assert datamanager.task_type is not None, \ + "Expected dataset {} to have task_type got None".format(datamanager.__class__.__name__) + self.task_type = STRING_TO_TASK_TYPES[datamanager.task_type] + self.output_type = STRING_TO_OUTPUT_TYPES[datamanager.output_type] + self.issparse = datamanager.issparse + + self.X_train, self.y_train = datamanager.train_tensors + + if datamanager.val_tensors is not None: + self.X_valid, self.y_valid = datamanager.val_tensors + else: + self.X_valid, self.y_valid = None, None + + if datamanager.test_tensors is not None: + self.X_test, self.y_test = datamanager.test_tensors + else: + self.X_test, self.y_test = None, None + + self.resampling_strategy = datamanager.resampling_strategy + + self.num_classes: Optional[int] = getattr(datamanager, "num_classes", None) + + self.dataset_properties = datamanager.get_dataset_properties( + get_dataset_requirements(info=datamanager.get_required_dataset_info(), + include=self.include, + exclude=self.exclude, + search_space_updates=self.search_space_updates + )) + self.splits = datamanager.splits + if self.splits is None: + raise AttributeError(f"create_splits on {datamanager.__class__.__name__} must be called " + f"before the instantiation of {self.__class__.__name__}") + + # delete datamanager from memory + del datamanager + def _init_fit_dictionary( self, logger_port: int, @@ -834,20 +872,17 @@ def file_output( ) # Abort if we don't want to output anything. - if hasattr(self, 'disable_file_output'): - if self.disable_file_output: - return None, {} - else: - self.disabled_file_outputs = [] + if 'all' in self.disable_file_output: + return None, {} # This file can be written independently of the others down bellow - if 'y_optimization' not in self.disabled_file_outputs: + if 'y_optimization' not in self.disable_file_output: if self.output_y_hat_optimization: self.backend.save_targets_ensemble(self.Y_optimization) - if hasattr(self, 'pipelines') and self.pipelines is not None: - if self.pipelines[0] is not None and len(self.pipelines) > 0: - if 'pipelines' not in self.disabled_file_outputs: + if getattr(self, 'pipelines', None) is not None: + if self.pipelines[0] is not None and len(self.pipelines) > 0: # type: ignore[index, arg-type] + if 'pipelines' not in self.disable_file_output: if self.task_type in CLASSIFICATION_TASKS: pipelines = VotingClassifier(estimators=None, voting='soft', ) else: @@ -860,8 +895,8 @@ def file_output( else: pipelines = None - if hasattr(self, 'pipeline') and self.pipeline is not None: - if 'pipeline' not in self.disabled_file_outputs: + if getattr(self, 'pipeline', None) is not None: + if 'pipeline' not in self.disable_file_output: pipeline = self.pipeline else: pipeline = None @@ -877,15 +912,15 @@ def file_output( cv_model=pipelines, ensemble_predictions=( Y_optimization_pred if 'y_optimization' not in - self.disabled_file_outputs else None + self.disable_file_output else None ), valid_predictions=( Y_valid_pred if 'y_valid' not in - self.disabled_file_outputs else None + self.disable_file_output else None ), test_predictions=( Y_test_pred if 'y_test' not in - self.disabled_file_outputs else None + self.disable_file_output else None ), ) @@ -976,21 +1011,20 @@ def _ensure_prediction_array_sizes(self, prediction: np.ndarray, (np.ndarray): The formatted prediction """ - assert self.datamanager.num_classes is not None, "Called function on wrong task" - num_classes: int = self.datamanager.num_classes + assert self.num_classes 
is not None, "Called function on wrong task" if self.output_type == MULTICLASS and \ - prediction.shape[1] < num_classes: + prediction.shape[1] < self.num_classes: if Y_train is None: raise ValueError('Y_train must not be None!') classes = list(np.unique(Y_train)) mapping = dict() - for class_number in range(num_classes): + for class_number in range(self.num_classes): if class_number in classes: index = classes.index(class_number) mapping[index] = class_number - new_predictions = np.zeros((prediction.shape[0], num_classes), + new_predictions = np.zeros((prediction.shape[0], self.num_classes), dtype=np.float32) for index in mapping: diff --git a/autoPyTorch/evaluation/tae.py b/autoPyTorch/evaluation/tae.py index d99251d3d..b109dbb1a 100644 --- a/autoPyTorch/evaluation/tae.py +++ b/autoPyTorch/evaluation/tae.py @@ -22,9 +22,20 @@ from smac.tae import StatusType, TAEAbortException from smac.tae.execute_func import AbstractTAFunc -import autoPyTorch.evaluation.train_evaluator from autoPyTorch.automl_common.common.utils.backend import Backend -from autoPyTorch.evaluation.utils import empty_queue, extract_learning_curve, read_queue +from autoPyTorch.datasets.resampling_strategy import ( + CrossValTypes, + HoldoutValTypes, + NoResamplingStrategyTypes +) +from autoPyTorch.evaluation.test_evaluator import eval_test_function +from autoPyTorch.evaluation.train_evaluator import eval_train_function +from autoPyTorch.evaluation.utils import ( + DisableFileOutputParameters, + empty_queue, + extract_learning_curve, + read_queue +) from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.utils.common import dict_repr, replace_string_bool_to_bool from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -100,6 +111,7 @@ def __init__( cost_for_crash: float, abort_on_first_run_crash: bool, pynisher_context: str, + multi_objectives: List[str], pipeline_config: Optional[Dict[str, Any]] = None, initial_num_run: int = 1, stats: Optional[Stats] = None, @@ -109,7 +121,7 @@ def __init__( include: Optional[Dict[str, Any]] = None, exclude: Optional[Dict[str, Any]] = None, memory_limit: Optional[int] = None, - disable_file_output: bool = False, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, init_params: Dict[str, Any] = None, budget_type: str = None, ta: Optional[Callable] = None, @@ -118,7 +130,27 @@ def __init__( search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None ): - eval_function = autoPyTorch.evaluation.train_evaluator.eval_function + self.backend = backend + + dm = self.backend.load_datamanager() + if dm.val_tensors is not None: + self._get_validation_loss = True + else: + self._get_validation_loss = False + if dm.test_tensors is not None: + self._get_test_loss = True + else: + self._get_test_loss = False + + self.resampling_strategy = dm.resampling_strategy + self.resampling_strategy_args = dm.resampling_strategy_args + + if isinstance(self.resampling_strategy, (HoldoutValTypes, CrossValTypes)): + eval_function = eval_train_function + self.output_y_hat_optimization = output_y_hat_optimization + elif isinstance(self.resampling_strategy, NoResamplingStrategyTypes): + eval_function = eval_test_function + self.output_y_hat_optimization = False self.worst_possible_result = cost_for_crash @@ -137,12 +169,10 @@ def __init__( abort_on_first_run_crash=abort_on_first_run_crash, ) - self.backend = backend self.pynisher_context = pynisher_context self.seed = seed 
self.initial_num_run = initial_num_run self.metric = metric - self.output_y_hat_optimization = output_y_hat_optimization self.include = include self.exclude = exclude self.disable_file_output = disable_file_output @@ -170,20 +200,24 @@ def __init__( memory_limit = int(math.ceil(memory_limit)) self.memory_limit = memory_limit - dm = self.backend.load_datamanager() - if dm.val_tensors is not None: - self._get_validation_loss = True - else: - self._get_validation_loss = False - if dm.test_tensors is not None: - self._get_test_loss = True - else: - self._get_test_loss = False + self.search_space_updates = search_space_updates - self.resampling_strategy = dm.resampling_strategy - self.resampling_strategy_args = dm.resampling_strategy_args + def _check_and_get_default_budget(self) -> float: + budget_type_choices = ('epochs', 'runtime') + budget_choices = { + budget_type: float(self.pipeline_config.get(budget_type, np.inf)) + for budget_type in budget_type_choices + } - self.search_space_updates = search_space_updates + # budget is defined by epochs by default + budget_type = str(self.pipeline_config.get('budget_type', 'epochs')) + if self.budget_type is not None: + budget_type = self.budget_type + + if budget_type not in budget_type_choices: + raise ValueError(f"budget type must be in {budget_type_choices}, but got {budget_type}") + else: + return budget_choices[budget_type] def run_wrapper( self, @@ -202,26 +236,19 @@ def run_wrapper( RunValue: Contains information about the status/performance of config """ - if self.budget_type is None: - if run_info.budget != 0: - raise ValueError( - 'If budget_type is None, budget must be.0, but is %f' % run_info.budget - ) - else: - if run_info.budget == 0: - # SMAC can return budget zero for intensifiers that don't have a concept - # of budget, for example a simple bayesian optimization intensifier. - # Budget determines how our pipeline trains, which can be via runtime or epochs - epochs_budget = self.pipeline_config.get('epochs', np.inf) - runtime_budget = self.pipeline_config.get('runtime', np.inf) - run_info = run_info._replace(budget=min(epochs_budget, runtime_budget)) - elif run_info.budget <= 0: - raise ValueError('Illegal value for budget, must be greater than zero but is %f' % - run_info.budget) - if self.budget_type not in ('epochs', 'runtime'): - raise ValueError("Illegal value for budget type, must be one of " - "('epochs', 'runtime'), but is : %s" % - self.budget_type) + # SMAC returns non-zero budget for intensification + # In other words, SMAC returns budget=0 for a simple intensifier (i.e. 
no intensification) + is_intensified = (run_info.budget != 0) + default_budget = self._check_and_get_default_budget() + + if self.budget_type is None and is_intensified: + raise ValueError(f'budget must be 0 (=no intensification) for budget_type=None, but got {run_info.budget}') + if self.budget_type is not None and run_info.budget < 0: + raise ValueError(f'budget must be greater than zero but got {run_info.budget}') + + if self.budget_type is not None and not is_intensified: + # The budget will be provided in train evaluator when budget_type is None + run_info = run_info._replace(budget=default_budget) remaining_time = self.stats.get_remaing_time_budget() @@ -245,6 +272,10 @@ def run_wrapper( self.logger.info("Starting to evaluate configuration %s" % run_info.config.config_id) run_info, run_value = super().run_wrapper(run_info=run_info) + + if not is_intensified: # It is required for the SMAC compatibility + run_info = run_info._replace(budget=0.0) + return run_info, run_value def run( diff --git a/autoPyTorch/evaluation/test_evaluator.py b/autoPyTorch/evaluation/test_evaluator.py new file mode 100644 index 000000000..4d5b0ae91 --- /dev/null +++ b/autoPyTorch/evaluation/test_evaluator.py @@ -0,0 +1,236 @@ +from multiprocessing.queues import Queue +from typing import Any, Dict, List, Optional, Tuple, Union + +from ConfigSpace.configuration_space import Configuration + +import numpy as np + +from smac.tae import StatusType + +from autoPyTorch.automl_common.common.utils.backend import Backend +from autoPyTorch.datasets.resampling_strategy import NoResamplingStrategyTypes +from autoPyTorch.evaluation.abstract_evaluator import ( + AbstractEvaluator, + fit_and_suppress_warnings +) +from autoPyTorch.evaluation.utils import DisableFileOutputParameters +from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric +from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates + + +__all__ = [ + 'eval_test_function', + 'TestEvaluator' +] + + +class TestEvaluator(AbstractEvaluator): + """ + This class builds a pipeline using the provided configuration. + A pipeline implementing the provided configuration is fitted + using the datamanager object retrieved from disc, via the backend. + After the pipeline is fitted, it is save to disc and the performance estimate + is communicated to the main process via a Queue. It is only compatible + with `NoResamplingStrategyTypes`, i.e, when the training data + is not split and the test set is used for SMBO optimisation. It can not + be used for building ensembles which is ensured by having + `output_y_hat_optimisation`=False + + Attributes: + backend (Backend): + An object to interface with the disk storage. In particular, allows to + access the train and test datasets + queue (Queue): + Each worker available will instantiate an evaluator, and after completion, + it will return the evaluation result via a multiprocessing queue + metric (autoPyTorchMetric): + A scorer object that is able to evaluate how good a pipeline was fit. It + is a wrapper on top of the actual score method (a wrapper on top of scikit + lean accuracy for example) that formats the predictions accordingly. + budget: (float): + The amount of epochs/time a configuration is allowed to run. + budget_type (str): + The budget type, which can be epochs or time + pipeline_config (Optional[Dict[str, Any]]): + Defines the content of the pipeline being evaluated. 
For example, it + contains pipeline specific settings like logging name, or whether or not + to use tensorboard. + configuration (Union[int, str, Configuration]): + Determines the pipeline to be constructed. A dummy estimator is created for + integer configurations, a traditional machine learning pipeline is created + for string based configuration, and NAS is performed when a configuration + object is passed. + seed (int): + A integer that allows for reproducibility of results + output_y_hat_optimization (bool): + Whether this worker should output the target predictions, so that they are + stored on disk. Fundamentally, the resampling strategy might shuffle the + Y_train targets, so we store the split in order to re-use them for ensemble + selection. + num_run (Optional[int]): + An identifier of the current configuration being fit. This number is unique per + configuration. + include (Optional[Dict[str, Any]]): + An optional dictionary to include components of the pipeline steps. + exclude (Optional[Dict[str, Any]]): + An optional dictionary to exclude components of the pipeline steps. + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): + Used as a list to pass more fine-grained + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + + + `y_optimization`: + do not save the predictions for the optimization set, + which would later on be used to build an ensemble. Note that SMAC + optimizes a metric evaluated on the optimization set. + + `pipeline`: + do not save any individual pipeline files + + `pipelines`: + In case of cross validation, disables saving the joint model of the + pipelines fit on each fold. + + `y_test`: + do not save the predictions for the test set. + + `all`: + do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. + init_params (Optional[Dict[str, Any]]): + Optional argument that is passed to each pipeline step. It is the equivalent of + kwargs for the pipeline steps. + logger_port (Optional[int]): + Logging is performed using a socket-server scheme to be robust against many + parallel entities that want to write to the same file. This integer states the + socket port for the communication channel. If None is provided, a traditional + logger is used. + all_supported_metrics (bool): + Whether all supported metric should be calculated for every configuration. 
+ search_space_updates (Optional[HyperparameterSearchSpaceUpdates]): + An object used to fine tune the hyperparameter search space of the pipeline + """ + def __init__( + self, + backend: Backend, queue: Queue, + metric: autoPyTorchMetric, + budget: float, + configuration: Union[int, str, Configuration], + budget_type: str = None, + pipeline_config: Optional[Dict[str, Any]] = None, + seed: int = 1, + output_y_hat_optimization: bool = False, + num_run: Optional[int] = None, + include: Optional[Dict[str, Any]] = None, + exclude: Optional[Dict[str, Any]] = None, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, + init_params: Optional[Dict[str, Any]] = None, + logger_port: Optional[int] = None, + all_supported_metrics: bool = True, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + ) -> None: + super().__init__( + backend=backend, + queue=queue, + configuration=configuration, + metric=metric, + seed=seed, + output_y_hat_optimization=output_y_hat_optimization, + num_run=num_run, + include=include, + exclude=exclude, + disable_file_output=disable_file_output, + init_params=init_params, + budget=budget, + budget_type=budget_type, + logger_port=logger_port, + all_supported_metrics=all_supported_metrics, + pipeline_config=pipeline_config, + search_space_updates=search_space_updates + ) + + if not isinstance(self.resampling_strategy, (NoResamplingStrategyTypes)): + raise ValueError( + f'resampling_strategy for TestEvaluator must be in ' + f'NoResamplingStrategyTypes, but got {self.resampling_strategy}' + ) + + def fit_predict_and_loss(self) -> None: + + split_id = 0 + train_indices, test_indices = self.splits[split_id] + + self.pipeline = self._get_pipeline() + X = {'train_indices': train_indices, + 'val_indices': test_indices, + 'split_id': split_id, + 'num_run': self.num_run, + **self.fit_dictionary} # fit dictionary + y = None + fit_and_suppress_warnings(self.logger, self.pipeline, X, y) + train_loss, _ = self.predict_and_loss(train=True) + test_loss, test_pred = self.predict_and_loss() + self.Y_optimization = self.y_test + self.finish_up( + loss=test_loss, + train_loss=train_loss, + opt_pred=test_pred, + valid_pred=None, + test_pred=test_pred, + file_output=True, + additional_run_info=None, + status=StatusType.SUCCESS, + ) + + def predict_and_loss( + self, train: bool = False + ) -> Tuple[Dict[str, float], np.ndarray]: + labels = self.y_train if train else self.y_test + feats = self.X_train if train else self.X_test + preds = self.predict_function( + X=feats, + pipeline=self.pipeline, + Y_train=self.y_train # Need this as we need to know all the classes in train splits + ) + loss_dict = self._loss(labels, preds) + + return loss_dict, preds + + +# create closure for evaluating an algorithm +def eval_test_function( + backend: Backend, + queue: Queue, + metric: autoPyTorchMetric, + budget: float, + config: Optional[Configuration], + seed: int, + output_y_hat_optimization: bool, + num_run: int, + include: Optional[Dict[str, Any]], + exclude: Optional[Dict[str, Any]], + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, + pipeline_config: Optional[Dict[str, Any]] = None, + budget_type: str = None, + init_params: Optional[Dict[str, Any]] = None, + logger_port: Optional[int] = None, + all_supported_metrics: bool = True, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + instance: str = None, +) -> None: + evaluator = TestEvaluator( + backend=backend, + queue=queue, + metric=metric, 
+ configuration=config, + seed=seed, + num_run=num_run, + output_y_hat_optimization=output_y_hat_optimization, + include=include, + exclude=exclude, + disable_file_output=disable_file_output, + init_params=init_params, + budget=budget, + budget_type=budget_type, + logger_port=logger_port, + all_supported_metrics=all_supported_metrics, + pipeline_config=pipeline_config, + search_space_updates=search_space_updates) + + evaluator.fit_predict_and_loss() diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py index 010948b55..9f5150889 100644 --- a/autoPyTorch/evaluation/train_evaluator.py +++ b/autoPyTorch/evaluation/train_evaluator.py @@ -14,15 +14,17 @@ CLASSIFICATION_TASKS, MULTICLASSMULTIOUTPUT, ) +from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes from autoPyTorch.evaluation.abstract_evaluator import ( AbstractEvaluator, fit_and_suppress_warnings ) +from autoPyTorch.evaluation.utils import DisableFileOutputParameters from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.utils.common import dict_repr, subsampler from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates -__all__ = ['TrainEvaluator', 'eval_function'] +__all__ = ['TrainEvaluator', 'eval_train_function'] def _get_y_array(y: np.ndarray, task_type: int) -> np.ndarray: @@ -39,7 +41,9 @@ class TrainEvaluator(AbstractEvaluator): A pipeline implementing the provided configuration is fitted using the datamanager object retrieved from disc, via the backend. After the pipeline is fitted, it is save to disc and the performance estimate - is communicated to the main process via a Queue. + is communicated to the main process via a Queue. It is only compatible + with `CrossValTypes`, `HoldoutValTypes`, i.e, when the training data + is split and the validation set is used for SMBO optimisation. Attributes: backend (Backend): @@ -79,10 +83,25 @@ class TrainEvaluator(AbstractEvaluator): An optional dictionary to include components of the pipeline steps. exclude (Optional[Dict[str, Any]]): An optional dictionary to exclude components of the pipeline steps. - disable_file_output (Union[bool, List[str]]): - By default, the model, it's predictions and other metadata is stored on disk - for each finished configuration. This argument allows the user to skip - saving certain file type, for example the model, from being written to disk. + disable_file_output (Optional[List[Union[str, DisableFileOutputParameters]]]): + Used as a list to pass more fine-grained + information on what to save. Must be a member of `DisableFileOutputParameters`. + Allowed elements in the list are: + + + `y_optimization`: + do not save the predictions for the optimization set, + which would later on be used to build an ensemble. Note that SMAC + optimizes a metric evaluated on the optimization set. + + `pipeline`: + do not save any individual pipeline files + + `pipelines`: + In case of cross validation, disables saving the joint model of the + pipelines fit on each fold. + + `y_test`: + do not save the predictions for the test set. + + `all`: + do not save any of the above. + For more information check `autoPyTorch.evaluation.utils.DisableFileOutputParameters`. init_params (Optional[Dict[str, Any]]): Optional argument that is passed to each pipeline step. It is the equivalent of kwargs for the pipeline steps. 
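Editor's note: TrainEvaluator is restricted to holdout and cross-validation strategies, while the new TestEvaluator handles NoResamplingStrategyTypes. Condensed into a standalone sketch (not part of the diff; the helper name is made up), the dispatch performed in ExecuteTaFuncWithQueue earlier in this diff looks like this:

    from autoPyTorch.datasets.resampling_strategy import (
        CrossValTypes,
        HoldoutValTypes,
        NoResamplingStrategyTypes,
    )
    from autoPyTorch.evaluation.test_evaluator import eval_test_function
    from autoPyTorch.evaluation.train_evaluator import eval_train_function


    def select_eval_function(resampling_strategy, output_y_hat_optimization: bool):
        # Holdout / cross-validation: optimise on the validation split and allow
        # the predictions to feed ensemble building.
        if isinstance(resampling_strategy, (HoldoutValTypes, CrossValTypes)):
            return eval_train_function, output_y_hat_optimization
        # No resampling: fit on the full training data, evaluate on the test set,
        # and never use those predictions for ensemble building.
        if isinstance(resampling_strategy, NoResamplingStrategyTypes):
            return eval_test_function, False
        raise ValueError(f"Unsupported resampling strategy: {resampling_strategy}")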
@@ -107,7 +126,7 @@ def __init__(self, backend: Backend, queue: Queue, num_run: Optional[int] = None, include: Optional[Dict[str, Any]] = None, exclude: Optional[Dict[str, Any]] = None, - disable_file_output: Union[bool, List] = False, + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, init_params: Optional[Dict[str, Any]] = None, logger_port: Optional[int] = None, keep_models: Optional[bool] = None, @@ -133,9 +152,12 @@ def __init__(self, backend: Backend, queue: Queue, search_space_updates=search_space_updates ) - self.splits = self.datamanager.splits - if self.splits is None: - raise AttributeError("Must have called create_splits on {}".format(self.datamanager.__class__.__name__)) + if not isinstance(self.resampling_strategy, (CrossValTypes, HoldoutValTypes)): + raise ValueError( + f'resampling_strategy for TrainEvaluator must be in ' + f'(CrossValTypes, HoldoutValTypes), but got {self.resampling_strategy}' + ) + self.num_folds: int = len(self.splits) self.Y_targets: List[Optional[np.ndarray]] = [None] * self.num_folds self.Y_train_targets: np.ndarray = np.ones(self.y_train.shape) * np.NaN @@ -254,10 +276,15 @@ def fit_predict_and_loss(self) -> None: # train_losses is a list of dicts. It is # computed using the target metric (self.metric). - train_loss = np.average([train_losses[i][str(self.metric)] - for i in range(self.num_folds)], - weights=train_fold_weights, - ) + train_loss = {} + for metric in train_losses[0].keys(): + train_loss[metric] = np.average( + [ + train_losses[i][metric] + for i in range(self.num_folds) + ], + weights=train_fold_weights + ) opt_loss = {} # self.logger.debug("OPT LOSSES: {}".format(opt_losses if opt_losses is not None else None)) @@ -381,25 +408,25 @@ def _predict(self, pipeline: BaseEstimator, # create closure for evaluating an algorithm -def eval_function( - backend: Backend, - queue: Queue, - metric: autoPyTorchMetric, - budget: float, - config: Optional[Configuration], - seed: int, - output_y_hat_optimization: bool, - num_run: int, - include: Optional[Dict[str, Any]], - exclude: Optional[Dict[str, Any]], - disable_file_output: Union[bool, List], - pipeline_config: Optional[Dict[str, Any]] = None, - budget_type: str = None, - init_params: Optional[Dict[str, Any]] = None, - logger_port: Optional[int] = None, - all_supported_metrics: bool = True, - search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, - instance: str = None, +def eval_train_function( + backend: Backend, + queue: Queue, + metric: autoPyTorchMetric, + budget: float, + config: Optional[Configuration], + seed: int, + output_y_hat_optimization: bool, + num_run: int, + include: Optional[Dict[str, Any]], + exclude: Optional[Dict[str, Any]], + disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None, + pipeline_config: Optional[Dict[str, Any]] = None, + budget_type: str = None, + init_params: Optional[Dict[str, Any]] = None, + logger_port: Optional[int] = None, + all_supported_metrics: bool = True, + search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None, + instance: str = None, ) -> None: """ This closure allows the communication between the ExecuteTaFuncWithQueue and the diff --git a/autoPyTorch/evaluation/utils.py b/autoPyTorch/evaluation/utils.py index 1bf93fa84..37e5fa36d 100644 --- a/autoPyTorch/evaluation/utils.py +++ b/autoPyTorch/evaluation/utils.py @@ -8,6 +8,9 @@ from smac.runhistory.runhistory import RunValue +from autoPyTorch.utils.common import autoPyTorchEnum + + __all__ = [ 
'read_queue', 'convert_multioutput_multiclass_to_multilabel', @@ -102,3 +105,40 @@ def _predict(self, X: np.ndarray) -> np.ndarray: predictions.append(pred.ravel()) return np.asarray(predictions).T + + +class DisableFileOutputParameters(autoPyTorchEnum): + """ + Contains literals that can be passed in to `disable_file_output` list. + These include: + + + `y_optimization`: + do not save the predictions for the optimization set, + which would later on be used to build an ensemble. Note that SMAC + optimizes a metric evaluated on the optimization set. + + `pipeline`: + do not save any individual pipeline files + + `pipelines`: + In case of cross validation, disables saving the joint model of the + pipelines fit on each fold. + + `y_test`: + do not save the predictions for the test set. + + `all`: + do not save any of the above. + """ + pipeline = 'pipeline' + pipelines = 'pipelines' + y_optimization = 'y_optimization' + y_test = 'y_test' + all = 'all' + + @classmethod + def check_compatibility( + cls, + disable_file_output: List[Union[str, 'DisableFileOutputParameters']] + ) -> None: + for item in disable_file_output: + if item not in cls.__members__ and not isinstance(item, cls): + raise ValueError(f"Expected {item} to be in the members (" + f"{list(cls.__members__.keys())}) of {cls.__name__}" + f" or as string value of a member.") diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index aa444c782..898afd7f5 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -18,11 +18,11 @@ from smac.utils.io.traj_logging import TrajEntry from autoPyTorch.automl_common.common.utils.backend import Backend -from autoPyTorch.datasets.base_dataset import BaseDataset from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, DEFAULT_RESAMPLING_PARAMETERS, HoldoutValTypes, + NoResamplingStrategyTypes ) from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilderManager from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash @@ -98,7 +98,9 @@ def __init__(self, pipeline_config: Dict[str, Any], start_num_run: int = 1, seed: int = 1, - resampling_strategy: Union[HoldoutValTypes, CrossValTypes] = HoldoutValTypes.holdout_validation, + resampling_strategy: Union[HoldoutValTypes, + CrossValTypes, + NoResamplingStrategyTypes] = HoldoutValTypes.holdout_validation, resampling_strategy_args: Optional[Dict[str, Any]] = None, include: Optional[Dict[str, Any]] = None, exclude: Optional[Dict[str, Any]] = None, @@ -191,9 +193,8 @@ def __init__(self, super(AutoMLSMBO, self).__init__() # data related self.dataset_name = dataset_name - self.datamanager: Optional[BaseDataset] = None self.metric = metric - self.task: Optional[str] = None + self.backend = backend self.all_supported_metrics = all_supported_metrics @@ -243,24 +244,17 @@ def __init__(self, self.initial_configurations: Optional[List[Configuration]] = None if portfolio_selection is not None: - self.initial_configurations = read_return_initial_configurations(config_space=config_space, - portfolio_selection=portfolio_selection) - - def reset_data_manager(self) -> None: - if self.datamanager is not None: - del self.datamanager - self.datamanager = self.backend.load_datamanager() - - if self.datamanager is not None and self.datamanager.task_type is not None: - self.task = self.datamanager.task_type + initial_configurations = read_return_initial_configurations(config_space=config_space, + portfolio_selection=portfolio_selection) + # incase we dont have any valid configuration from 
the portfolio + self.initial_configurations = initial_configurations \ + if len(initial_configurations) > 0 else None def run_smbo(self, func: Optional[Callable] = None ) -> Tuple[RunHistory, List[TrajEntry], str]: self.watcher.start_task('SMBO') self.logger.info("Started run of SMBO") - # == first things first: load the datamanager - self.reset_data_manager() # == Initialize non-SMBO stuff # first create a scenario diff --git a/autoPyTorch/pipeline/components/preprocessing/base_preprocessing.py b/autoPyTorch/pipeline/components/preprocessing/base_preprocessing.py index cfc1a890b..fb8bbdaa7 100644 --- a/autoPyTorch/pipeline/components/preprocessing/base_preprocessing.py +++ b/autoPyTorch/pipeline/components/preprocessing/base_preprocessing.py @@ -6,7 +6,7 @@ import pandas as pd -from scipy.sparse import csr_matrix +from scipy.sparse import spmatrix import torch @@ -24,7 +24,7 @@ def __init__(self) -> None: super().__init__() self.add_fit_requirements([ FitRequirement('X_train', - (np.ndarray, pd.DataFrame, csr_matrix), + (np.ndarray, pd.DataFrame, spmatrix), user_defined=True, dataset_property=False), FitRequirement('backend', (Backend, ), diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py index ea47e33b9..02a3085b0 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py @@ -1,7 +1,8 @@ -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Tuple, Union import numpy as np +from sklearn.base import BaseEstimator from sklearn.compose import ColumnTransformer from sklearn.pipeline import make_pipeline @@ -48,18 +49,25 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer": "TabularColumnTransformer": an instance of self """ self.check_requirements(X, y) - numerical_pipeline = 'drop' - categorical_pipeline = 'drop' preprocessors = get_tabular_preprocessers(X) - if len(X['dataset_properties']['numerical_columns']): + column_transformers: List[Tuple[str, BaseEstimator, List[int]]] = [] + if len(preprocessors['numerical']) > 0: numerical_pipeline = make_pipeline(*preprocessors['numerical']) - if len(X['dataset_properties']['categorical_columns']): + column_transformers.append( + ('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']) + ) + if len(preprocessors['categorical']) > 0: categorical_pipeline = make_pipeline(*preprocessors['categorical']) - - self.preprocessor = ColumnTransformer([ - ('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']), - ('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])], + column_transformers.append( + ('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns']) + ) + + # in case the preprocessing steps are disabled + # i.e, NoEncoder for categorical, we want to + # let the data in categorical columns pass through + self.preprocessor = ColumnTransformer( + column_transformers, remainder='passthrough' ) @@ -70,7 +78,13 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer": else: X_train = X['backend'].load_datamanager().train_tensors[0] - self.preprocessor.fit(X_train) + if 'y_train' in X: + y_train = 
subsampler(X['y_train'], X['train_indices']) + else: + y_train = X['backend'].load_datamanager().train_tensors[1] + + self.preprocessor.fit(X_train, y=y_train) + return self def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/MinorityCoalescer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/MinorityCoalescer.py new file mode 100644 index 000000000..69edfcbb6 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/MinorityCoalescer.py @@ -0,0 +1,44 @@ +from typing import Any, Dict, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import UniformFloatHyperparameter + +import numpy as np + +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter +from autoPyTorch.utils.implementations import MinorityCoalesceTransformer + + +class MinorityCoalescer(BaseCoalescer): + """Group together categories whose occurrence is less than a specified min_frac.""" + def __init__(self, min_frac: float, random_state: np.random.RandomState): + super().__init__() + self.min_frac = min_frac + self.random_state = random_state + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseCoalescer: + self.check_requirements(X, y) + self.preprocessor['categorical'] = MinorityCoalesceTransformer(min_frac=self.min_frac) + return self + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, Any]] = None, + min_frac: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_frac', + value_range=(1e-4, 0.5), + default_value=1e-2, + ), + ) -> ConfigurationSpace: + + cs = ConfigurationSpace() + add_hyperparameter(cs, min_frac, UniformFloatHyperparameter) + return cs + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'MinorityCoalescer', + 'name': 'MinorityCoalescer', + 'handles_sparse': False + } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/NoCoalescer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/NoCoalescer.py new file mode 100644 index 000000000..fdc13dec6 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/NoCoalescer.py @@ -0,0 +1,37 @@ +from typing import Any, Dict, Optional, Union + +import numpy as np + +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer + + +class NoCoalescer(BaseCoalescer): + def __init__(self, random_state: np.random.RandomState): + super().__init__() + self.random_state = random_state + self._processing = False + + def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseCoalescer: + """ + As no coalescing happens, only check the requirements. + + Args: + X (Dict[str, Any]): + fit dictionary + y (Optional[Any]): + Parameter to comply with scikit-learn API. Not used.
+ + Returns: + instance of self + """ + self.check_requirements(X, y) + + return self + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'NoCoalescer', + 'name': 'NoCoalescer', + 'handles_sparse': True + } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py new file mode 100644 index 000000000..1139106ce --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/__init__.py @@ -0,0 +1,254 @@ +import os +from collections import OrderedDict +from typing import Dict, List, Optional, Sequence + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import ConfigurationSpace + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice +from autoPyTorch.pipeline.components.base_component import ( + ThirdPartyComponents, + autoPyTorchComponent, + find_components, +) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.base_coalescer import BaseCoalescer +from autoPyTorch.utils.common import HyperparameterSearchSpace, HyperparameterValueType + + +coalescer_directory = os.path.split(__file__)[0] +_coalescer = find_components(__package__, + coalescer_directory, + BaseCoalescer) +_addons = ThirdPartyComponents(BaseCoalescer) + + +def add_coalescer(coalescer: BaseCoalescer) -> None: + _addons.add_component(coalescer) + + +class CoalescerChoice(autoPyTorchChoice): + """ + Allows for dynamically choosing the coalescer component at runtime + """ + proc_name = "coalescer" + + def get_components(self) -> Dict[str, autoPyTorchComponent]: + """Returns the available coalescer components + + Args: + None + + Returns: + Dict[str, autoPyTorchComponent]: all BaseCoalescer components available + as choices for coalescing the categorical columns + """ + # TODO: Create `@property def components(): ...`.
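+ # Editorial sketch, not part of the original patch: `_coalescer` holds the coalescers
+ # discovered in this directory (NoCoalescer, MinorityCoalescer), while `_addons` collects
+ # third-party components registered at runtime. Assuming a hypothetical user-defined
+ # component, registration would look roughly like:
+ #     class MyCoalescer(BaseCoalescer): ...   # hypothetical third-party coalescer
+ #     add_coalescer(MyCoalescer)              # afterwards listed by get_components()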
+ components = OrderedDict() + components.update(_coalescer) + components.update(_addons.components) + return components + + @staticmethod + def _get_default_choice( + avail_components: Dict[str, autoPyTorchComponent], + include: List[str], + exclude: List[str], + defaults: List[str] = ['NoCoalescer', 'MinorityCoalescer'], + ) -> str: + # TODO: Make it a base method + for choice in defaults: + if choice in avail_components and choice in include and choice not in exclude: + return choice + else: + raise RuntimeError( + f"Available components is either not included in `include` {include} or " + f"included in `exclude` {exclude}" + ) + + def _update_config_space( + self, + component: CSH.Hyperparameter, + avail_components: Dict[str, autoPyTorchComponent], + dataset_properties: Dict[str, BaseDatasetPropertiesType] + ) -> None: + # TODO: Make it a base method + cs = ConfigurationSpace() + cs.add_hyperparameter(component) + + # add only child hyperparameters of early_preprocessor choices + for name in component.choices: + updates = self._get_search_space_updates(prefix=name) + func4cs = avail_components[name].get_hyperparameter_search_space + + # search space provides different args, so ignore it + component_config_space = func4cs(dataset_properties, **updates) # type:ignore[call-arg] + parent_hyperparameter = {'parent': component, 'value': name} + cs.add_configuration_space( + name, + component_config_space, + parent_hyperparameter=parent_hyperparameter + ) + + self.configuration_space = cs + + def _check_choices_in_update( + self, + choices_in_update: Sequence[HyperparameterValueType], + avail_components: Dict[str, autoPyTorchComponent] + ) -> None: + # TODO: Make it a base method + if not set(choices_in_update).issubset(avail_components): + raise ValueError( + f"The update for {self.__class__.__name__} is expected to be " + f"a subset of {avail_components}, but got {choices_in_update}" + ) + + def get_hyperparameter_search_space(self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + default: Optional[str] = None, + include: Optional[List[str]] = None, + exclude: Optional[List[str]] = None) -> ConfigurationSpace: + # TODO: Make it a base method + + if dataset_properties is None: + dataset_properties = dict() + + dataset_properties = {**self.dataset_properties, **dataset_properties} + + avail_cmps = self.get_available_components( + dataset_properties=dataset_properties, + include=include, + exclude=exclude + ) + + if len(avail_cmps) == 0: + raise ValueError(f"No {self.proc_name} found, please add {self.proc_name} to `include` argument") + + include = include if include is not None else list(avail_cmps.keys()) + exclude = exclude if exclude is not None else [] + if default is None: + default = self._get_default_choice(avail_cmps, include, exclude) + + updates = self._get_search_space_updates() + if "__choice__" in updates: + component = self._get_component_with_updates( + updates=updates, + avail_components=avail_cmps, + dataset_properties=dataset_properties + ) + else: + component = self._get_component_without_updates( + default=default, + include=include, + avail_components=avail_cmps, + dataset_properties=dataset_properties + ) + + self.dataset_properties = dataset_properties + self._update_config_space( + component=component, + avail_components=avail_cmps, + dataset_properties=dataset_properties + ) + return self.configuration_space + + def _check_dataset_properties(self, dataset_properties: Dict[str, BaseDatasetPropertiesType]) -> None: + """ + A mechanism in 
code to ensure the correctness of the dataset_properties + It recursively makes sure that the children and parent level requirements + are honored. + + Args: + dataset_properties: + """ + # TODO: Make it a base method + super()._check_dataset_properties(dataset_properties) + if any(key not in dataset_properties for key in ['categorical_columns', 'numerical_columns']): + raise ValueError("Dataset properties must contain information about the type of columns") + + def _get_component_with_updates( + self, + updates: Dict[str, HyperparameterSearchSpace], + avail_components: Dict[str, autoPyTorchComponent], + dataset_properties: Dict[str, BaseDatasetPropertiesType], + ) -> CSH.Hyperparameter: + # TODO: Make it a base method + choice_key = '__choice__' + choices_in_update = updates[choice_key].value_range + default_in_update = updates[choice_key].default_value + self._check_choices_in_update( + choices_in_update=choices_in_update, + avail_components=avail_components + ) + self._check_update_compatiblity(choices_in_update, dataset_properties) + return CSH.CategoricalHyperparameter(choice_key, choices_in_update, default_in_update) + + def _get_component_without_updates( + self, + avail_components: Dict[str, autoPyTorchComponent], + dataset_properties: Dict[str, BaseDatasetPropertiesType], + default: str, + include: List[str] + ) -> CSH.Hyperparameter: + """ + A method to get a hyperparameter information for the component. + This method is run when we do not get updates from _get_search_space_updates. + + Args: + avail_components (Dict[str, autoPyTorchComponent]): + Available components for this processing. + dataset_properties (Dict[str, BaseDatasetPropertiesType]): + The properties of the dataset. + default (str): + The default component for this processing. + include (List[str]): + The components to include for the auto-pytorch searching. + + Returns: + (CSH.Hyperparameter): + The hyperparameter information for this processing. + """ + # TODO: Make an abstract method with NotImplementedError + choice_key = '__choice__' + no_proc_key = 'NoCoalescer' + choices = list(avail_components.keys()) + + assert isinstance(dataset_properties['categorical_columns'], list) # mypy check + if len(dataset_properties['categorical_columns']) == 0: + # only no coalescer is compatible if the dataset has only numericals + default, choices = no_proc_key, [no_proc_key] + if no_proc_key not in include: + raise ValueError("Only no coalescer is compatible for a dataset with no categorical column") + + return CSH.CategoricalHyperparameter(choice_key, choices, default_value=default) + + def _check_update_compatiblity( + self, + choices_in_update: Sequence[HyperparameterValueType], + dataset_properties: Dict[str, BaseDatasetPropertiesType] + ) -> None: + """ + Check the compatibility of the updates for the components + in this processing given dataset properties. + For example, some processing is not compatible with datasets + with no numerical columns. + We would like to check such compatibility in this method. + + Args: + choices_in_update (Sequence[HyperparameterValueType]): + The choices of components in updates + dataset_properties (Dict[str, BaseDatasetPropertiesType]): + The properties of the dataset. 
+ """ + # TODO: Make an abstract method with NotImplementedError + assert isinstance(dataset_properties['categorical_columns'], list) # mypy check + if len(dataset_properties['categorical_columns']) > 0: + # no restriction for update if dataset has categorical columns + return + + if 'NoCoalescer' not in choices_in_update or len(choices_in_update) != 1: + raise ValueError( + "Only no coalescer is compatible for a dataset with no categorical column, " + f"but got {choices_in_update}" + ) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py new file mode 100644 index 000000000..b572f8343 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py @@ -0,0 +1,33 @@ +from typing import Any, Dict, List + +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import ( + autoPyTorchTabularPreprocessingComponent +) +from autoPyTorch.utils.common import FitRequirement + + +class BaseCoalescer(autoPyTorchTabularPreprocessingComponent): + def __init__(self) -> None: + super().__init__() + self._processing = True + self.add_fit_requirements([ + FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True), + FitRequirement('categories', (List,), user_defined=True, dataset_property=True) + ]) + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + """ + Add the preprocessor to the provided fit dictionary `X`. + + Args: + X (Dict[str, Any]): fit dictionary in sklearn + + Returns: + X (Dict[str, Any]): the updated fit dictionary + """ + if self._processing and self.preprocessor['categorical'] is None: + # If we apply minority coalescer, we must have categorical preprocessor! + raise RuntimeError(f"fit() must be called before transform() on {self.__class__.__name__}") + + X.update({'coalescer': self.preprocessor}) + return X diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/ExtraTreesPreprocessorClassification.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/ExtraTreesPreprocessorClassification.py new file mode 100644 index 000000000..274cdc49a --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/ExtraTreesPreprocessorClassification.py @@ -0,0 +1,172 @@ +from typing import Any, Dict, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter, +) + +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.ensemble import ExtraTreesClassifier +from sklearn.feature_selection import SelectFromModel + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ + base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. 
\ + utils import NoneType_ +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, check_none + + +CRITERION_CHOICES = ("gini", "entropy") + + +class ExtraTreesPreprocessorClassification(autoPyTorchFeaturePreprocessingComponent): + """ + Select features based on importance weights calculated using extra trees + """ + def __init__(self, bootstrap: bool = True, n_estimators: int = 10, + criterion: str = "gini", max_features: float = 0.5, + max_depth: Union[int, NoneType_] = 5, min_samples_split: int = 2, + min_samples_leaf: int = 1, min_weight_fraction_leaf: float = 0, + max_leaf_nodes: Union[int, NoneType_] = "none", + min_impurity_decrease: float = 0, oob_score: bool = False, + verbose: int = 0, + random_state: Optional[np.random.RandomState] = None): + self.bootstrap = bootstrap + self.n_estimators = n_estimators + if criterion not in CRITERION_CHOICES: + raise ValueError(f"`criterion` of {self.__class__.__name__} " + f"must be in {CRITERION_CHOICES}, but got: {criterion}") + self.criterion = criterion + self.max_features = max_features + self.min_impurity_decrease = min_impurity_decrease + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_leaf_nodes = max_leaf_nodes + self.oob_score = oob_score + self.verbose = verbose + + super().__init__(random_state=random_state) + + def get_components_kwargs(self) -> Dict[str, Any]: + """ + returns keyword arguments required by the feature preprocessor + + Returns: + Dict[str, Any]: kwargs + """ + return dict( + bootstrap=self.bootstrap, + n_estimators=self.n_estimators, + criterion=self.criterion, + max_features=self.max_features, + min_impurity_decrease=self.min_impurity_decrease, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + min_weight_fraction_leaf=self.min_weight_fraction_leaf, + max_leaf_nodes=self.max_leaf_nodes, + oob_score=self.oob_score, + verbose=self.verbose, + random_state=self.random_state, + ) + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + + if check_none(self.max_leaf_nodes): + self.max_leaf_nodes = None + elif isinstance(self.max_leaf_nodes, int): + self.max_leaf_nodes = int(self.max_leaf_nodes) + else: + raise ValueError(f"Expected `max_leaf_nodes` to be either " + f"in ('None', 'none', None) or an integer, got {self.max_leaf_nodes}") + + if check_none(self.max_depth): + self.max_depth = None + elif isinstance(self.max_depth, int): + self.max_depth = int(self.max_depth) + else: + raise ValueError(f"Expected `max_depth` to be either " + f"in ('None', 'none', None) or an integer, got {self.max_depth}") + + # TODO: add class_weights + estimator = ExtraTreesClassifier(**self.get_components_kwargs()) + + self.preprocessor['numerical'] = SelectFromModel(estimator=estimator, + threshold='mean', + prefit=False) + return self + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + bootstrap: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='bootstrap', + value_range=(True, False), + default_value=True, + ), + n_estimators: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='n_estimators', + value_range=(10, 100), + default_value=10, + ), + max_depth: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_depth', + value_range=("none",), + 
default_value="none", + ), + max_features: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_features', + value_range=(0, 1), + default_value=0.5, + ), + min_impurity_decrease: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='min_impurity_decrease', + value_range=(0,), + default_value=0), + criterion: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='criterion', + value_range=CRITERION_CHOICES, + default_value="gini", + ), + min_samples_split: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_samples_split', + value_range=(2, 20), + default_value=2, + ), + min_samples_leaf: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_samples_leaf', + value_range=(1, 20), + default_value=1, + ), + min_weight_fraction_leaf: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='min_weight_fraction_leaf', + value_range=(0,), + default_value=0), + max_leaf_nodes: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_leaf_nodes', + value_range=("none",), + default_value="none", + ), + ) -> ConfigurationSpace: + + cs = ConfigurationSpace() + add_hyperparameter(cs, bootstrap, CategoricalHyperparameter) + add_hyperparameter(cs, n_estimators, UniformIntegerHyperparameter) + add_hyperparameter(cs, max_features, UniformFloatHyperparameter) + add_hyperparameter(cs, min_impurity_decrease, UniformFloatHyperparameter) + add_hyperparameter(cs, criterion, CategoricalHyperparameter) + add_hyperparameter(cs, max_depth, UniformIntegerHyperparameter) + add_hyperparameter(cs, min_samples_split, UniformIntegerHyperparameter) + add_hyperparameter(cs, min_samples_leaf, UniformIntegerHyperparameter) + add_hyperparameter(cs, min_weight_fraction_leaf, UniformFloatHyperparameter) + add_hyperparameter(cs, max_leaf_nodes, UniformIntegerHyperparameter) + + return cs + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: + return {'shortname': 'ETC', + 'name': 'Extra Trees Classifier Preprocessing', + 'handles_sparse': True, + 'handles_regression': False, + 'handles_classification': True + } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/ExtraTreesPreprocessorRegression.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/ExtraTreesPreprocessorRegression.py new file mode 100644 index 000000000..3c3db31cd --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/ExtraTreesPreprocessorRegression.py @@ -0,0 +1,175 @@ +from typing import Any, Dict, List, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, + UniformIntegerHyperparameter, +) + +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.ensemble import ExtraTreesRegressor +from sklearn.feature_selection import SelectFromModel + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ + base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. 
\ + utils import NoneType_ +from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter, check_none + + +CRITERION_CHOICES = ('mse', 'friedman_mse', 'mae') + + +class ExtraTreesPreprocessorRegression(autoPyTorchFeaturePreprocessingComponent): + """ + Selects features based on importance weights using extra trees + """ + def __init__(self, bootstrap: bool = True, n_estimators: int = 10, + criterion: str = "mse", max_features: float = 1, + max_depth: Union[int, NoneType_] = 5, min_samples_split: int = 2, + min_samples_leaf: int = 1, min_weight_fraction_leaf: float = 0, + max_leaf_nodes: Union[int, NoneType_] = "none", + oob_score: bool = False, verbose: int = 0, + random_state: Optional[np.random.RandomState] = None): + self.bootstrap = bootstrap + self.n_estimators = n_estimators + if criterion not in CRITERION_CHOICES: + raise ValueError(f"`criterion` of {self.__class__.__name__} " + f"must be in {CRITERION_CHOICES}, but got: {criterion}") + self.criterion = criterion + self.max_features = max_features + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_leaf_nodes = max_leaf_nodes + self.oob_score = oob_score + self.verbose = verbose + + super().__init__(random_state=random_state) + + self.add_fit_requirements([ + FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True)]) + + def get_components_kwargs(self) -> Dict[str, Any]: + """ + returns keyword arguments required by the feature preprocessor + + Returns: + Dict[str, Any]: kwargs + """ + return dict( + bootstrap=self.bootstrap, + n_estimators=self.n_estimators, + criterion=self.criterion, + max_features=self.max_features, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + min_weight_fraction_leaf=self.min_weight_fraction_leaf, + max_leaf_nodes=self.max_leaf_nodes, + oob_score=self.oob_score, + verbose=self.verbose, + random_state=self.random_state, + ) + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + + self.check_requirements(X, y) + + if check_none(self.max_leaf_nodes): + self.max_leaf_nodes = None + elif isinstance(self.max_leaf_nodes, int): + self.max_leaf_nodes = int(self.max_leaf_nodes) + else: + raise ValueError(f"Expected `max_leaf_nodes` to be either " + f"in ('None', 'none', None) or an integer, got {self.max_leaf_nodes}") + + if check_none(self.max_depth): + self.max_depth = None + elif isinstance(self.max_depth, int): + self.max_depth = int(self.max_depth) + else: + raise ValueError(f"Expected `max_depth` to be either " + f"in ('None', 'none', None) or an integer, got {self.max_depth}") + + num_features = len(X['dataset_properties']['numerical_columns']) + max_features = int( + float(self.max_features) * (np.log(num_features) + 1)) + # Use at most half of the features + max_features = max(1, min(int(num_features / 2), max_features)) + + # TODO: add class_weights + estimator = ExtraTreesRegressor(**self.get_components_kwargs()) + + self.preprocessor['numerical'] = SelectFromModel(estimator=estimator, + threshold='mean', + prefit=False) + return self + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + bootstrap: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='bootstrap', + value_range=(True, False), + default_value=True, + ), + 
n_estimators: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='n_estimators', + value_range=(100,), + default_value=100, + ), + max_depth: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_depth', + value_range=("none",), + default_value="none", + ), + max_features: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_features', + value_range=(0.1, 1), + default_value=1, + ), + criterion: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='criterion', + value_range=CRITERION_CHOICES, + default_value="mse", + ), + min_samples_split: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_samples_split', + value_range=(2, 20), + default_value=2, + ), + min_samples_leaf: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_samples_leaf', + value_range=(1, 20), + default_value=1, + ), + min_weight_fraction_leaf: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='min_weight_fraction_leaf', + value_range=(0,), + default_value=0), + max_leaf_nodes: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_leaf_nodes', + value_range=("none",), + default_value="none", + ), + ) -> ConfigurationSpace: + + cs = ConfigurationSpace() + add_hyperparameter(cs, bootstrap, CategoricalHyperparameter) + add_hyperparameter(cs, n_estimators, UniformIntegerHyperparameter) + add_hyperparameter(cs, max_features, UniformFloatHyperparameter) + add_hyperparameter(cs, criterion, CategoricalHyperparameter) + add_hyperparameter(cs, max_depth, UniformIntegerHyperparameter) + add_hyperparameter(cs, min_samples_split, UniformIntegerHyperparameter) + add_hyperparameter(cs, min_samples_leaf, UniformIntegerHyperparameter) + add_hyperparameter(cs, min_weight_fraction_leaf, UniformFloatHyperparameter) + add_hyperparameter(cs, max_leaf_nodes, UniformIntegerHyperparameter) + + return cs + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: + return {'shortname': 'ETR', + 'name': 'Extra Trees Regressor Preprocessing', + 'handles_sparse': True, + 'handles_regression': True, + 'handles_classification': False + } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/FastICA.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/FastICA.py new file mode 100644 index 000000000..bded9e093 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/FastICA.py @@ -0,0 +1,118 @@ +from typing import Any, Dict, Optional + +from ConfigSpace.conditions import EqualsCondition +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformIntegerHyperparameter, +) + +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.decomposition import FastICA as SklearnFastICA + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ + base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. 
\ + utils import percentage_value_range_to_integer_range +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter + + +class FastICA(autoPyTorchFeaturePreprocessingComponent): + """ + Reduce number of features by separating a multivariate signal into + additive subcomponents that are maximally independent. + + Args: + n_components (int): + Number of components to use + Note: + This number needs to be less than the total number of + features. To keep the hyperparameter search space general + to different datasets, autoPyTorch defines its value + range as the percentage of the number of features (in float). + This is then used to construct the range of n_components using + n_components = percentage of features * number of features. + Defaults to 100. + algorithm (str): + Apply parallel or deflational algorithm for FastICA. + Defaults to 'parallel'. + whiten (bool): + If whiten is false, the data is already considered to be whitened, + and no whitening is performed. Defaults to False. + fun (str): + The functional form of the G function used in the approximation to neg-entropy. + Defaults to "logcosh". + """ + def __init__(self, n_components: int = 100, + algorithm: str = 'parallel', whiten: bool = False, + fun: str = "logcosh", + random_state: Optional[np.random.RandomState] = None + ): + self.n_components = n_components + self.algorithm = algorithm + self.whiten = whiten + self.fun = fun + + super().__init__(random_state=random_state) + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + + self.check_requirements(X, y) + + self.preprocessor['numerical'] = SklearnFastICA( + n_components=self.n_components, algorithm=self.algorithm, + fun=self.fun, whiten=self.whiten, random_state=self.random_state) + + return self + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + n_components: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='n_components', + value_range=(0.5, 0.9), + default_value=0.5, + ), + algorithm: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='algorithm', + value_range=('parallel', 'deflation'), + default_value='parallel', + ), + whiten: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='whiten', + value_range=(True, False), + default_value=False, + ), + fun: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='fun', + value_range=('logcosh', 'exp', 'cube'), + default_value='logcosh', + ), + ) -> ConfigurationSpace: + n_components = percentage_value_range_to_integer_range( + hyperparameter_search_space=n_components, + default_value_range=(10, 2000), + default_value=100, + dataset_properties=dataset_properties, + ) + cs = ConfigurationSpace() + + n_components_hp = get_hyperparameter(n_components, UniformIntegerHyperparameter) + whiten_hp = get_hyperparameter(whiten, CategoricalHyperparameter) + add_hyperparameter(cs, algorithm, CategoricalHyperparameter) + add_hyperparameter(cs, fun, CategoricalHyperparameter) + cs.add_hyperparameter(whiten_hp) + + if True in whiten_hp.choices: + cs.add_hyperparameter(n_components_hp) + cs.add_condition(EqualsCondition(n_components_hp, whiten_hp, True)) + + return cs + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: + return {'shortname': 'FastICA', + 'name': 'Fast Independent Component Analysis', + 'handles_sparse': False, + 
'handles_classification': True, + 'handles_regression': True + } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/FeatureAgglomeration.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/FeatureAgglomeration.py new file mode 100644 index 000000000..63519e301 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/FeatureAgglomeration.py @@ -0,0 +1,129 @@ +from typing import Any, Callable, Dict, Optional + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.forbidden import ( + ForbiddenAndConjunction, + ForbiddenEqualsClause, + ForbiddenInClause +) +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformIntegerHyperparameter, +) + +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.cluster import FeatureAgglomeration as SklearnFeatureAgglomeration + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ + base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ + utils import percentage_value_range_to_integer_range +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter + + +class FeatureAgglomeration(autoPyTorchFeaturePreprocessingComponent): + """ + Recursively merge pair of clusters of features constructed + using agglomerative clustering. + + Args: + n_clusters (int): + The number of clusters to find. Defaults to 25. + Note: + This number needs to be less than the total number of + features. To keep the hyperparameter search space general + to different datasets, autoPyTorch defines its value + range as the percentage of the number of features (in float). + This is then used to construct the range of n_clusters using + n_clusters = percentage of features * number of features. + affinity (str): + Metric used to compute the linkage. If linkage is “ward”, only + “euclidean” is accepted. Defaults to 'euclidean'. + linkage (str): + Which linkage criterion to use. The linkage criterion determines + which distance to use between sets of features. Defaults to 'ward'. + pooling_func (str): + Combines the values of agglomerated features into a single value, + autoPyTorch uses (max, min and median) functions from numpy. Defaults to "max". 
+ """ + def __init__(self, n_clusters: int = 25, + affinity: str = 'euclidean', linkage: str = 'ward', + pooling_func: str = "max", + random_state: Optional[np.random.RandomState] = None + ): + self.n_clusters = n_clusters + self.affinity = affinity + self.linkage = linkage + self.pooling_func: Callable = getattr(np, pooling_func) + + super().__init__(random_state=random_state) + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + + self.check_requirements(X, y) + + self.preprocessor['numerical'] = SklearnFeatureAgglomeration( + n_clusters=self.n_clusters, affinity=self.affinity, + linkage=self.linkage, pooling_func=self.pooling_func) + + return self + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + n_clusters: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='n_clusters', + value_range=(0.5, 0.9), + default_value=0.5, + ), + affinity: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='affinity', + value_range=("euclidean", + "manhattan", + "cosine"), + default_value="euclidean", + ), + linkage: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='linkage', + value_range=("ward", "complete", "average"), + default_value="ward", + ), + pooling_func: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='pooling_func', + value_range=("mean", "median", "max"), + default_value="max", + ), + ) -> ConfigurationSpace: + n_clusters = percentage_value_range_to_integer_range( + hyperparameter_search_space=n_clusters, + default_value_range=(2, 400), + default_value=25, + dataset_properties=dataset_properties, + ) + cs = ConfigurationSpace() + + add_hyperparameter(cs, n_clusters, UniformIntegerHyperparameter) + affinity_hp = get_hyperparameter(affinity, CategoricalHyperparameter) + linkage_hp = get_hyperparameter(linkage, CategoricalHyperparameter) + add_hyperparameter(cs, pooling_func, CategoricalHyperparameter) + cs.add_hyperparameters([affinity_hp, linkage_hp]) + + # If linkage is “ward”, only “euclidean” is accepted. 
+ non_euclidian_affinity = [choice for choice in ["manhattan", "cosine"] if choice in affinity_hp.choices] + + if "ward" in linkage_hp.choices and len(non_euclidian_affinity) > 0: + forbidden_condition = ForbiddenAndConjunction( + ForbiddenInClause(affinity_hp, non_euclidian_affinity), + ForbiddenEqualsClause(linkage_hp, "ward") + ) + cs.add_forbidden_clause(forbidden_condition) + + return cs + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: + return {'shortname': 'FeatureAgglomeration', + 'name': 'Feature Agglomeration', + 'handles_sparse': False, + 'handles_classification': True, + 'handles_regression': True + } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/KernelPCA.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/KernelPCA.py index afa0334cb..f6a8db28f 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/KernelPCA.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/KernelPCA.py @@ -1,5 +1,4 @@ -from math import ceil, floor -from typing import Any, Dict, List, Optional +from typing import Any, Dict, Optional from ConfigSpace.conditions import EqualsCondition, InCondition from ConfigSpace.configuration_space import ConfigurationSpace @@ -17,10 +16,35 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ + utils import percentage_value_range_to_integer_range from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter class KernelPCA(autoPyTorchFeaturePreprocessingComponent): + """ + Non-linear dimensionality reduction through the use of kernels + + Args: + n_components (int): + Number of components. + Note: + This number needs to be less than the total number of + features. To keep the hyperparameter search space general + to different datasets, autoPyTorch defines its value + range as the percentage of the number of features (in float). + This is then used to construct the range of n_components using + n_components = percentage of features * number of features. + Defaults to 10. + kernel (str): + Kernel used for PCA. Defaults to 'rbf'. + degree (int): + Degree for poly kernels. Defaults to 3. + gamma (float): + Kernel coefficient for rbf, poly and sigmoid kernels. Defaults to 0.01. + coef0 (float): + Independent term in poly and sigmoid kernels. Defaults to 0.0. 
+ """ def __init__(self, n_components: int = 10, kernel: str = 'rbf', degree: int = 3, gamma: float = 0.01, coef0: float = 0.0, @@ -38,6 +62,8 @@ def __init__(self, n_components: int = 10, def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + self.check_requirements(X, y) + self.preprocessor['numerical'] = sklearn.decomposition.KernelPCA( n_components=self.n_components, kernel=self.kernel, degree=self.degree, gamma=self.gamma, coef0=self.coef0, @@ -72,24 +98,12 @@ def get_hyperparameter_search_space( cs = ConfigurationSpace() - if dataset_properties is not None: - n_features = len(dataset_properties['numerical_columns']) if isinstance( - dataset_properties['numerical_columns'], List) else 0 - if n_features == 1: - log = False - else: - log = n_components.log - n_components = HyperparameterSearchSpace(hyperparameter='n_components', - value_range=( - floor(float(n_components.value_range[0]) * n_features), - ceil(float(n_components.value_range[1]) * n_features)), - default_value=ceil(float(n_components.default_value) * n_features), - log=log) - else: - n_components = HyperparameterSearchSpace(hyperparameter='n_components', - value_range=(10, 2000), - default_value=100, - log=n_components.log) + n_components = percentage_value_range_to_integer_range( + hyperparameter_search_space=n_components, + default_value_range=(10, 2000), + default_value=100, + dataset_properties=dataset_properties, + ) add_hyperparameter(cs, n_components, UniformIntegerHyperparameter) kernel_hp = get_hyperparameter(kernel, CategoricalHyperparameter) @@ -121,5 +135,7 @@ def get_hyperparameter_search_space( def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: return {'shortname': 'KernelPCA', 'name': 'Kernel Principal Component Analysis', - 'handles_sparse': True + 'handles_sparse': True, + 'handles_classification': True, + 'handles_regression': True } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/LibLinearSVCPreprocessor.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/LibLinearSVCPreprocessor.py new file mode 100644 index 000000000..f9d0c996d --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/LibLinearSVCPreprocessor.py @@ -0,0 +1,141 @@ +from typing import Any, Dict, Optional + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.forbidden import ( + ForbiddenAndConjunction, + ForbiddenEqualsClause, +) +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, +) + +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.feature_selection import SelectFromModel +from sklearn.svm import LinearSVC + + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. 
\ + base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter + + +class LibLinearSVCPreprocessor(autoPyTorchFeaturePreprocessingComponent): + """ + Selects features based on importance weights using svm classifier + """ + def __init__(self, dual: bool = False, penalty: str = "l1", + loss: str = "squared_hinge", tol: float = 1e-4, + C: float = 1, multi_class: str = "ovr", + intercept_scaling: float = 1, fit_intercept: bool = True, + random_state: Optional[np.random.RandomState] = None): + + self.dual = dual + self.penalty = penalty + self.loss = loss + self.multi_class = multi_class + self.intercept_scaling = intercept_scaling + self.fit_intercept = fit_intercept + self.tol = tol + self.C = C + + super().__init__(random_state=random_state) + + def get_components_kwargs(self) -> Dict[str, Any]: + """ + returns keyword arguments required by the feature preprocessor + + Returns: + Dict[str, Any]: kwargs + """ + return dict( + dual=self.dual, + penalty=self.penalty, + loss=self.loss, + multi_class=self.multi_class, + intercept_scaling=self.intercept_scaling, + tol=self.tol, + fit_intercept=self.fit_intercept, + C=self.C, + random_state=self.random_state + ) + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + + self.check_requirements(X, y) + # TODO: add class_weights + estimator = LinearSVC(**self.get_components_kwargs()) + + self.preprocessor['numerical'] = SelectFromModel(estimator=estimator, + threshold='mean', + prefit=False) + return self + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: + return {'shortname': 'LinearSVC Preprocessor', + 'name': 'linear Support Vector Classification Preprocessing', + 'handles_sparse': True, + 'handles_classification': True, + 'handles_regression': False + } + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + penalty: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='penalty', + value_range=("l1",), + default_value="l1", + ), + loss: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='loss', + value_range=("squared_hinge", "hinge"), + default_value="squared_hinge", + ), + dual: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='dual', + value_range=(False,), + default_value=False, + ), + tol: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='tol', + value_range=(1e-5, 1e-1), + default_value=1e-4, + log=True + ), + C: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='C', + value_range=(0.03125, 32768), + default_value=1, + log=True + ), + multi_class: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='multi_class', + value_range=("ovr",), + default_value="ovr"), + fit_intercept: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='fit_intercept', + value_range=(True,), + default_value=True, + ), + intercept_scaling: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='intercept_scaling', + value_range=(1,), + default_value=1, + ), + ) -> ConfigurationSpace: + + cs = ConfigurationSpace() + add_hyperparameter(cs, fit_intercept, CategoricalHyperparameter) + penalty_hp = get_hyperparameter(penalty, CategoricalHyperparameter) + add_hyperparameter(cs, multi_class, CategoricalHyperparameter) + loss_hp = 
get_hyperparameter(loss, CategoricalHyperparameter) + add_hyperparameter(cs, dual, CategoricalHyperparameter) + add_hyperparameter(cs, tol, UniformFloatHyperparameter) + add_hyperparameter(cs, C, UniformFloatHyperparameter) + add_hyperparameter(cs, intercept_scaling, UniformFloatHyperparameter) + + cs.add_hyperparameters([loss_hp, penalty_hp]) + if "l1" in penalty_hp.choices and "hinge" in loss_hp.choices: + penalty_and_loss = ForbiddenAndConjunction( + ForbiddenEqualsClause(penalty_hp, "l1"), + ForbiddenEqualsClause(loss_hp, "hinge") + ) + cs.add_forbidden_clause(penalty_and_loss) + return cs diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/NoFeaturePreprocessor.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/NoFeaturePreprocessor.py index 9eb83a003..11e12e7df 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/NoFeaturePreprocessor.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/NoFeaturePreprocessor.py @@ -47,8 +47,9 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: @staticmethod def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None ) -> Dict[str, Union[str, bool]]: - return { - 'shortname': 'NoFeaturePreprocessing', - 'name': 'No Feature Preprocessing', - 'handles_sparse': True - } + return {'shortname': 'NoFeaturePreprocessing', + 'name': 'No Feature Preprocessing', + 'handles_sparse': True, + 'handles_classification': True, + 'handles_regression': True + } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/Nystroem.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/Nystroem.py index a0bd953cb..6fe2a617f 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/Nystroem.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/Nystroem.py @@ -1,5 +1,5 @@ -from math import ceil, floor -from typing import Any, Dict, List, Optional +import warnings +from typing import Any, Dict, Optional from ConfigSpace.conditions import EqualsCondition, InCondition from ConfigSpace.configuration_space import ConfigurationSpace @@ -17,10 +17,34 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent -from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ + utils import percentage_value_range_to_integer_range +from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter class Nystroem(autoPyTorchFeaturePreprocessingComponent): + """ + Construct an approximate feature map for an arbitrary kernel using a subset of the data as basis. + + Args: + n_components (int): + Note: + This number needs to be less than the total number of + features. To keep the hyperparameter search space general + to different datasets, autoPyTorch defines its value + range as the percentage of the number of features (in float). 
+ This is then used to construct the range of n_components using + n_components = percentage of features * number of features. Defaults to 10. + kernel (str): + Kernel map to be approximated. Defaults to 'rbf'. + degree (int): + Degree of the polynomial kernel. Defaults to 3. + gamma (float): + Gamma parameter for the RBF, laplacian, polynomial, exponential chi2 and + sigmoid kernels. Defaults to 0.01. + coef0 (float): + Zero coefficient for polynomial and sigmoid kernels. Defaults to 0.0. + """ def __init__(self, n_components: int = 10, kernel: str = 'rbf', degree: int = 3, gamma: float = 0.01, coef0: float = 0.0, @@ -32,6 +56,8 @@ def __init__(self, n_components: int = 10, self.gamma = gamma self.coef0 = coef0 super().__init__(random_state=random_state) + self.add_fit_requirements([ + FitRequirement('issigned', (bool,), user_defined=True, dataset_property=True)]) def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: @@ -50,7 +76,11 @@ def get_hyperparameter_search_space( default_value=0.5, ), kernel: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='kernel', - value_range=('poly', 'rbf', 'sigmoid', 'cosine'), + value_range=('poly', + 'rbf', + 'sigmoid', + 'cosine', + 'chi2'), default_value='rbf', ), gamma: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='gamma', @@ -69,27 +99,42 @@ def get_hyperparameter_search_space( cs = ConfigurationSpace() + n_components = percentage_value_range_to_integer_range( + hyperparameter_search_space=n_components, + default_value_range=(10, 2000), + default_value=100, + dataset_properties=dataset_properties, + ) + + add_hyperparameter(cs, n_components, UniformIntegerHyperparameter) + value_range = list(kernel.value_range) + + allow_chi = True + if dataset_properties is not None: - n_features = len(dataset_properties['numerical_columns']) \ - if isinstance(dataset_properties['numerical_columns'], List) else 0 - # if numerical features are 1, set log to False - if n_features == 1: - log = False + if ( + dataset_properties.get("issigned") + or dataset_properties.get("issparse") + ): + # chi kernel does not support negative numbers or + # a sparse matrix + allow_chi = False else: - log = n_components.log - n_components = HyperparameterSearchSpace(hyperparameter='n_components', - value_range=( - floor(float(n_components.value_range[0]) * n_features), - ceil(float(n_components.value_range[1]) * n_features)), - default_value=ceil(float(n_components.default_value) * n_features), - log=log) - else: - n_components = HyperparameterSearchSpace(hyperparameter='n_components', - value_range=(10, 2000), - default_value=100, - log=n_components.log) + allow_chi = True + if not allow_chi: + value_range = [value for value in value_range if value != "chi2"] + if len(value_range) == 0: + value_range = ["poly"] + + if value_range != list(kernel.value_range): + warnings.warn(f"Given choices for `score_func` are not compatible with the dataset. 
" + f"Updating choices to {value_range}") + + kernel = HyperparameterSearchSpace(hyperparameter='kernel', + value_range=value_range, + default_value=value_range[-1], + ) - add_hyperparameter(cs, n_components, UniformIntegerHyperparameter) kernel_hp = get_hyperparameter(kernel, CategoricalHyperparameter) gamma = get_hyperparameter(gamma, UniformFloatHyperparameter) coef0 = get_hyperparameter(coef0, UniformFloatHyperparameter) @@ -119,5 +164,8 @@ def get_hyperparameter_search_space( def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: return {'shortname': 'Nystroem', 'name': 'Nystroem kernel approximation', - 'handles_sparse': True + 'handles_sparse': True, + 'handles_classification': True, + 'handles_regression': True, + 'handles_signed': True } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/PCA.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/PCA.py new file mode 100644 index 000000000..1a64e1ed5 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/PCA.py @@ -0,0 +1,69 @@ +from typing import Any, Dict, Optional + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, +) + +import numpy as np + +import sklearn.decomposition +from sklearn.base import BaseEstimator + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ + base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent +from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter + + +class PCA(autoPyTorchFeaturePreprocessingComponent): + def __init__(self, keep_variance: float = 0.9999, + whiten: bool = False, + random_state: Optional[np.random.RandomState] = None + ): + self.keep_variance = keep_variance + self.whiten = whiten + super().__init__(random_state=random_state) + + self.add_fit_requirements([ + FitRequirement('issparse', (bool,), user_defined=True, dataset_property=True)]) + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + self.check_requirements(X, y) + + n_components = float(self.keep_variance) + self.preprocessor['numerical'] = sklearn.decomposition.PCA( + n_components=n_components, whiten=self.whiten, + random_state=self.random_state) + + return self + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + keep_variance: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='keep_variance', + value_range=(0.5, 0.9999), + default_value=0.9999, + log=True), + whiten: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='whiten', + value_range=(True, False), + default_value=False, + ), + ) -> ConfigurationSpace: + + cs = ConfigurationSpace() + + add_hyperparameter(cs, keep_variance, UniformFloatHyperparameter) + add_hyperparameter(cs, whiten, CategoricalHyperparameter) + + return cs + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: + return {'shortname': 'PCA', + 'name': 'Principal Component Analysis', + 'handles_sparse': False, + 'handles_classification': True, + 'handles_regression': True + } diff --git 
a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/PolynomialFeatures.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/PolynomialFeatures.py index 38ca15b1c..dfc085d24 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/PolynomialFeatures.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/PolynomialFeatures.py @@ -37,7 +37,10 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: return {'shortname': 'PolynomialFeatures', 'name': 'PolynomialFeatures', - 'handles_sparse': True} + 'handles_sparse': True, + 'handles_classification': True, + 'handles_regression': True + } @staticmethod def get_hyperparameter_search_space( diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/PowerTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/PowerTransformer.py deleted file mode 100644 index cb3eb2b54..000000000 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/PowerTransformer.py +++ /dev/null @@ -1,49 +0,0 @@ -from typing import Any, Dict, Optional - -from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import ( - CategoricalHyperparameter, -) - -import numpy as np - -import sklearn.preprocessing -from sklearn.base import BaseEstimator - -from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. 
\ - base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent -from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter - - -class PowerTransformer(autoPyTorchFeaturePreprocessingComponent): - def __init__(self, standardize: bool = True, - random_state: Optional[np.random.RandomState] = None): - self.standardize = standardize - - super().__init__(random_state=random_state) - - def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: - self.preprocessor['numerical'] = sklearn.preprocessing.PowerTransformer(method="yeo-johnson", - standardize=self.standardize, - copy=False) - return self - - @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: - return {'shortname': 'PowerTransformer', - 'name': 'Power Transformer', - 'handles_sparse': True} - - @staticmethod - def get_hyperparameter_search_space( - dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - standardize: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='standardize', - value_range=(True, False), - default_value=True, - ), - ) -> ConfigurationSpace: - cs = ConfigurationSpace() - add_hyperparameter(cs, standardize, CategoricalHyperparameter) - - return cs diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/RandomKitchenSinks.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/RandomKitchenSinks.py index 44cc169f4..cc0b1d628 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/RandomKitchenSinks.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/RandomKitchenSinks.py @@ -1,5 +1,4 @@ -from math import ceil, floor -from typing import Any, Dict, List, Optional +from typing import Any, Dict, Optional from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( @@ -15,10 +14,30 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ + utils import percentage_value_range_to_integer_range from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter class RandomKitchenSinks(autoPyTorchFeaturePreprocessingComponent): + """ + Approximate a RBF kernel feature map using random Fourier features. + + Args: + n_components (int): + Number of Monte Carlo samples per original feature. + Equals the dimensionality of the computed feature space. + Note: + This number needs to be less than the total number of + features. To keep the hyperparameter search space general + to different datasets, autoPyTorch defines its value + range as the percentage of the number of features (in float). + This is then used to construct the range of n_components using + n_components = percentage of features * number of features. + Defaults to 100. + gamma (float): + Parameter of RBF kernel: exp(-gamma * x^2). Defaults to 1.0. 
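        Example (illustrative sketch): the feature map configured here behaves like
        sklearn's RBFSampler (random Fourier features); that this is the exact
        underlying estimator is an assumption, since fit() is not part of this hunk.

            import numpy as np
            from sklearn.kernel_approximation import RBFSampler

            X = np.random.RandomState(0).rand(50, 4)                           # 4 original features
            sampler = RBFSampler(gamma=1.0, n_components=20, random_state=0)   # assumed backend
            X_mapped = sampler.fit_transform(X)                                # shape (50, 20)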
+ """ def __init__(self, n_components: int = 100, gamma: float = 1.0, random_state: Optional[np.random.RandomState] = None @@ -47,24 +66,12 @@ def get_hyperparameter_search_space( ) -> ConfigurationSpace: cs = ConfigurationSpace() - if dataset_properties is not None: - n_features = len(dataset_properties['numerical_columns']) \ - if isinstance(dataset_properties['numerical_columns'], List) else 0 - if n_features == 1: - log = False - else: - log = n_components.log - n_components = HyperparameterSearchSpace(hyperparameter='n_components', - value_range=( - floor(float(n_components.value_range[0]) * n_features), - ceil(float(n_components.value_range[1]) * n_features)), - default_value=ceil(float(n_components.default_value) * n_features), - log=log) - else: - n_components = HyperparameterSearchSpace(hyperparameter='n_components', - value_range=(10, 2000), - default_value=100, - log=n_components.log) + n_components = percentage_value_range_to_integer_range( + hyperparameter_search_space=n_components, + default_value_range=(10, 2000), + default_value=100, + dataset_properties=dataset_properties, + ) add_hyperparameter(cs, n_components, UniformIntegerHyperparameter) @@ -75,5 +82,7 @@ def get_hyperparameter_search_space( def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: return {'shortname': 'KitchenSink', 'name': 'Random Kitchen Sinks', - 'handles_sparse': True + 'handles_sparse': True, + 'handles_classification': True, + 'handles_regression': True } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/RandomTreesEmbedding.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/RandomTreesEmbedding.py new file mode 100644 index 000000000..10c92fdd1 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/RandomTreesEmbedding.py @@ -0,0 +1,104 @@ +from typing import Any, Dict, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + UniformIntegerHyperparameter +) + +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.ensemble import RandomTreesEmbedding as SklearnRandomTreesEmbedding + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ + base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. 
\ + utils import NoneType_ +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, check_none + + +class RandomTreesEmbedding(autoPyTorchFeaturePreprocessingComponent): + def __init__(self, n_estimators: int = 10, + max_depth: Union[int, NoneType_] = 5, min_samples_split: int = 2, + min_samples_leaf: int = 1, + max_leaf_nodes: Union[int, NoneType_] = "none", + sparse_output: bool = False, + random_state: Optional[np.random.RandomState] = None): + self.n_estimators = n_estimators + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.max_leaf_nodes = max_leaf_nodes + self.sparse_output = sparse_output + + super().__init__(random_state=random_state) + + def get_components_kwargs(self) -> Dict[str, Any]: + """ + returns keyword arguments required by the feature preprocessor + + Returns: + Dict[str, Any]: kwargs + """ + return dict( + n_estimators=self.n_estimators, + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + max_leaf_nodes=self.max_leaf_nodes, + sparse_output=self.sparse_output, + random_state=self.random_state + ) + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + if check_none(self.max_leaf_nodes): + self.max_leaf_nodes = None + if check_none(self.max_depth): + self.max_depth = None + + self.preprocessor['numerical'] = SklearnRandomTreesEmbedding(**self.get_components_kwargs()) + return self + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: + return {'shortname': 'RandomTreesEmbedding', + 'name': 'Random Trees Embedding', + 'handles_sparse': True, + 'handles_classification': True, + 'handles_regression': True + } + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + n_estimators: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='n_estimators', + value_range=(10, 100), + default_value=10, + ), + max_depth: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_depth', + value_range=(2, 10), + default_value=5, + ), + min_samples_split: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_samples_split', + value_range=(2, 20), + default_value=2, + ), + min_samples_leaf: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_samples_leaf', + value_range=(1, 20), + default_value=1, + ), + max_leaf_nodes: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_leaf_nodes', + value_range=("none",), + default_value="none", + ), + ) -> ConfigurationSpace: + + cs = ConfigurationSpace() + add_hyperparameter(cs, n_estimators, UniformIntegerHyperparameter) + add_hyperparameter(cs, max_depth, UniformIntegerHyperparameter) + add_hyperparameter(cs, min_samples_split, UniformIntegerHyperparameter) + add_hyperparameter(cs, min_samples_leaf, UniformIntegerHyperparameter) + add_hyperparameter(cs, max_leaf_nodes, UniformIntegerHyperparameter) + + return cs diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectPercentileClassification.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectPercentileClassification.py new file mode 100644 index 000000000..1ba4d5307 --- /dev/null +++ 
b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectPercentileClassification.py @@ -0,0 +1,88 @@ +from functools import partial +from typing import Any, Dict, Optional + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformIntegerHyperparameter, +) + +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.feature_selection import SelectPercentile, chi2, f_classif, mutual_info_classif + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ + base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing.utils \ + import filter_score_func_choices +from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter + + +SCORE_FUNC_CHOICES = ("chi2", "mutual_info_classif", "f_classif") + + +class SelectPercentileClassification(autoPyTorchFeaturePreprocessingComponent): + """ + Select features according to a percentile of the highest scores. + Scores are calculated using one of SCORE_FUNC_CHOICES + """ + def __init__(self, score_func: str = "chi2", + percentile: int = 50, + random_state: Optional[np.random.RandomState] = None + ): + self.percentile = percentile + if score_func == "chi2": + self.score_func = chi2 + elif score_func == "f_classif": + self.score_func = f_classif + elif score_func == "mutual_info_classif": + self.score_func = partial(mutual_info_classif, random_state=random_state) + else: + raise ValueError(f"score_func of {self.__class__.__name__} must be in {SCORE_FUNC_CHOICES}, " + "but is: {score_func}") + + super().__init__(random_state=random_state) + self.add_fit_requirements([ + FitRequirement('issigned', (bool,), user_defined=True, dataset_property=True)]) + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + + self.check_requirements(X, y) + + self.preprocessor['numerical'] = SelectPercentile( + percentile=self.percentile, score_func=self.score_func) + + return self + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + percentile: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="percentile", + value_range=(1, 99), + default_value=50, + ), + score_func: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="score_func", + value_range=SCORE_FUNC_CHOICES, + default_value="chi2", + ), + ) -> ConfigurationSpace: + score_func = filter_score_func_choices(class_name="SelectPercentileClassification", + dataset_properties=dataset_properties, + score_func=score_func) + cs = ConfigurationSpace() + + add_hyperparameter(cs, score_func, CategoricalHyperparameter) + add_hyperparameter(cs, percentile, UniformIntegerHyperparameter) + + return cs + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: + return {'shortname': 'SPC', + 'name': 'Select Percentile Classification', + 'handles_sparse': True, + 'handles_regression': False, + 'handles_classification': True + } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectPercentileRegression.py 
b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectPercentileRegression.py new file mode 100644 index 000000000..7a51b9f86 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectPercentileRegression.py @@ -0,0 +1,81 @@ +from functools import partial +from typing import Any, Dict, Optional + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformIntegerHyperparameter, +) + +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.feature_selection import SelectPercentile, f_regression, mutual_info_regression + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ + base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent +from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter + + +SCORE_FUNC_CHOICES = ('f_regression', 'mutual_info') + + +class SelectPercentileRegression(autoPyTorchFeaturePreprocessingComponent): + """ + Select features according to a percentile of the highest scores. + Scores are calculated using one of SCORE_FUNC_CHOICES + """ + def __init__(self, score_func: str = "f_regression", + percentile: int = 50, + random_state: Optional[np.random.RandomState] = None + ): + self.percentile = percentile + if score_func == "f_regression": + self.score_func = f_regression + elif score_func == "mutual_info": + self.score_func = partial(mutual_info_regression, random_state=random_state) + else: + raise ValueError(f"score_func of {self.__class__.__name__} must be in {SCORE_FUNC_CHOICES}, " + "but is: {score_func}") + + super().__init__(random_state=random_state) + self.add_fit_requirements([ + FitRequirement('issigned', (bool,), user_defined=True, dataset_property=True)]) + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + + self.check_requirements(X, y) + + self.preprocessor['numerical'] = SelectPercentile( + percentile=self.percentile, score_func=self.score_func) + + return self + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + percentile: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="percentile", + value_range=(1, 99), + default_value=50, + ), + score_func: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="score_func", + value_range=SCORE_FUNC_CHOICES, + default_value="f_regression", + ), + ) -> ConfigurationSpace: + cs = ConfigurationSpace() + + add_hyperparameter(cs, score_func, CategoricalHyperparameter) + add_hyperparameter(cs, percentile, UniformIntegerHyperparameter) + + return cs + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: + return {'shortname': 'SPR', + 'name': 'Select Percentile Regression', + 'handles_sparse': True, + 'handles_regression': True, + 'handles_classification': False + } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectRatesClassification.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectRatesClassification.py new file mode 100644 index 000000000..d760e3f6b --- /dev/null +++ 
b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectRatesClassification.py @@ -0,0 +1,109 @@ +from functools import partial +from typing import Any, Dict, Optional + +from ConfigSpace.conditions import NotEqualsCondition +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, +) + +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.feature_selection import GenericUnivariateSelect, chi2, f_classif, mutual_info_classif + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ + base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing.utils \ + import filter_score_func_choices +from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter, get_hyperparameter + + +SCORE_FUNC_CHOICES = ("chi2", "mutual_info_classif", "f_classif") + + +class SelectRatesClassification(autoPyTorchFeaturePreprocessingComponent): + """ + Univariate feature selector by selecting the best features based on + univariate statistical tests. Tests can be one of SCORE_FUNC_CHOICES + """ + def __init__(self, alpha: float = 0.1, + score_func: str = "chi2", + mode: str = "fpr", + random_state: Optional[np.random.RandomState] = None + ): + self.mode = mode + self.alpha = alpha + if score_func == "chi2": + self.score_func = chi2 + elif score_func == "f_classif": + self.score_func = f_classif + elif score_func == "mutual_info_classif": + self.score_func = partial(mutual_info_classif, + random_state=random_state) + # mutual info classif constantly crashes without mode percentile + self.mode = "percentile" + else: + raise ValueError(f"score_func of {self.__class__.__name__} must be in {SCORE_FUNC_CHOICES}, " + "but is: {score_func}") + + super().__init__(random_state=random_state) + self.add_fit_requirements([ + FitRequirement('issparse', (bool,), user_defined=True, dataset_property=True), + FitRequirement('issigned', (bool,), user_defined=True, dataset_property=True)]) + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + + self.check_requirements(X, y) + + self.preprocessor['numerical'] = GenericUnivariateSelect( + mode=self.mode, score_func=self.score_func, param=self.alpha) + + return self + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + alpha: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="alpha", + value_range=(0.01, 0.5), + default_value=0.1, + ), + mode: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="mode", + value_range=('fpr', 'fdr', 'fwe', "percentile"), + default_value='fpr', + ), + score_func: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="score_func", + value_range=SCORE_FUNC_CHOICES, + default_value="chi2", + ), + ) -> ConfigurationSpace: + + score_func = filter_score_func_choices(class_name="SelectPercentileClassification", + dataset_properties=dataset_properties, + score_func=score_func) + + cs = ConfigurationSpace() + + score_func_hp = get_hyperparameter(score_func, CategoricalHyperparameter) + add_hyperparameter(cs, alpha, UniformFloatHyperparameter) + mode_hp = get_hyperparameter(mode, 
CategoricalHyperparameter) + + cs.add_hyperparameters([mode_hp, score_func_hp]) + # mutual_info_classif constantly crashes if mode is not percentile + # as a WA, fix the mode for this score + if "mutual_info_classif" in score_func_hp.choices: + cond = NotEqualsCondition(mode_hp, score_func_hp, 'mutual_info_classif') + cs.add_condition(cond) + + return cs + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: + return {'shortname': 'SRC', + 'name': 'Select Rates Classification', + 'handles_sparse': True, + 'handles_regression': False, + 'handles_classification': True + } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectRatesRegression.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectRatesRegression.py new file mode 100644 index 000000000..f683e99c9 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/SelectRatesRegression.py @@ -0,0 +1,83 @@ +from typing import Any, Dict, Optional + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformFloatHyperparameter, +) + +import numpy as np + +from sklearn.base import BaseEstimator +from sklearn.feature_selection import GenericUnivariateSelect, f_regression + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ + base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter + + +SCORE_FUNC_CHOICES = ('f_regression',) + + +class SelectRatesRegression(autoPyTorchFeaturePreprocessingComponent): + """ + Univariate feature selector by selecting the best features based on + univariate statistical tests. 
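    Example (illustrative names and ranges, not the component's exact defaults): the
    condition added in SelectRatesClassification above deactivates `mode` whenever
    `score_func` is 'mutual_info_classif', matching the fallback to mode='percentile'
    in its __init__.

        from ConfigSpace.conditions import NotEqualsCondition
        from ConfigSpace.configuration_space import ConfigurationSpace
        from ConfigSpace.hyperparameters import CategoricalHyperparameter

        cs = ConfigurationSpace()
        score_func = CategoricalHyperparameter('score_func', ['chi2', 'f_classif', 'mutual_info_classif'])
        mode = CategoricalHyperparameter('mode', ['fpr', 'fdr', 'fwe', 'percentile'])
        cs.add_hyperparameters([score_func, mode])
        # `mode` is only active when score_func != 'mutual_info_classif'
        cs.add_condition(NotEqualsCondition(mode, score_func, 'mutual_info_classif'))
        print(cs.sample_configuration())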
Tests can be one of SCORE_FUNC_CHOICES + """ + def __init__(self, score_func: str = "f_regression", + alpha: float = 0.1, mode: str = "fpr", + random_state: Optional[np.random.RandomState] = None + ): + self.mode = mode + self.alpha = alpha + if score_func == "f_regression": + self.score_func = f_regression + else: + raise ValueError(f"score_func of {self.__class__.__name__} must be in {SCORE_FUNC_CHOICES}, " + "but is: {score_func}") + + super().__init__(random_state=random_state) + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: + + self.check_requirements(X, y) + + self.preprocessor['numerical'] = GenericUnivariateSelect( + mode=self.mode, score_func=self.score_func, param=self.alpha) + + return self + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + alpha: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="alpha", + value_range=(0.01, 0.5), + default_value=0.1, + ), + mode: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="mode", + value_range=('fpr', 'fdr', 'fwe'), + default_value='fpr', + ), + score_func: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="score_func", + value_range=SCORE_FUNC_CHOICES, + default_value="f_regression", + ), + ) -> ConfigurationSpace: + + cs = ConfigurationSpace() + + add_hyperparameter(cs, score_func, CategoricalHyperparameter) + add_hyperparameter(cs, alpha, UniformFloatHyperparameter) + add_hyperparameter(cs, mode, CategoricalHyperparameter) + + return cs + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: + return {'shortname': 'SRR', + 'name': 'Select Rates Regression', + 'handles_sparse': True, + 'handles_regression': True, + 'handles_classification': False + } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/TruncatedSVD.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/TruncatedSVD.py index 55576a58f..2b830c8ae 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/TruncatedSVD.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/TruncatedSVD.py @@ -1,5 +1,4 @@ -from math import floor -from typing import Any, Dict, List, Optional +from typing import Any, Dict, Optional from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( @@ -14,10 +13,26 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing \ .base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ + utils import percentage_value_range_to_integer_range from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter class TruncatedSVD(autoPyTorchFeaturePreprocessingComponent): + """ + Linear dimensionality reduction by means of truncated singular value decomposition (SVD). + + Args: + target_dim (int): + Desired dimensionality of output data. + Note: + This number needs to be less than the total number of + features. 
To keep the hyperparameter search space general + to different datasets, autoPyTorch defines its value + range as the percentage of the number of features (in float). + This is then used to construct the range of target_dim using + target_dim = percentage of features * number of features. Defaults to 128. + """ def __init__(self, target_dim: int = 128, random_state: Optional[np.random.RandomState] = None): self.target_dim = target_dim @@ -35,7 +50,9 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]: return {'shortname': 'TruncSVD', 'name': 'Truncated Singular Value Decomposition', - 'handles_sparse': True} + 'handles_sparse': True, + 'handles_classification': True, + 'handles_regression': True} @staticmethod def get_hyperparameter_search_space( @@ -47,19 +64,12 @@ def get_hyperparameter_search_space( ) -> ConfigurationSpace: cs = ConfigurationSpace() - if dataset_properties is not None: - n_features = len(dataset_properties['numerical_columns']) if isinstance( - dataset_properties['numerical_columns'], List) else 0 - target_dim = HyperparameterSearchSpace(hyperparameter=target_dim.hyperparameter, - value_range=(floor(float(target_dim.value_range[0]) * n_features), - floor(float(target_dim.value_range[1]) * n_features)), - default_value=floor(float(target_dim.default_value) * n_features), - log=target_dim.log) - else: - target_dim = HyperparameterSearchSpace(hyperparameter=target_dim.hyperparameter, - value_range=(10, 256), - default_value=128, - log=target_dim.log) + target_dim = percentage_value_range_to_integer_range( + hyperparameter_search_space=target_dim, + default_value_range=(10, 256), + default_value=128, + dataset_properties=dataset_properties, + ) add_hyperparameter(cs, target_dim, UniformIntegerHyperparameter) return cs diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/__init__.py index a3937a626..0e964ab56 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/__init__.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/__init__.py @@ -5,6 +5,7 @@ import ConfigSpace.hyperparameters as CSH from ConfigSpace.configuration_space import ConfigurationSpace +from autoPyTorch.constants import CLASSIFICATION_TASKS, REGRESSION_TASKS, STRING_TO_TASK_TYPES from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.base_component import ( @@ -46,6 +47,79 @@ def get_components(self) -> Dict[str, autoPyTorchComponent]: components.update(_addons.components) return components + def get_available_components( + self, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + include: List[str] = None, + exclude: List[str] = None, + ) -> Dict[str, autoPyTorchComponent]: + """Filters out components based on user provided + include/exclude directives, as well as the dataset properties + + Args: + include (Optional[Dict[str, Any]]): what hyper-parameter configurations + to honor when creating the configuration space + exclude (Optional[Dict[str, Any]]): what hyper-parameter configurations + to remove from the configuration space + dataset_properties (Optional[Dict[str, 
BaseDatasetPropertiesType]]): Characteristics + of the dataset to guide the pipeline choices of components + + Returns: + Dict[str, autoPyTorchComponent]: A filtered dict of learning + rate backbones + + """ + if dataset_properties is None: + dataset_properties = {} + + if include is not None and exclude is not None: + raise ValueError( + "The argument include and exclude cannot be used together.") + + available_comp = self.get_components() + + if include is not None: + for incl in include: + if incl not in available_comp: + raise ValueError("Trying to include unknown component: " + "%s" % incl) + + components_dict = OrderedDict() + for name in available_comp: + if include is not None and name not in include: + continue + elif exclude is not None and name in exclude: + continue + + entry = available_comp[name] + + # Exclude itself to avoid infinite loop + if entry is FeatureProprocessorChoice or hasattr(entry, 'get_components'): + continue + + task_type = str(dataset_properties['task_type']) + properties = entry.get_properties() + if ( + STRING_TO_TASK_TYPES[task_type] in CLASSIFICATION_TASKS + and not bool(properties['handles_classification']) + ): + continue + elif ( + STRING_TO_TASK_TYPES[task_type] in REGRESSION_TASKS + and not bool(properties['handles_regression']) + ): + continue + + # target_type = dataset_properties['target_type'] + # Apply some automatic filtering here for + # backbones based on the dataset! + # TODO: Think if there is any case where a preprocessor + # is not compatible for a certain dataset + + components_dict[name] = entry + + return components_dict + def get_hyperparameter_search_space(self, dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, default: Optional[str] = None, @@ -72,8 +146,16 @@ def get_hyperparameter_search_space(self, 'RandomKitchenSinks', 'Nystroem', 'PolynomialFeatures', - 'PowerTransformer', 'TruncatedSVD', + 'ExtraTreesPreprocessorClassification', + 'ExtraTreesPreprocessorRegression', + 'FeatureAgglomeration', + 'RandomTreesEmbedding', + 'SelectPercentileClassification', + 'SelectPercentileRegression', + 'SelectRatesClassification', + 'SelectRatesRegression', + 'LibLinearSVCPreprocessor' ] for default_ in defaults: if default_ in available_: diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/base_feature_preprocessor.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/base_feature_preprocessor.py index d11f69b90..eb576d472 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/base_feature_preprocessor.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/base_feature_preprocessor.py @@ -10,7 +10,8 @@ class autoPyTorchFeaturePreprocessingComponent(autoPyTorchTabularPreprocessingComponent): - _required_properties: List[str] = ['handles_sparse'] + _required_properties: List[str] = [ + 'handles_sparse', 'handles_classification', 'handles_regression'] def __init__(self, random_state: Optional[np.random.RandomState] = None): if random_state is None: @@ -30,7 +31,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: Returns: (Dict[str, Any]): the updated 'X' dictionary """ - if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None: + if self.preprocessor['numerical'] is None: raise AttributeError("{} can't tranform without fitting first" .format(self.__class__.__name__)) 
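# Illustrative sketch of the task-type filtering introduced in
# get_available_components above (the helper name is hypothetical, not part of
# this diff): a component is kept only when its properties flag support for
# the dataset's task family.
def _supports_task(properties: dict, is_classification: bool) -> bool:
    key = 'handles_classification' if is_classification else 'handles_regression'
    return bool(properties.get(key, False))

assert _supports_task({'handles_classification': True, 'handles_regression': False}, is_classification=True)
assert not _supports_task({'handles_classification': True, 'handles_regression': False}, is_classification=False)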
X.update({'feature_preprocessor': self.preprocessor}) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/utils.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/utils.py new file mode 100644 index 000000000..5d91ac2b6 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/utils.py @@ -0,0 +1,97 @@ +import warnings +from math import ceil, floor +from typing import Dict, List, Optional, Sequence + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.utils.common import HyperparameterSearchSpace, HyperparameterValueType + + +NoneType_ = Optional[str] # This typing is exclusively for Literal["none", "None", None] +# TODO: when we drop support for 3.7 use the following line +# NoneType_ = Optional[Literal["none", "None"]] + + +def filter_score_func_choices( + class_name: str, + score_func: HyperparameterSearchSpace, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None +) -> HyperparameterSearchSpace: + """ + In the context of select rates classification or select percentile classification, + some score functions are not compatible with sparse or signed data. + This function filters out those score function from the search space of the component + depending on the dataset. + + Args: + score_func (HyperparameterSearchSpace) + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]): + Information about the dataset. Defaults to None. + + Raises: + ValueError: + if none of the score function choices are incompatible with the dataset + + Returns: + HyperparameterSearchSpace: + updated score function search space + """ + value_range = list(score_func.value_range) + if dataset_properties is not None: + if dataset_properties.get("issigned", False): + value_range = [value for value in value_range if value not in ("chi2", "mutual_info_classif")] + if dataset_properties.get("issparse", False): + value_range = [value for value in value_range if value != "f_classif"] + + if sorted(value_range) != sorted(list(score_func.value_range)): + warnings.warn(f"Given choices for `score_func` are not compatible with the dataset. " + f"Updating choices to {value_range}") + + if len(value_range) == 0: + raise ValueError(f"`{class_name}` is not compatible with the" + f" current dataset as it is both `signed` and `sparse`") + default_value = score_func.default_value if score_func.default_value in value_range else value_range[-1] + score_func = HyperparameterSearchSpace(hyperparameter="score_func", + value_range=value_range, + default_value=default_value, + ) + return score_func + + +def percentage_value_range_to_integer_range( + hyperparameter_search_space: HyperparameterSearchSpace, + default_value_range: Sequence[HyperparameterValueType], + default_value: int, + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, +) -> HyperparameterSearchSpace: + """ + For some feature preprocessors, the value of an integer hyperparameter + needs to be lower than the number of features. To facilitate this, + autoPyTorch uses a value range based on the percentage of the number + of features. This function converts that hyperparameter search space + to an integer value range as is required by the underlying sklearn + preprocessors. 
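    Example (illustrative numbers, mirroring the floor/floor/ceil conversion in the
    body below): with 20 numerical columns, a percentage range of (0.1, 0.5) with
    default 0.25 becomes the integer range (2, 10) with default 5.

        from math import ceil, floor

        n_features = 20
        value_range = (0.1, 0.5)       # fractions of the number of features
        default_value = 0.25
        integer_range = (floor(value_range[0] * n_features), floor(value_range[1] * n_features))
        integer_default = ceil(default_value * n_features)
        assert integer_range == (2, 10) and integer_default == 5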
+ """ + hyperparameter_name = hyperparameter_search_space.hyperparameter + if dataset_properties is not None: + n_features = len(dataset_properties['numerical_columns']) if isinstance( + dataset_properties['numerical_columns'], List) else 0 + if n_features == 1: + # log=True is not supported in ConfigSpace when the value range consists of 0 + # raising ValueError: Negative lower bound (0) for log-scale hyperparameter is forbidden. + log = False + else: + log = hyperparameter_search_space.log + hyperparameter_search_space = HyperparameterSearchSpace( + hyperparameter=hyperparameter_name, + value_range=( + floor(float(hyperparameter_search_space.value_range[0]) * n_features), + floor(float(hyperparameter_search_space.value_range[1]) * n_features)), + default_value=ceil(float(hyperparameter_search_space.default_value) * n_features), + log=log) + else: + hyperparameter_search_space = HyperparameterSearchSpace(hyperparameter=hyperparameter_name, + value_range=default_value_range, + default_value=default_value, + log=hyperparameter_search_space.log) + + return hyperparameter_search_space diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py index ea09798ce..608ee8ec5 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/SimpleImputer.py @@ -1,9 +1,7 @@ from typing import Any, Dict, List, Optional, Union from ConfigSpace.configuration_space import ConfigurationSpace -from ConfigSpace.hyperparameters import ( - CategoricalHyperparameter -) +from ConfigSpace.hyperparameters import CategoricalHyperparameter import numpy as np @@ -16,91 +14,101 @@ class SimpleImputer(BaseImputer): """ - Impute missing values for categorical columns with '!missing!' - (In case of numpy data, the constant value is set to -1, under - the assumption that categorical data is fit with an Ordinal Scaler) + An imputer for numerical columns + + Attributes: + random_state (Optional[np.random.RandomState]): + The random state to use for the imputer. + numerical_strategy (str: default='mean'): + The strategy to use for imputing numerical columns. + Can be one of ['most_frequent', 'constant_!missing!'] """ - def __init__(self, - random_state: Optional[Union[np.random.RandomState, int]] = None, - numerical_strategy: str = 'mean', - categorical_strategy: str = 'most_frequent'): + def __init__( + self, + random_state: Optional[np.random.RandomState] = None, + numerical_strategy: str = 'mean', + ): super().__init__() self.random_state = random_state self.numerical_strategy = numerical_strategy - self.categorical_strategy = categorical_strategy - def fit(self, X: Dict[str, Any], y: Any = None) -> BaseImputer: + def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> BaseImputer: """ - The fit function calls the fit function of the underlying model - and returns the transformed array. + Builds the preprocessor based on the given fit dictionary 'X'. + Args: - X (np.ndarray): input features - y (Optional[np.ndarray]): input labels + X (Dict[str, Any]): + The fit dictionary + y (Optional[Any]): + Not Used -- to comply with API Returns: - instance of self + self: + returns an instance of self. 
""" self.check_requirements(X, y) - categorical_columns = X['dataset_properties']['categorical_columns'] \ - if isinstance(X['dataset_properties']['categorical_columns'], List) else [] - if len(categorical_columns) != 0: - if self.categorical_strategy == 'constant_!missing!': - self.preprocessor['categorical'] = SklearnSimpleImputer(strategy='constant', - # Train data is numpy - # as of this point, where - # Ordinal Encoding is using - # for categorical. Only - # Numbers are allowed - # fill_value='!missing!', - fill_value=-1, - copy=False) - else: - self.preprocessor['categorical'] = SklearnSimpleImputer(strategy=self.categorical_strategy, - copy=False) - numerical_columns = X['dataset_properties']['numerical_columns'] \ - if isinstance(X['dataset_properties']['numerical_columns'], List) else [] - if len(numerical_columns) != 0: + + # Choose an imputer for any numerical columns + numerical_columns = X['dataset_properties']['numerical_columns'] + + if isinstance(numerical_columns, List) and len(numerical_columns) > 0: if self.numerical_strategy == 'constant_zero': - self.preprocessor['numerical'] = SklearnSimpleImputer(strategy='constant', - fill_value=0, - copy=False) + imputer = SklearnSimpleImputer(strategy='constant', fill_value=0, copy=False) + self.preprocessor['numerical'] = imputer else: - self.preprocessor['numerical'] = SklearnSimpleImputer(strategy=self.numerical_strategy, copy=False) + imputer = SklearnSimpleImputer(strategy=self.numerical_strategy, copy=False) + self.preprocessor['numerical'] = imputer return self @staticmethod def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - numerical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='numerical_strategy', - value_range=("mean", "median", - "most_frequent", - "constant_zero"), - default_value="mean", - ), - categorical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace( - hyperparameter='categorical_strategy', - value_range=("most_frequent", - "constant_!missing!"), - default_value="most_frequent") + numerical_strategy: HyperparameterSearchSpace = HyperparameterSearchSpace( + hyperparameter='numerical_strategy', + value_range=("mean", "median", "most_frequent", "constant_zero"), + default_value="mean", + ), ) -> ConfigurationSpace: + """Get the hyperparameter search space for the SimpleImputer + + Args: + dataset_properties (Optional[Dict[str, BaseDatasetPropertiesType]]) + Properties that describe the dataset + Note: Not actually Optional, just adhering to its supertype + numerical_strategy (HyperparameterSearchSpace: default = ...) 
+ The strategy to use for numerical imputation + + Returns: + ConfigurationSpace + The space of possible configurations for a SimpleImputer with the given + `dataset_properties` + """ cs = ConfigurationSpace() - assert dataset_properties is not None, "To create hyperparameter search space" \ - ", dataset_properties should not be None" - if len(dataset_properties['numerical_columns']) \ - if isinstance(dataset_properties['numerical_columns'], List) else 0 != 0: - add_hyperparameter(cs, numerical_strategy, CategoricalHyperparameter) - if len(dataset_properties['categorical_columns']) \ - if isinstance(dataset_properties['categorical_columns'], List) else 0 != 0: - add_hyperparameter(cs, categorical_strategy, CategoricalHyperparameter) + if dataset_properties is None: + raise ValueError("SimpleImputer requires `dataset_properties` for generating" + " a search space.") + + if ( + isinstance(dataset_properties['numerical_columns'], List) + and len(dataset_properties['numerical_columns']) != 0 + ): + add_hyperparameter(cs, numerical_strategy, CategoricalHyperparameter) return cs @staticmethod - def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None - ) -> Dict[str, Union[str, bool]]: + def get_properties( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + """Get the properties of the SimpleImputer class and what it can handle + + Returns: + Dict[str, Union[str, bool]]: + A dict from property names to values + """ return { 'shortname': 'SimpleImputer', 'name': 'Simple Imputer', diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py index b65f3c229..1f33a765a 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/imputation/base_imputer.py @@ -14,8 +14,7 @@ class BaseImputer(autoPyTorchTabularPreprocessingComponent): def __init__(self) -> None: super().__init__() self.add_fit_requirements([ - FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True), - FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True)]) + FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True)]) def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """ @@ -26,7 +25,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: Returns: (Dict[str, Any]): the updated 'X' dictionary """ - if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None: + if self.preprocessor['numerical'] is None and len(X["dataset_properties"]["numerical_columns"]) != 0: raise ValueError("cant call transform on {} without fitting first." 
.format(self.__class__.__name__)) X.update({'imputer': self.preprocessor}) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/PowerTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/PowerTransformer.py new file mode 100644 index 000000000..7dd2502f9 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/PowerTransformer.py @@ -0,0 +1,38 @@ +from typing import Any, Dict, Optional, Union + +import numpy as np + +from sklearn.preprocessing import PowerTransformer as SklearnPowerTransformer + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler + + +class PowerTransformer(BaseScaler): + """ + Map data to as close to a Gaussian distribution as possible + in order to reduce variance and minimize skewness. + + Uses `yeo-johnson` power transform method. Also, data is normalised + to zero mean and unit variance. + """ + def __init__(self, + random_state: Optional[np.random.RandomState] = None): + super().__init__() + self.random_state = random_state + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: + + self.check_requirements(X, y) + + self.preprocessor['numerical'] = SklearnPowerTransformer(method='yeo-johnson', copy=False) + return self + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'PowerTransformer', + 'name': 'PowerTransformer', + 'handles_sparse': False + } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/QuantileTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/QuantileTransformer.py new file mode 100644 index 000000000..cc0b4fa7a --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/QuantileTransformer.py @@ -0,0 +1,73 @@ +from typing import Any, Dict, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + CategoricalHyperparameter, + UniformIntegerHyperparameter +) + +import numpy as np + +from sklearn.preprocessing import QuantileTransformer as SklearnQuantileTransformer + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter + + +class QuantileTransformer(BaseScaler): + """ + Transform the features to follow a uniform or a normal distribution + using quantiles information. 
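    Example (illustrative data): the wrapped sklearn transformer maps skewed
    features to an approximately normal distribution.

        import numpy as np
        from sklearn.preprocessing import QuantileTransformer as SklearnQuantileTransformer

        X = np.random.RandomState(0).exponential(size=(300, 2))   # skewed input
        qt = SklearnQuantileTransformer(n_quantiles=100, output_distribution='normal')
        X_gauss = qt.fit_transform(X)   # roughly standard-normal per feature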
+ + For more details of each attribute, see: + https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html + """ + def __init__( + self, + n_quantiles: int = 1000, + output_distribution: str = "normal", # Literal["normal", "uniform"] + random_state: Optional[np.random.RandomState] = None + ): + super().__init__() + self.random_state = random_state + self.n_quantiles = n_quantiles + self.output_distribution = output_distribution + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: + + self.check_requirements(X, y) + + self.preprocessor['numerical'] = SklearnQuantileTransformer(n_quantiles=self.n_quantiles, + output_distribution=self.output_distribution, + copy=False) + return self + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + n_quantiles: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="n_quantiles", + value_range=(10, 2000), + default_value=1000, + ), + output_distribution: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="output_distribution", + value_range=("uniform", "normal"), + default_value="normal", + ) + ) -> ConfigurationSpace: + cs = ConfigurationSpace() + + # TODO parametrize like the Random Forest as n_quantiles = n_features^param + add_hyperparameter(cs, n_quantiles, UniformIntegerHyperparameter) + add_hyperparameter(cs, output_distribution, CategoricalHyperparameter) + + return cs + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'QuantileTransformer', + 'name': 'QuantileTransformer', + 'handles_sparse': False + } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/RobustScaler.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/RobustScaler.py new file mode 100644 index 000000000..2c59d77c2 --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/RobustScaler.py @@ -0,0 +1,73 @@ +from typing import Any, Dict, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + UniformFloatHyperparameter, +) + +import numpy as np + +from sklearn.preprocessing import RobustScaler as SklearnRobustScaler + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler +from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter + + +class RobustScaler(BaseScaler): + """ + Remove the median and scale features according to the quantile_range to make + the features robust to outliers. 
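    Example (illustrative data; quantile_range below uses sklearn's percentile
    convention and is not this component's q_min/q_max default): centering is
    disabled for sparse inputs, as in fit() below.

        import numpy as np
        from sklearn.preprocessing import RobustScaler as SklearnRobustScaler

        X = np.array([[1.0], [2.0], [100.0]])   # contains an outlier
        scaler = SklearnRobustScaler(quantile_range=(25.0, 75.0), with_centering=True)
        print(scaler.fit_transform(X))          # median removed, scaled by the IQR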
+ + For more details of the preprocessor, see: + https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html + """ + def __init__( + self, + q_min: float = 0.25, + q_max: float = 0.75, + random_state: Optional[np.random.RandomState] = None + ): + super().__init__() + self.add_fit_requirements([ + FitRequirement('issparse', (bool,), user_defined=True, dataset_property=True)]) + self.random_state = random_state + self.q_min = q_min + self.q_max = q_max + + def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler: + + self.check_requirements(X, y) + with_centering = bool(not X['dataset_properties']['issparse']) + + self.preprocessor['numerical'] = SklearnRobustScaler(quantile_range=(self.q_min, self.q_max), + with_centering=with_centering, + copy=False) + + return self + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + q_min: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="q_min", + value_range=(0.001, 0.3), + default_value=0.25), + q_max: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="q_max", + value_range=(0.7, 0.999), + default_value=0.75) + ) -> ConfigurationSpace: + cs = ConfigurationSpace() + + add_hyperparameter(cs, q_min, UniformFloatHyperparameter) + add_hyperparameter(cs, q_max, UniformFloatHyperparameter) + + return cs + + @staticmethod + def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + return { + 'shortname': 'RobustScaler', + 'name': 'RobustScaler', + 'handles_sparse': True + } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/__init__.py index 082b17cb9..d4d3ffeb5 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/__init__.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/__init__.py @@ -66,9 +66,21 @@ def get_hyperparameter_search_space(self, raise ValueError("no scalers found, please add a scaler") if default is None: - defaults = ['StandardScaler', 'Normalizer', 'MinMaxScaler', 'NoScaler'] + defaults = [ + 'StandardScaler', + 'Normalizer', + 'MinMaxScaler', + 'PowerTransformer', + 'QuantileTransformer', + 'RobustScaler', + 'NoScaler' + ] for default_ in defaults: if default_ in available_scalers: + if include is not None and default_ not in include: + continue + if exclude is not None and default_ in exclude: + continue default = default_ break diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py new file mode 100644 index 000000000..e5e71ea1e --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py @@ -0,0 +1,44 @@ +from typing import Any, Dict, Optional, Union + +import numpy as np + +from sklearn.feature_selection import VarianceThreshold as SklearnVarianceThreshold + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import \ + autoPyTorchTabularPreprocessingComponent + + +class VarianceThreshold(autoPyTorchTabularPreprocessingComponent): + """ + Removes 
features that have the same value in the training data. + """ + def __init__(self, random_state: Optional[np.random.RandomState] = None): + super().__init__() + + def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> 'VarianceThreshold': + + self.check_requirements(X, y) + + self.preprocessor['numerical'] = SklearnVarianceThreshold( + threshold=0.0 + ) + return self + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + if self.preprocessor['numerical'] is None: + raise ValueError("cannot call transform on {} without fitting first." + .format(self.__class__.__name__)) + X.update({'variance_threshold': self.preprocessor}) + return X + + @staticmethod + def get_properties( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + + return { + 'shortname': 'Variance Threshold', + 'name': 'Variance Threshold (constant feature removal)', + 'handles_sparse': True, + } diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py index 7fbf33f99..aa2b4c25f 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py @@ -6,7 +6,7 @@ import pandas as pd -from scipy.sparse import csr_matrix +from scipy.sparse import spmatrix from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent @@ -21,7 +21,7 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None) -> None self.random_state = random_state self.add_fit_requirements([ FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), - FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True, + FitRequirement('X_train', (np.ndarray, pd.DataFrame, spmatrix), user_defined=True, dataset_property=False)]) def fit(self, X: Dict[str, Any], y: Any = None) -> "EarlyPreprocessing": diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py index 1a04d6645..7ff914a98 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py @@ -5,7 +5,7 @@ import pandas as pd -from scipy.sparse import csr_matrix +from scipy.sparse import spmatrix import torch from torch import nn @@ -29,7 +29,7 @@ def __init__(self, super().__init__() self.add_fit_requirements([ FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), - FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True, + FitRequirement('X_train', (np.ndarray, pd.DataFrame, spmatrix), user_defined=True, dataset_property=False), FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), FitRequirement('tabular_transformer', (BaseEstimator,), user_defined=False, dataset_property=False), diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py 
b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index 8652c347c..2f3c5fb3c 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -44,7 +44,7 @@ def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]: num_numerical_columns = numerical_column_transformer.transform( X_train[:, X['dataset_properties']['numerical_columns']]).shape[1] num_input_features = np.zeros((num_numerical_columns + len(X['dataset_properties']['categorical_columns'])), - dtype=int) + dtype=np.int32) categories = X['dataset_properties']['categories'] for i, category in enumerate(categories): diff --git a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py index f39194477..365213bae 100644 --- a/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py +++ b/autoPyTorch/pipeline/components/training/data_loader/base_data_loader.py @@ -106,7 +106,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: # This parameter indicates that the data has been pre-processed for speed # Overwrite the datamanager with the pre-processes data datamanager.replace_data(X['X_train'], X['X_test'] if 'X_test' in X else None) - train_dataset, val_dataset = datamanager.get_dataset_for_training(split_id=X['split_id']) + + train_dataset = datamanager.get_dataset(split_id=X['split_id'], train=True) self.train_data_loader = torch.utils.data.DataLoader( train_dataset, @@ -118,15 +119,17 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader: collate_fn=custom_collate_fn, ) - self.val_data_loader = torch.utils.data.DataLoader( - val_dataset, - batch_size=min(self.batch_size, len(val_dataset)), - shuffle=False, - num_workers=X.get('num_workers', 0), - pin_memory=X.get('pin_memory', True), - drop_last=X.get('drop_last', False), - collate_fn=custom_collate_fn, - ) + if X.get('val_indices', None) is not None: + val_dataset = datamanager.get_dataset(split_id=X['split_id'], train=False) + self.val_data_loader = torch.utils.data.DataLoader( + val_dataset, + batch_size=min(self.batch_size, len(val_dataset)), + shuffle=False, + num_workers=X.get('num_workers', 0), + pin_memory=X.get('pin_memory', True), + drop_last=X.get('drop_last', True), + collate_fn=custom_collate_fn, + ) if X.get('X_test', None) is not None: self.test_data_loader = self.get_loader(X=X['X_test'], @@ -184,7 +187,6 @@ def get_val_data_loader(self) -> torch.utils.data.DataLoader: Returns: torch.utils.data.DataLoader: A validation data loader """ - assert self.val_data_loader is not None, "No val data loader fitted" return self.val_data_loader def get_test_data_loader(self) -> torch.utils.data.DataLoader: diff --git a/autoPyTorch/pipeline/components/training/trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/__init__.py index e54006d10..c1008b3ba 100755 --- a/autoPyTorch/pipeline/components/training/trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/__init__.py @@ -66,6 +66,7 @@ def __init__(self, random_state=random_state) self.run_summary: Optional[RunSummary] = None self.writer: Optional[SummaryWriter] = None + self.early_stopping_split_type: Optional[str] = None self._fit_requirements: Optional[List[FitRequirement]] = [ FitRequirement("lr_scheduler", (_LRScheduler,), user_defined=False, 
dataset_property=False), FitRequirement("num_run", (int,), user_defined=False, dataset_property=False), @@ -277,6 +278,11 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic optimize_metric=None if not X['metrics_during_training'] else X.get('optimize_metric'), ) + if X['val_data_loader'] is not None: + self.early_stopping_split_type = 'val' + else: + self.early_stopping_split_type = 'train' + epoch = 1 while True: @@ -293,9 +299,17 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic writer=writer, ) + # its fine if train_loss is None due to `is_max_time_reached()` + if train_loss is None: + if self.budget_tracker.is_max_time_reached(): + break + else: + raise RuntimeError("Got an unexpected None in `train_loss`.") + val_loss, val_metrics, test_loss, test_metrics = None, {}, None, {} if self.eval_valid_each_epoch(X): - val_loss, val_metrics = self.choice.evaluate(X['val_data_loader'], epoch, writer) + if X['val_data_loader']: + val_loss, val_metrics = self.choice.evaluate(X['val_data_loader'], epoch, writer) if 'test_data_loader' in X and X['test_data_loader']: test_loss, test_metrics = self.choice.evaluate(X['test_data_loader'], epoch, writer) @@ -334,9 +348,13 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic if 'cuda' in X['device']: torch.cuda.empty_cache() + if self.run_summary.is_empty(): + raise RuntimeError("Budget exhausted without finishing an epoch.") + # wrap up -- add score if not evaluating every epoch if not self.eval_valid_each_epoch(X): - val_loss, val_metrics = self.choice.evaluate(X['val_data_loader'], epoch, writer) + if X['val_data_loader']: + val_loss, val_metrics = self.choice.evaluate(X['val_data_loader'], epoch, writer) if 'test_data_loader' in X and X['val_data_loader']: test_loss, test_metrics = self.choice.evaluate(X['test_data_loader'], epoch, writer) self.run_summary.add_performance( @@ -372,14 +390,17 @@ def _load_best_weights_and_clean_checkpoints(self, X: Dict[str, Any]) -> None: """ assert self.checkpoint_dir is not None # mypy assert self.run_summary is not None # mypy + assert self.early_stopping_split_type is not None # mypy best_path = os.path.join(self.checkpoint_dir, 'best.pth') - self.logger.debug(f" Early stopped model {X['num_run']} on epoch {self.run_summary.get_best_epoch()}") + best_epoch = self.run_summary.get_best_epoch(split_type=self.early_stopping_split_type) + self.logger.debug(f" Early stopped model {X['num_run']} on epoch {best_epoch}") # We will stop the training. 
Load the last best performing weights X['network'].load_state_dict(torch.load(best_path)) # Clean the temp dir shutil.rmtree(self.checkpoint_dir) + self.checkpoint_dir = None def early_stop_handler(self, X: Dict[str, Any]) -> bool: """ @@ -394,6 +415,7 @@ def early_stop_handler(self, X: Dict[str, Any]) -> bool: bool: If true, training should be stopped """ assert self.run_summary is not None + assert self.early_stopping_split_type is not None # mypy # Allow to disable early stopping if X['early_stopping'] is None or X['early_stopping'] < 0: @@ -403,7 +425,9 @@ def early_stop_handler(self, X: Dict[str, Any]) -> bool: if self.checkpoint_dir is None: self.checkpoint_dir = tempfile.mkdtemp(dir=X['backend'].temporary_directory) - epochs_since_best = self.run_summary.get_last_epoch() - self.run_summary.get_best_epoch() + last_epoch = self.run_summary.get_last_epoch() + best_epoch = self.run_summary.get_best_epoch(split_type=self.early_stopping_split_type) + epochs_since_best = last_epoch - best_epoch # Save the checkpoint if there is a new best epoch best_path = os.path.join(self.checkpoint_dir, 'best.pth') diff --git a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py index 4909f56ce..4fe94ca4f 100644 --- a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py +++ b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py @@ -119,10 +119,11 @@ def add_performance(self, self.performance_tracker['val_metrics'][epoch] = val_metrics self.performance_tracker['test_metrics'][epoch] = test_metrics - def get_best_epoch(self, loss_type: str = 'val_loss') -> int: - # If we compute validation scores, prefer the performance + def get_best_epoch(self, split_type: str = 'val') -> int: + # If we compute for optimization, prefer the performance # metric to the loss if self.optimize_metric is not None: + metrics_type = f"{split_type}_metrics" scorer = CLASSIFICATION_METRICS[ self.optimize_metric ] if self.optimize_metric in CLASSIFICATION_METRICS else REGRESSION_METRICS[ @@ -131,13 +132,12 @@ def get_best_epoch(self, loss_type: str = 'val_loss') -> int: # Some metrics maximize, other minimize! opt_func = np.argmax if scorer._sign > 0 else np.argmin return int(opt_func( - [self.performance_tracker['val_metrics'][e][self.optimize_metric] - for e in range(1, len(self.performance_tracker['val_metrics']) + 1)] + [metrics[self.optimize_metric] for metrics in self.performance_tracker[metrics_type].values()] )) + 1 # Epochs start at 1 else: + loss_type = f"{split_type}_loss" return int(np.argmin( - [self.performance_tracker[loss_type][e] - for e in range(1, len(self.performance_tracker[loss_type]) + 1)], + list(self.performance_tracker[loss_type].values()), )) + 1 # Epochs start at 1 def get_last_epoch(self) -> int: @@ -179,6 +179,16 @@ def repr_last_epoch(self) -> str: string += '=' * 40 return string + def is_empty(self) -> bool: + """ + Checks if the object is empty or not + + Returns: + bool + """ + # if train_loss is empty, we can be sure that RunSummary is empty. + return not bool(self.performance_tracker['train_loss']) + class BaseTrainerComponent(autoPyTorchTrainingComponent): @@ -277,7 +287,7 @@ def _scheduler_step( def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, writer: Optional[SummaryWriter], - ) -> Tuple[float, Dict[str, float]]: + ) -> Tuple[Optional[float], Dict[str, float]]: """ Train the model for a single epoch. 
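# A standalone sketch of the best-epoch selection that `RunSummary.get_best_epoch(split_type=...)`
# above performs: take the argmax of the tracked per-epoch scores for metrics that are maximized
# (argmin otherwise), with epochs counted from 1. The helper name and the single-scalar-per-epoch
# tracker below are simplifying assumptions, not the library API.
import numpy as np

def best_epoch_sketch(scores_by_epoch, maximize):
    # scores_by_epoch: {epoch (starting at 1): scalar score on the chosen split}
    opt_func = np.argmax if maximize else np.argmin
    return int(opt_func(list(scores_by_epoch.values()))) + 1  # epochs start at 1

# e.g. validation accuracy peaks at epoch 2, so early stopping counts patience from there
assert best_epoch_sketch({1: 0.71, 2: 0.83, 3: 0.80}, maximize=True) == 2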
@@ -317,6 +327,9 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int, epoch * len(train_loader) + step, ) + if N == 0: + return None, {} + self._scheduler_step(step_interval=StepIntervalUnit.epoch, loss=loss_sum / N) if self.metrics_during_training: diff --git a/autoPyTorch/pipeline/create_searchspace_util.py b/autoPyTorch/pipeline/create_searchspace_util.py index f66371917..640a787e2 100644 --- a/autoPyTorch/pipeline/create_searchspace_util.py +++ b/autoPyTorch/pipeline/create_searchspace_util.py @@ -47,7 +47,7 @@ def get_match_array( matches_dimensions = [len(choices) for choices in node_i_choices] # Start by allowing every combination of nodes. Go through all # combinations/pipelines and erase the illegal ones - matches = np.ones(matches_dimensions, dtype=int) + matches = np.ones(matches_dimensions, dtype=np.int32) # TODO: Check if we need this, like are there combinations from the # pipeline we should dynamically avoid? diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py index b95de512e..720d0af64 100644 --- a/autoPyTorch/pipeline/tabular_classification.py +++ b/autoPyTorch/pipeline/tabular_classification.py @@ -19,6 +19,9 @@ from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import ( TabularColumnTransformer ) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import ( + CoalescerChoice +) from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import ( EncoderChoice ) @@ -27,6 +30,8 @@ ) from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. 
\ + VarianceThreshold import VarianceThreshold from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent @@ -307,6 +312,8 @@ def _get_pipeline_steps( steps.extend([ ("imputer", SimpleImputer(random_state=self.random_state)), + ("variance_threshold", VarianceThreshold(random_state=self.random_state)), + ("coalescer", CoalescerChoice(default_dataset_properties, random_state=self.random_state)), ("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)), ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)), ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties, diff --git a/autoPyTorch/pipeline/tabular_regression.py b/autoPyTorch/pipeline/tabular_regression.py index 57d0126d0..06da9cabb 100644 --- a/autoPyTorch/pipeline/tabular_regression.py +++ b/autoPyTorch/pipeline/tabular_regression.py @@ -19,6 +19,9 @@ from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import ( TabularColumnTransformer ) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import ( + CoalescerChoice +) from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import ( EncoderChoice ) @@ -27,6 +30,8 @@ ) from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. 
\ + VarianceThreshold import VarianceThreshold from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent @@ -257,6 +262,8 @@ def _get_pipeline_steps( steps.extend([ ("imputer", SimpleImputer(random_state=self.random_state)), + ("variance_threshold", VarianceThreshold(random_state=self.random_state)), + ("coalescer", CoalescerChoice(default_dataset_properties, random_state=self.random_state)), ("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)), ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)), ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties, diff --git a/autoPyTorch/py.typed b/autoPyTorch/py.typed new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/autoPyTorch/py.typed @@ -0,0 +1 @@ + diff --git a/autoPyTorch/utils/common.py b/autoPyTorch/utils/common.py index 7be8a233c..4e4d4e804 100644 --- a/autoPyTorch/utils/common.py +++ b/autoPyTorch/utils/common.py @@ -1,3 +1,4 @@ +from enum import Enum from typing import Any, Dict, Iterable, List, NamedTuple, Optional, Sequence, Type, Union from ConfigSpace.configuration_space import ConfigurationSpace @@ -13,7 +14,7 @@ import pandas as pd -import scipy.sparse +from scipy.sparse import spmatrix import torch from torch.utils.data.dataloader import default_collate @@ -21,6 +22,11 @@ HyperparameterValueType = Union[int, str, float] +def ispandas(X: Any) -> bool: + """ Whether X is pandas.DataFrame or pandas.Series """ + return hasattr(X, "iloc") + + class FitRequirement(NamedTuple): """ A class that holds inputs required to fit a pipeline. Also indicates whether @@ -75,6 +81,27 @@ def __str__(self) -> str: self.hyperparameter, self.value_range, self.default_value, self.log) +class autoPyTorchEnum(str, Enum): + """ + Utility class for enums in autoPyTorch. + Allows users to use strings, while we internally use + this enum + """ + def __eq__(self, other: Any) -> bool: + if isinstance(other, autoPyTorchEnum): + return type(self) == type(other) and self.value == other.value + elif isinstance(other, str): + return bool(self.value == other) + else: + enum_name = self.__class__.__name__ + raise RuntimeError(f"Unsupported type {type(other)}. 
" + f"{enum_name} only supports `str` and" + f"`{enum_name}`") + + def __hash__(self) -> int: + return hash(self.value) + + def custom_collate_fn(batch: List) -> List[Optional[torch.Tensor]]: """ In the case of not providing a y tensor, in a @@ -146,10 +173,10 @@ def get_device_from_fit_dictionary(X: Dict[str, Any]) -> torch.device: return torch.device(X.get("device", "cpu")) -def subsampler(data: Union[np.ndarray, pd.DataFrame, scipy.sparse.csr_matrix], +def subsampler(data: Union[np.ndarray, pd.DataFrame, spmatrix], x: Union[np.ndarray, List[int]] - ) -> Union[np.ndarray, pd.DataFrame, scipy.sparse.csr_matrix]: - return data[x] if isinstance(data, (np.ndarray, scipy.sparse.csr_matrix)) else data.iloc[x] + ) -> Union[np.ndarray, pd.DataFrame, spmatrix]: + return data[x] if isinstance(data, (np.ndarray, spmatrix)) else data.iloc[x] def get_hyperparameter(hyperparameter: HyperparameterSearchSpace, @@ -214,3 +241,20 @@ def add_hyperparameter(cs: ConfigurationSpace, None """ cs.add_hyperparameter(get_hyperparameter(hyperparameter, hyperparameter_type)) + + +def check_none(p: Any) -> bool: + """ + utility function to check if `p` is None. + + Args: + p (str): + variable to check + + Returns: + bool: + True, if `p` is in (None, "none", "None") + """ + if p in ("None", "none", None): + return True + return False diff --git a/autoPyTorch/utils/implementations.py b/autoPyTorch/utils/implementations.py index a0b020622..4b699e3c3 100644 --- a/autoPyTorch/utils/implementations.py +++ b/autoPyTorch/utils/implementations.py @@ -1,7 +1,11 @@ -from typing import Any, Callable, Dict, Type, Union +from typing import Any, Callable, Dict, List, Optional, Type, Union import numpy as np +from scipy import sparse + +from sklearn.base import BaseEstimator, TransformerMixin + import torch @@ -59,3 +63,124 @@ def __call__(self, y: Union[np.ndarray, torch.Tensor]) -> np.ndarray: @staticmethod def get_properties() -> Dict[str, Any]: return {'supported_losses': ['BCEWithLogitsLoss']} + + +class MinorityCoalesceTransformer(BaseEstimator, TransformerMixin): + """ Group together categories whose occurrence is less than a specified min_frac.""" + def __init__(self, min_frac: Optional[float] = None): + self.min_frac = min_frac + self._categories_to_coalesce: Optional[List[np.ndarray]] = None + + if self.min_frac is not None and (self.min_frac < 0 or self.min_frac > 1): + raise ValueError(f"min_frac for {self.__class__.__name__} must be in [0, 1], but got {min_frac}") + + def _check_dataset(self, X: Union[np.ndarray, sparse.csr_matrix]) -> None: + """ + When transforming datasets, we modify values to: + * 0 for nan values + * -1 for unknown values + * -2 for values to be coalesced + For this reason, we need to check whether datasets have values + smaller than -2 to avoid mis-transformation. + Note that zero-imputation is the default setting in SimpleImputer of sklearn. + + Args: + X (np.ndarray): + The input features from the user, likely transformed by an encoder and imputator. + """ + X_data = X.data if sparse.issparse(X) else X + if np.nanmin(X_data) <= -2: + raise ValueError("The categoricals in input features for MinorityCoalesceTransformer " + "cannot have integers smaller than -2.") + + @staticmethod + def _get_column_data( + X: Union[np.ndarray, sparse.csr_matrix], + col_idx: int, + is_sparse: bool + ) -> Union[np.ndarray, sparse.csr_matrix]: + """ + Args: + X (Union[np.ndarray, sparse.csr_matrix]): + The feature tensor with only categoricals. + col_idx (int): + The index of the column to get the data. 
+ is_sparse (bool): + Whether the tensor is sparse or not. + + Return: + col_data (Union[np.ndarray, sparse.csr_matrix]): + The column data of the tensor. + """ + + if is_sparse: + assert not isinstance(X, np.ndarray) # mypy check + indptr_start = X.indptr[col_idx] + indptr_end = X.indptr[col_idx + 1] + col_data = X.data[indptr_start:indptr_end] + else: + col_data = X[:, col_idx] + + return col_data + + def fit(self, X: Union[np.ndarray, sparse.csr_matrix], + y: Optional[np.ndarray] = None) -> 'MinorityCoalesceTransformer': + """ + Train the estimator to identify low frequency classes on the input train data. + + Args: + X (Union[np.ndarray, sparse.csr_matrix]): + The input features from the user, likely transformed by an encoder and imputator. + y (Optional[np.ndarray]): + Optional labels for the given task, not used by this estimator. + """ + self._check_dataset(X) + n_instances, n_features = X.shape + + if self.min_frac is None: + self._categories_to_coalesce = [np.array([]) for _ in range(n_features)] + return self + + categories_to_coalesce: List[np.ndarray] = [] + is_sparse = sparse.issparse(X) + for col in range(n_features): + col_data = self._get_column_data(X=X, col_idx=col, is_sparse=is_sparse) + unique_vals, counts = np.unique(col_data, return_counts=True) + frac = counts / n_instances + categories_to_coalesce.append(unique_vals[frac < self.min_frac]) + + self._categories_to_coalesce = categories_to_coalesce + return self + + def transform( + self, + X: Union[np.ndarray, sparse.csr_matrix] + ) -> Union[np.ndarray, sparse.csr_matrix]: + """ + Coalesce categories with low frequency in X. + + Args: + X (Union[np.ndarray, sparse.csr_matrix]): + The input features from the user, likely transformed by an encoder and imputator. + """ + self._check_dataset(X) + + if self._categories_to_coalesce is None: + raise RuntimeError("fit() must be called before transform()") + + if self.min_frac is None: + return X + + n_features = X.shape[1] + is_sparse = sparse.issparse(X) + + for col in range(n_features): + # -2 stands coalesced. For more details, see the doc in _check_dataset + col_data = self._get_column_data(X=X, col_idx=col, is_sparse=is_sparse) + mask = np.isin(col_data, self._categories_to_coalesce[col]) + col_data[mask] = -2 + + return X + + def fit_transform(self, X: np.ndarray, y: Optional[np.ndarray] = None) -> np.ndarray: + return self.fit(X, y).transform(X) diff --git a/autoPyTorch/utils/results_manager.py b/autoPyTorch/utils/results_manager.py new file mode 100644 index 000000000..c1860b0f6 --- /dev/null +++ b/autoPyTorch/utils/results_manager.py @@ -0,0 +1,686 @@ +import io +from datetime import datetime +from typing import Any, Dict, List, Tuple, Union + +from ConfigSpace.configuration_space import Configuration + +import numpy as np + +import scipy + +from smac.runhistory.runhistory import RunHistory, RunKey, RunValue +from smac.tae import StatusType +from smac.utils.io.traj_logging import TrajEntry + +from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric + + +# TODO remove StatusType.RUNNING at some point in the future when the new SMAC 0.13.2 +# is the new minimum required version! 
+STATUS_TYPES = [
+    StatusType.SUCCESS,
+    # Success (but did not advance to higher budget such as cutoff by hyperband)
+    StatusType.DONOTADVANCE,
+    StatusType.TIMEOUT,
+    StatusType.CRASHED,
+    StatusType.ABORT,
+    StatusType.MEMOUT
+]
+
+
+def cost2metric(cost: float, metric: autoPyTorchMetric) -> float:
+    """
+    Convert the cost value used by SMAC back to the original metric value.
+
+    The conversion is defined in:
+        autoPyTorch/pipeline/components/training/metrics/utils.py::calculate_loss
+        cost = metric._optimum - metric._sign * original_metric_value
+        ==> original_metric_value = metric._sign * (metric._optimum - cost)
+    """
+    return metric._sign * (metric._optimum - cost)
+
+
+def get_start_time(run_history: RunHistory) -> float:
+    """
+    Get the start time of the optimization.
+
+    Args:
+        run_history (RunHistory):
+            The history of config evals from SMAC.
+
+    Returns:
+        starttime (float):
+            The start time of the first training.
+    """
+
+    start_times = []
+    for run_value in run_history.data.values():
+        if run_value.status in (StatusType.STOP, StatusType.RUNNING):
+            continue
+        elif run_value.status not in STATUS_TYPES:
+            raise ValueError(f'Unexpected run status: {run_value.status}')
+
+        start_times.append(run_value.starttime)
+
+    return float(np.min(start_times))  # mypy redefinition
+
+
+def _extract_metrics_info(
+    run_value: RunValue,
+    scoring_functions: List[autoPyTorchMetric],
+    inference_name: str
+) -> Dict[str, float]:
+    """
+    Extract the metric information given a run_value
+    and a list of metrics of interest.
+
+    Args:
+        run_value (RunValue):
+            The information for each config evaluation.
+        scoring_functions (List[autoPyTorchMetric]):
+            The list of metrics to retrieve the info.
+        inference_name (str):
+            The name of the inference. Either `train`, `opt` or `test`.
+
+    Returns:
+        metric_info (Dict[str, float]):
+            The metric values of interest.
+            Since the metrics in additional_info are `cost`,
+            we transform them into the original form.
+    """
+
+    if run_value.status not in (StatusType.SUCCESS, StatusType.DONOTADVANCE):
+        # Additional info for metrics is not available in this case.
+        return {metric.name: metric._worst_possible_result for metric in scoring_functions}
+
+    inference_choices = ['train', 'opt', 'test']
+    if inference_name not in inference_choices:
+        raise ValueError(f'inference_name must be in {inference_choices}, but got {inference_name}')
+
+    cost_info = run_value.additional_info[f'{inference_name}_loss']
+    avail_metrics = cost_info.keys()
+
+    return {
+        metric.name: cost2metric(cost=cost_info[metric.name], metric=metric)
+        if metric.name in avail_metrics else metric._worst_possible_result
+        for metric in scoring_functions
+    }
+
+
+class EnsembleResults:
+    def __init__(
+        self,
+        metric: autoPyTorchMetric,
+        ensemble_performance_history: List[Dict[str, Any]],
+        order_by_endtime: bool = False
+    ):
+        """
+        The wrapper class for ensemble_performance_history.
+        This class extracts the information from ensemble_performance_history
+        and allows other classes to easily handle the history.
+
+        Attributes:
+            train_scores (List[float]):
+                The ensemble scores on the training dataset.
+            test_scores (List[float]):
+                The ensemble scores on the test dataset.
+            end_times (List[float]):
+                The end time of each ensemble evaluation.
+                Each element is a float timestamp.
+            empty (bool):
+                Whether the ensemble history about `self.metric` is empty or not.
+            metric (autoPyTorchMetric):
+                The information about the metric to contain.
+ In the case when such a metric does not exist in the record, + This class raises KeyError. + """ + self._test_scores: List[float] = [] + self._train_scores: List[float] = [] + self._end_times: List[float] = [] + self._metric = metric + self._empty = True # Initial state is empty. + self._instantiated = False + + self._extract_results_from_ensemble_performance_history(ensemble_performance_history) + if order_by_endtime: + self._sort_by_endtime() + + self._instantiated = True + + @property + def train_scores(self) -> np.ndarray: + return np.asarray(self._train_scores) + + @property + def test_scores(self) -> np.ndarray: + return np.asarray(self._test_scores) + + @property + def end_times(self) -> np.ndarray: + return np.asarray(self._end_times) + + @property + def metric_name(self) -> str: + return self._metric.name + + def empty(self) -> bool: + """ This is not property to follow coding conventions. """ + return self._empty + + def _update(self, data: Dict[str, Any]) -> None: + if self._instantiated: + raise RuntimeError( + 'EnsembleResults should not be overwritten once instantiated. ' + 'Instantiate new object rather than using update.' + ) + + self._train_scores.append(data[f'train_{self.metric_name}']) + self._test_scores.append(data[f'test_{self.metric_name}']) + self._end_times.append(datetime.timestamp(data['Timestamp'])) + + def _sort_by_endtime(self) -> None: + """ + Since the default order is by start time + and parallel computation might change the order of ending, + this method provides the feature to sort by end time. + Note that this method is destructive. + """ + if self._instantiated: + raise RuntimeError( + 'EnsembleResults should not be overwritten once instantiated. ' + 'Instantiate new object with order_by_endtime=True.' + ) + + order = np.argsort(self._end_times) + + self._train_scores = self.train_scores[order].tolist() + self._test_scores = self.test_scores[order].tolist() + self._end_times = self.end_times[order].tolist() + + def _extract_results_from_ensemble_performance_history( + self, + ensemble_performance_history: List[Dict[str, Any]] + ) -> None: + """ + Extract information to from `ensemble_performance_history` + to match the format of this class format. + + Args: + ensemble_performance_history (List[Dict[str, Any]]): + The history of the ensemble performance from EnsembleBuilder. + Its key must be either `train_xxx`, `test_xxx` or `Timestamp`. + """ + + if ( + len(ensemble_performance_history) == 0 + or f'train_{self.metric_name}' not in ensemble_performance_history[0].keys() + ): + self._empty = True + return + + self._empty = False # We can extract ==> not empty + for data in ensemble_performance_history: + self._update(data) + + +class SearchResults: + def __init__( + self, + metric: autoPyTorchMetric, + scoring_functions: List[autoPyTorchMetric], + run_history: RunHistory, + order_by_endtime: bool = False + ): + """ + The wrapper class for run_history. + This class extracts the information from run_history + and allows other class to easily handle the history. + Note that the data is sorted by starttime by default and + metric_dict has the original form of metric value, i.e. not necessarily cost. + + Attributes: + train_metric_dict (Dict[str, List[float]]): + The extracted train metric information at each evaluation. + Each list keeps the metric information specified by scoring_functions and metric. + opt_metric_dict (Dict[str, List[float]]): + The extracted opt metric information at each evaluation. 
+ Each list keeps the metric information specified by scoring_functions and metric. + test_metric_dict (Dict[str, List[float]]): + The extracted test metric information at each evaluation. + Each list keeps the metric information specified by scoring_functions and metric. + fit_times (List[float]): + The time needed to fit each model. + end_times (List[float]): + The end time of the end of each evaluation. + Each element is a float timestamp. + configs (List[Configuration]): + The configurations at each evaluation. + status_types (List[StatusType]): + The list of status types of each evaluation (e.g. success, crush). + budgets (List[float]): + The budgets used for each evaluation. + Here, budget refers to the definition in Hyperband or Successive halving. + config_ids (List[int]): + The ID of each configuration. Since we use cutoff such as in Hyperband, + we need to store it to know whether each configuration is a suvivor. + is_traditionals (List[bool]): + Whether each configuration is from traditional machine learning methods. + additional_infos (List[Dict[str, float]]): + It usually serves as the source of each metric at each evaluation. + In other words, train or test performance is extracted from this info. + rank_opt_scores (np.ndarray): + The rank of each evaluation among all the evaluations. + metric (autoPyTorchMetric): + The metric of the main interest. + scoring_functions (List[autoPyTorchMetric]): + The list of metrics to contain in the additional_infos. + """ + if metric not in scoring_functions: + scoring_functions.append(metric) + + self.train_metric_dict: Dict[str, List[float]] = {metric.name: [] for metric in scoring_functions} + self.opt_metric_dict: Dict[str, List[float]] = {metric.name: [] for metric in scoring_functions} + self.test_metric_dict: Dict[str, List[float]] = {metric.name: [] for metric in scoring_functions} + + self._fit_times: List[float] = [] + self._end_times: List[float] = [] + self.configs: List[Configuration] = [] + self.status_types: List[StatusType] = [] + self.budgets: List[float] = [] + self.config_ids: List[int] = [] + self.is_traditionals: List[bool] = [] + self.additional_infos: List[Dict[str, float]] = [] + self.rank_opt_scores: np.ndarray = np.array([]) + self._scoring_functions = scoring_functions + self._metric = metric + self._instantiated = False + + self._extract_results_from_run_history(run_history) + if order_by_endtime: + self._sort_by_endtime() + + self._instantiated = True + + @property + def train_scores(self) -> np.ndarray: + """ training metric values at each evaluation """ + return np.asarray(self.train_metric_dict[self.metric_name]) + + @property + def opt_scores(self) -> np.ndarray: + """ validation metric values at each evaluation """ + return np.asarray(self.opt_metric_dict[self.metric_name]) + + @property + def test_scores(self) -> np.ndarray: + """ test metric values at each evaluation """ + return np.asarray(self.test_metric_dict[self.metric_name]) + + @property + def fit_times(self) -> np.ndarray: + return np.asarray(self._fit_times) + + @property + def end_times(self) -> np.ndarray: + return np.asarray(self._end_times) + + @property + def metric_name(self) -> str: + return self._metric.name + + def _update( + self, + config: Configuration, + run_key: RunKey, + run_value: RunValue + ) -> None: + + if self._instantiated: + raise RuntimeError( + 'SearchResults should not be overwritten once instantiated. ' + 'Instantiate new object rather than using update.' 
+ ) + elif run_value.status in (StatusType.STOP, StatusType.RUNNING): + return + elif run_value.status not in STATUS_TYPES: + raise ValueError(f'Unexpected run status: {run_value.status}') + + is_traditional = False # If run is not successful, unsure ==> not True ==> False + if run_value.additional_info is not None: + is_traditional = run_value.additional_info['configuration_origin'] == 'traditional' + + self.status_types.append(run_value.status) + self.configs.append(config) + self.budgets.append(run_key.budget) + self.config_ids.append(run_key.config_id) + self.is_traditionals.append(is_traditional) + self.additional_infos.append(run_value.additional_info) + self._fit_times.append(run_value.time) + self._end_times.append(run_value.endtime) + + for inference_name in ['train', 'opt', 'test']: + metric_info = _extract_metrics_info( + run_value=run_value, + scoring_functions=self._scoring_functions, + inference_name=inference_name + ) + for metric_name, val in metric_info.items(): + getattr(self, f'{inference_name}_metric_dict')[metric_name].append(val) + + def _sort_by_endtime(self) -> None: + """ + Since the default order is by start time + and parallel computation might change the order of ending, + this method provides the feature to sort by end time. + Note that this method is destructive. + """ + if self._instantiated: + raise RuntimeError( + 'SearchResults should not be overwritten once instantiated. ' + 'Instantiate new object with order_by_endtime=True.' + ) + + order = np.argsort(self._end_times) + + self.train_metric_dict = {name: [arr[idx] for idx in order] for name, arr in self.train_metric_dict.items()} + self.opt_metric_dict = {name: [arr[idx] for idx in order] for name, arr in self.opt_metric_dict.items()} + self.test_metric_dict = {name: [arr[idx] for idx in order] for name, arr in self.test_metric_dict.items()} + + self._fit_times = [self._fit_times[idx] for idx in order] + self._end_times = [self._end_times[idx] for idx in order] + self.status_types = [self.status_types[idx] for idx in order] + self.budgets = [self.budgets[idx] for idx in order] + self.config_ids = [self.config_ids[idx] for idx in order] + self.is_traditionals = [self.is_traditionals[idx] for idx in order] + self.additional_infos = [self.additional_infos[idx] for idx in order] + + # Don't use numpy slicing to avoid version dependency (cast config to object might cause issues) + self.configs = [self.configs[idx] for idx in order] + + # Only rank_opt_scores is np.ndarray + self.rank_opt_scores = self.rank_opt_scores[order] + + def _extract_results_from_run_history(self, run_history: RunHistory) -> None: + """ + Extract the information to match this class format. + + Args: + run_history (RunHistory): + The history of config evals from SMAC. + """ + + for run_key, run_value in run_history.data.items(): + config = run_history.ids_config[run_key.config_id] + self._update(config=config, run_key=run_key, run_value=run_value) + + self.rank_opt_scores = scipy.stats.rankdata( + -1 * self._metric._sign * self.opt_scores, # rank order + method='min' + ) + + +class MetricResults: + def __init__( + self, + metric: autoPyTorchMetric, + run_history: RunHistory, + ensemble_performance_history: List[Dict[str, Any]] + ): + """ + The wrapper class for ensemble_performance_history. + This class extracts the information from ensemble_performance_history + and allows other class to easily handle the history. + Note that all the data is sorted by endtime! 
+ + Attributes: + start_time (float): + The timestamp at the very beginning of the optimization. + cum_times (np.ndarray): + The runtime needed to reach the end of each evaluation. + The time unit is second. + metric (autoPyTorchMetric): + The information about the metric to contain. + search_results (SearchResults): + The instance to fetch the metric values of `self.metric` + from run_history. + ensemble_results (EnsembleResults): + The instance to fetch the metric values of `self.metric` + from ensemble_performance_history. + If there is no information available, self.empty() returns True. + data (Dict[str, np.ndarray]): + Keys are `{single, ensemble}::{train, opt, test}::{metric.name}`. + Each array contains the evaluated values for the corresponding category. + """ + self.start_time = get_start_time(run_history) + self.metric = metric + self.search_results = SearchResults( + metric=metric, + run_history=run_history, + scoring_functions=[], + order_by_endtime=True + ) + self.ensemble_results = EnsembleResults( + metric=metric, + ensemble_performance_history=ensemble_performance_history, + order_by_endtime=True + ) + + if ( + not self.ensemble_results.empty() + and self.search_results.end_times[-1] < self.ensemble_results.end_times[-1] + ): + # Augment runtime table with the final available end time + self.cum_times = np.hstack( + [self.search_results.end_times - self.start_time, + [self.ensemble_results.end_times[-1] - self.start_time]] + ) + else: + self.cum_times = self.search_results.end_times - self.start_time + + self.data: Dict[str, np.ndarray] = {} + self._extract_results() + + def _extract_results(self) -> None: + """ Extract metric values of `self.metric` and store them in `self.data`. """ + metric_name = self.metric.name + for inference_name in ['train', 'test', 'opt']: + # TODO: Extract information from self.search_results + data = getattr(self.search_results, f'{inference_name}_metric_dict')[metric_name] + self.data[f'single::{inference_name}::{metric_name}'] = np.array(data) + + if self.ensemble_results.empty() or inference_name == 'opt': + continue + + data = getattr(self.ensemble_results, f'{inference_name}_scores') + self.data[f'ensemble::{inference_name}::{metric_name}'] = np.array(data) + + def get_ensemble_merged_data(self) -> Dict[str, np.ndarray]: + """ + Merge the ensemble performance data to the closest time step + available in the run_history. + One performance metric will be allocated to one time step. + Other time steps will be filled by the worst possible value. 
+ + Returns: + data (Dict[str, np.ndarray]): + Merged data as mentioned above + """ + + data = {k: v.copy() for k, v in self.data.items()} # deep copy + + if self.ensemble_results.empty(): # no ensemble data available + return data + + train_scores, test_scores = self.ensemble_results.train_scores, self.ensemble_results.test_scores + end_times = self.ensemble_results.end_times + cur, timestep_size, sign = 0, self.cum_times.size, self.metric._sign + key_train, key_test = f'ensemble::train::{self.metric.name}', f'ensemble::test::{self.metric.name}' + + train_perfs = np.full_like(self.cum_times, self.metric._worst_possible_result) + test_perfs = np.full_like(self.cum_times, self.metric._worst_possible_result) + + for timestamp, train_score, test_score in zip(end_times, train_scores, test_scores): + avail_time = timestamp - self.start_time + while cur < timestep_size and self.cum_times[cur] < avail_time: + # Guarantee that cum_times[cur] >= avail_time + cur += 1 + + # results[cur] is the closest available checkpoint after or at the avail_time + # ==> Assign this data to that checkpoint + time_index = min(cur, timestep_size - 1) + # If there already exists a previous allocated value, update by a better value + train_perfs[time_index] = sign * max(sign * train_perfs[time_index], sign * train_score) + test_perfs[time_index] = sign * max(sign * test_perfs[time_index], sign * test_score) + + data.update({key_train: train_perfs, key_test: test_perfs}) + return data + + +class ResultsManager: + def __init__(self, *args: Any, **kwargs: Any): + """ + This module is used to gather result information for BaseTask. + In other words, this module is supposed to be wrapped by BaseTask. + + Attributes: + run_history (RunHistory): + A `SMAC Runshistory `_ + object that holds information about the runs of the target algorithm made during search + ensemble_performance_history (List[Dict[str, Any]]): + The history of the ensemble performance from EnsembleBuilder. + Its keys are `train_xxx`, `test_xxx` or `Timestamp`. + trajectory (List[TrajEntry]): + A list of all incumbent configurations during search + """ + self.run_history: RunHistory = RunHistory() + self.ensemble_performance_history: List[Dict[str, Any]] = [] + self.trajectory: List[TrajEntry] = [] + + def _check_run_history(self) -> None: + if self.run_history is None: + raise RuntimeError("No Run History found, search has not been called.") + + if self.run_history.empty(): + raise RuntimeError("Run History is empty. Something went wrong, " + "SMAC was not able to fit any model?") + + def get_incumbent_results( + self, + metric: autoPyTorchMetric, + include_traditional: bool = False + ) -> Tuple[Configuration, Dict[str, Union[int, str, float]]]: + """ + Get Incumbent config and the corresponding results + + Args: + metric (autoPyTorchMetric): + A metric that is evaluated when searching with fit AutoPytorch. + include_traditional (bool): + Whether to include results from tradtional pipelines + + Returns: + Configuration (CS.ConfigurationSpace): + The incumbent configuration + Dict[str, Union[int, str, float]]: + Additional information about the run of the incumbent configuration. 
+ """ + self._check_run_history() + + results = SearchResults(metric=metric, scoring_functions=[], run_history=self.run_history) + + if not include_traditional: + non_traditional = ~np.array(results.is_traditionals) + scores = results.opt_scores[non_traditional] + indices = np.arange(len(results.configs))[non_traditional] + else: + scores = results.opt_scores + indices = np.arange(len(results.configs)) + + incumbent_idx = indices[np.argmax(metric._sign * scores)] + incumbent_config = results.configs[incumbent_idx] + incumbent_results = results.additional_infos[incumbent_idx] + + assert incumbent_results is not None # mypy check + return incumbent_config, incumbent_results + + def get_search_results( + self, + scoring_functions: List[autoPyTorchMetric], + metric: autoPyTorchMetric + ) -> SearchResults: + """ + This attribute is populated with data from `self.run_history` + and contains information about the configurations, and their + corresponding metric results, status of run, parameters and + the budget + + Args: + scoring_functions (List[autoPyTorchMetric]): + Metrics to show in the results. + metric (autoPyTorchMetric): + A metric that is evaluated when searching with fit AutoPytorch. + + Returns: + SearchResults: + An instance that contains the results from search + """ + self._check_run_history() + return SearchResults(metric=metric, scoring_functions=scoring_functions, run_history=self.run_history) + + def sprint_statistics( + self, + dataset_name: str, + scoring_functions: List[autoPyTorchMetric], + metric: autoPyTorchMetric + ) -> str: + """ + Prints statistics about the SMAC search. + + These statistics include: + + 1. Optimisation Metric + 2. Best Optimisation score achieved by individual pipelines + 3. Total number of target algorithm runs + 4. Total number of successful target algorithm runs + 5. Total number of crashed target algorithm runs + 6. Total number of target algorithm runs that exceeded the time limit + 7. Total number of successful target algorithm runs that exceeded the memory limit + + Args: + dataset_name (str): + The dataset name that was used in the run. + scoring_functions (List[autoPyTorchMetric]): + Metrics to show in the results. + metric (autoPyTorchMetric): + A metric that is evaluated when searching with fit AutoPytorch. 
+ + Returns: + (str): + Formatted string with statistics + """ + search_results = self.get_search_results(scoring_functions, metric) + success_status = (StatusType.SUCCESS, StatusType.DONOTADVANCE) + sio = io.StringIO() + sio.write("autoPyTorch results:\n") + sio.write(f"\tDataset name: {dataset_name}\n") + sio.write(f"\tOptimisation Metric: {metric}\n") + + num_runs = len(search_results.status_types) + num_success = sum([s in success_status for s in search_results.status_types]) + num_crash = sum([s == StatusType.CRASHED for s in search_results.status_types]) + num_timeout = sum([s == StatusType.TIMEOUT for s in search_results.status_types]) + num_memout = sum([s == StatusType.MEMOUT for s in search_results.status_types]) + + if num_success > 0: + best_score = metric._sign * np.max(metric._sign * search_results.opt_scores) + sio.write(f"\tBest validation score: {best_score}\n") + + sio.write(f"\tNumber of target algorithm runs: {num_runs}\n") + sio.write(f"\tNumber of successful target algorithm runs: {num_success}\n") + sio.write(f"\tNumber of crashed target algorithm runs: {num_crash}\n") + sio.write(f"\tNumber of target algorithms that exceeded the time " + f"limit: {num_timeout}\n") + sio.write(f"\tNumber of target algorithms that exceeded the memory " + f"limit: {num_memout}\n") + + return sio.getvalue() diff --git a/autoPyTorch/utils/results_visualizer.py b/autoPyTorch/utils/results_visualizer.py new file mode 100644 index 000000000..e1debe29c --- /dev/null +++ b/autoPyTorch/utils/results_visualizer.py @@ -0,0 +1,334 @@ +from dataclasses import dataclass +from enum import Enum +from typing import Any, Dict, NamedTuple, Optional, Tuple + +import matplotlib.pyplot as plt + +import numpy as np + +from autoPyTorch.utils.results_manager import MetricResults + + +plt.rcParams["font.family"] = "Times New Roman" +plt.rcParams["font.size"] = 18 + + +@dataclass(frozen=True) +class ColorLabelSettings: + """ + The settings for each plot. + If None is provided, those plots are omitted. + + Attributes: + single_train (Optional[Tuple[Optional[str], Optional[str]]]): + The setting for the plot of the optimal single train result. + single_opt (Optional[Tuple[Optional[str], Optional[str]]]): + The setting for the plot of the optimal single result used in optimization. + single_test (Optional[Tuple[Optional[str], Optional[str]]]): + The setting for the plot of the optimal single test result. + ensemble_train (Optional[Tuple[Optional[str], Optional[str]]]): + The setting for the plot of the optimal ensemble train result. + ensemble_test (Optional[Tuple[Optional[str], Optional[str]]]): + The setting for the plot of the optimal ensemble test result. + """ + single_train: Optional[Tuple[Optional[str], Optional[str]]] = ('red', None) + single_opt: Optional[Tuple[Optional[str], Optional[str]]] = ('blue', None) + single_test: Optional[Tuple[Optional[str], Optional[str]]] = ('green', None) + ensemble_train: Optional[Tuple[Optional[str], Optional[str]]] = ('brown', None) + ensemble_test: Optional[Tuple[Optional[str], Optional[str]]] = ('purple', None) + + def extract_dicts( + self, + results: MetricResults + ) -> Tuple[Dict[str, Optional[str]], Dict[str, Optional[str]]]: + """ + Args: + results (MetricResults): + The results of the optimization in the base task API. + It determines what keys to include. + + Returns: + colors, labels (Tuple[Dict[str, Optional[str]], Dict[str, Optional[str]]]): + The dicts for colors and labels. 
+ The keys are determined by results and each label and color + are determined by each instantiation. + Note that the keys include the metric name. + """ + + colors, labels = {}, {} + + for key, color_label in vars(self).items(): + if color_label is None: + continue + + prefix = '::'.join(key.split('_')) + try: + new_key = [key for key in results.data.keys() if key.startswith(prefix)][0] + colors[new_key], labels[new_key] = color_label + except IndexError: # ensemble does not always have results + pass + + return colors, labels + + +class PlotSettingParams(NamedTuple): + """ + Parameters for the plot environment. + + Attributes: + n_points (int): + The number of points to plot. + xlabel (Optional[str]): + The label in the x axis. + ylabel (Optional[str]): + The label in the y axis. + xscale (str): + The scale of x axis. + yscale (str): + The scale of y axis. + title (Optional[str]): + The title of the subfigure. + xlim (Tuple[float, float]): + The range of x axis. + ylim (Tuple[float, float]): + The range of y axis. + grid (bool): + Whether to have grid lines. + If users would like to define lines in detail, + they need to deactivate it. + legend (bool): + Whether to have legend in the figure. + legend_kwargs (Dict[str, Any]): + The kwargs for ax.legend. + Ref: https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.legend.html + title (Optional[str]): + The title of the figure. + title_kwargs (Dict[str, Any]): + The kwargs for ax.set_title except title label. + Ref: https://matplotlib.org/3.1.1/api/_as_gen/matplotlib.axes.Axes.set_title.html + show (bool): + Whether to show the plot. + If figname is not None, the save will be prioritized. + figname (Optional[str]): + Name of a figure to save. If None, no figure will be saved. + savefig_kwargs (Dict[str, Any]): + The kwargs for plt.savefig except filename. + Ref: https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.savefig.html + args, kwargs (Any): + Arguments for the ax.plot. + """ + n_points: int = 20 + xscale: str = 'linear' + yscale: str = 'linear' + xlabel: Optional[str] = None + ylabel: Optional[str] = None + title: Optional[str] = None + title_kwargs: Dict[str, Any] = {} + xlim: Optional[Tuple[float, float]] = None + ylim: Optional[Tuple[float, float]] = None + grid: bool = True + legend: bool = True + legend_kwargs: Dict[str, Any] = {} + show: bool = False + figname: Optional[str] = None + figsize: Optional[Tuple[int, int]] = None + savefig_kwargs: Dict[str, Any] = {} + + +class ScaleChoices(Enum): + linear = 'linear' + log = 'log' + + +def _get_perf_and_time( + cum_results: np.ndarray, + cum_times: np.ndarray, + plot_setting_params: PlotSettingParams, + worst_val: float +) -> Tuple[np.ndarray, np.ndarray]: + """ + Get the performance and time step to plot. + + Args: + cum_results (np.ndarray): + The cumulated performance per evaluation. + cum_times (np.ndarray): + The cumulated runtime at the end of each evaluation. + plot_setting_params (PlotSettingParams): + Parameters for the plot. + worst_val (float): + The worst possible value given a metric. + + Returns: + check_points (np.ndarray): + The time in second where the plot will happen. + perf_by_time_step (np.ndarray): + The best performance at the corresponding time in second + where the plot will happen. 
+ """ + + scale_choices = [s.name for s in ScaleChoices] + if plot_setting_params.xscale not in scale_choices or plot_setting_params.yscale not in scale_choices: + raise ValueError(f'xscale and yscale must be in {scale_choices}, ' + f'but got xscale={plot_setting_params.xscale}, yscale={plot_setting_params.yscale}') + + n_evals, runtime_lb, runtime_ub = cum_results.size, cum_times[0], cum_times[-1] + + if plot_setting_params.xscale == 'log': + # Take the even time interval in the log scale and revert + check_points = np.exp(np.linspace(np.log(runtime_lb), np.log(runtime_ub), plot_setting_params.n_points)) + else: + check_points = np.linspace(runtime_lb, runtime_ub, plot_setting_params.n_points) + + check_points += 1e-8 # Prevent float error + + # The worst possible value is always at the head + perf_by_time_step = np.full_like(check_points, worst_val) + cur = 0 + + for i, check_point in enumerate(check_points): + while cur < n_evals and cum_times[cur] <= check_point: + # Guarantee that cum_times[cur] > check_point + # ==> cum_times[cur - 1] <= check_point + cur += 1 + if cur: # filter cur - 1 == -1 + # results[cur - 1] was obtained before or at the checkpoint + # ==> The best performance up to this checkpoint + perf_by_time_step[i] = cum_results[cur - 1] + + if plot_setting_params.yscale == 'log' and np.any(perf_by_time_step < 0): + raise ValueError('log scale is not available when performance metric can be negative.') + + return check_points, perf_by_time_step + + +class ResultsVisualizer: + @staticmethod + def _set_plot_args( + ax: plt.Axes, + plot_setting_params: PlotSettingParams + ) -> None: + if plot_setting_params.xlim is not None: + ax.set_xlim(*plot_setting_params.xlim) + if plot_setting_params.ylim is not None: + ax.set_ylim(*plot_setting_params.ylim) + + if plot_setting_params.xlabel is not None: + ax.set_xlabel(plot_setting_params.xlabel) + if plot_setting_params.ylabel is not None: + ax.set_ylabel(plot_setting_params.ylabel) + + ax.set_xscale(plot_setting_params.xscale) + ax.set_yscale(plot_setting_params.yscale) + + if plot_setting_params.grid: + if plot_setting_params.xscale == 'log' or plot_setting_params.yscale == 'log': + ax.grid(True, which='minor', color='gray', linestyle=':') + + ax.grid(True, which='major', color='black') + + if plot_setting_params.legend: + ax.legend(**plot_setting_params.legend_kwargs) + + if plot_setting_params.title is not None: + ax.set_title(plot_setting_params.title, **plot_setting_params.title_kwargs) + + if plot_setting_params.figname is not None: + plt.savefig(plot_setting_params.figname, **plot_setting_params.savefig_kwargs) + elif plot_setting_params.show: + plt.show() + + @staticmethod + def _plot_individual_perf_over_time( + ax: plt.Axes, + cum_times: np.ndarray, + cum_results: np.ndarray, + worst_val: float, + plot_setting_params: PlotSettingParams, + label: Optional[str] = None, + color: Optional[str] = None, + *args: Any, + **kwargs: Any + ) -> None: + """ + Plot the incumbent performance of the AutoPytorch over time. + This method is created to make plot_perf_over_time more readable + and it is not supposed to be used only in this class, but not from outside. + + Args: + ax (plt.Axes): + axis to plot (subplots of matplotlib). + cum_times (np.ndarray): + The cumulated time until each end of config evaluation. + results (np.ndarray): + The cumulated performance per evaluation. + worst_val (float): + The worst possible value given a metric. + plot_setting_params (PlotSettingParams): + Parameters for the plot. 
+ label (Optional[str]): + The name of the plot. + color (Optional[str]): + Color of the plot. + args, kwargs (Any): + Arguments for the ax.plot. + """ + check_points, perf_by_time_step = _get_perf_and_time( + cum_results=cum_results, + cum_times=cum_times, + plot_setting_params=plot_setting_params, + worst_val=worst_val + ) + + ax.plot(check_points, perf_by_time_step, color=color, label=label, *args, **kwargs) + + def plot_perf_over_time( + self, + results: MetricResults, + plot_setting_params: PlotSettingParams, + colors: Dict[str, Optional[str]], + labels: Dict[str, Optional[str]], + ax: Optional[plt.Axes] = None, + *args: Any, + **kwargs: Any + ) -> None: + """ + Plot the incumbent performance of the AutoPytorch over time. + + Args: + results (MetricResults): + The module that handles results from various sources. + plot_setting_params (PlotSettingParams): + Parameters for the plot. + labels (Dict[str, Optional[str]]): + The name of the plot. + colors (Dict[str, Optional[str]]): + Color of the plot. + ax (Optional[plt.Axes]): + axis to plot (subplots of matplotlib). + If None, it will be created automatically. + args, kwargs (Any): + Arguments for the ax.plot. + """ + if ax is None: + _, ax = plt.subplots(nrows=1, ncols=1) + + data = results.get_ensemble_merged_data() + cum_times = results.cum_times + minimize = (results.metric._sign == -1) + + for key in data.keys(): + _label, _color, _perfs = labels[key], colors[key], data[key] + # Take the best results over time + _cum_perfs = np.minimum.accumulate(_perfs) if minimize else np.maximum.accumulate(_perfs) + + self._plot_individual_perf_over_time( # type: ignore + ax=ax, cum_results=_cum_perfs, cum_times=cum_times, + plot_setting_params=plot_setting_params, + worst_val=results.metric._worst_possible_result, + label=_label if _label is not None else ' '.join(key.split('::')), + color=_color, + *args, **kwargs + ) + + self._set_plot_args(ax=ax, plot_setting_params=plot_setting_params) diff --git a/autoPyTorch/utils/single_thread_client.py b/autoPyTorch/utils/single_thread_client.py index 9bb0fe3eb..30fd05b94 100644 --- a/autoPyTorch/utils/single_thread_client.py +++ b/autoPyTorch/utils/single_thread_client.py @@ -61,8 +61,24 @@ def submit( func: Callable, *args: List, priority: int = 0, + key: Any = None, + workers: Any = None, + resources: Any = None, + retries: Any = None, + fifo_timeout: Any = "100 ms", + allow_other_workers: Any = False, + actor: Any = False, + actors: Any = False, + pure: Any = None, **kwargs: Any, ) -> Any: + """ + Note + ---- + The keyword arguments caught in `dask.distributed.Client` need to + be specified here so they don't get passed in as ``**kwargs`` to the + ``func``. + """ return DummyFuture(func(*args, **kwargs)) def close(self) -> None: diff --git a/docs/installation.rst b/docs/installation.rst index c9f236d14..10d0bbcba 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -46,5 +46,31 @@ Manual Installation Docker Image -========================= - TODO +============ +A Docker image is also provided on dockerhub. To download from dockerhub, +use: + +.. code:: bash + + docker pull automlorg/autopytorch:master + +You can also verify that the image was downloaded via: + +.. code:: bash + + docker images # Verify that the image was downloaded + +This image can be used to start an interactive session as follows: + +.. code:: bash + + docker run -it automlorg/autopytorch:master + +To start a Jupyter notebook, you could instead run e.g.: + +.. 
code:: bash

+    docker run -it -v ${PWD}:/opt/nb -p 8888:8888 automlorg/autopytorch:master /bin/bash -c "mkdir -p /opt/nb && jupyter notebook --notebook-dir=/opt/nb --ip='0.0.0.0' --port=8888 --no-browser --allow-root"
+
+Alternatively, it is possible to use the development version of autoPyTorch by replacing all
+occurrences of ``master`` with ``development``.
diff --git a/examples/40_advanced/example_plot_over_time.py b/examples/40_advanced/example_plot_over_time.py
new file mode 100644
index 000000000..cf672fc46
--- /dev/null
+++ b/examples/40_advanced/example_plot_over_time.py
@@ -0,0 +1,81 @@
+"""
+==============================
+Plot the Performance over Time
+==============================
+
+Auto-PyTorch uses SMAC to fit individual machine learning algorithms
+and then ensembles them together using `Ensemble Selection
+`_.
+
+The following example shows how to plot both the performance
+of the individual models and that of their ensemble.
+
+Additionally, since the plotting is built on matplotlib,
+you can pass any args or kwargs that ax.plot accepts.
+If you would like to create a multipanel visualization,
+pass a plt.Axes object obtained from matplotlib.pyplot.subplots.
+
+"""
+import warnings
+
+import numpy as np
+import pandas as pd
+
+from sklearn import model_selection
+
+import matplotlib.pyplot as plt
+
+from autoPyTorch.api.tabular_classification import TabularClassificationTask
+from autoPyTorch.utils.results_visualizer import PlotSettingParams
+
+
+warnings.simplefilter(action='ignore', category=UserWarning)
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+
+############################################################################
+# Task Definition
+# ===============
+n_samples, dim = 100, 2
+X = np.random.random((n_samples, dim)) * 2 - 1
+y = ((X ** 2).sum(axis=-1) < 2 / np.pi).astype(np.int32)
+print(y)
+
+X, y = pd.DataFrame(X), pd.DataFrame(y)
+X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y)
+
+############################################################################
+# API Instantiation and Searching
+# ===============================
+api = TabularClassificationTask(seed=42)
+
+api.search(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
+           optimize_metric='accuracy', total_walltime_limit=120, func_eval_time_limit_secs=10)
+
+############################################################################
+# Create Setting Parameters Object
+# ================================
+metric_name = 'accuracy'
+
+params = PlotSettingParams(
+    xscale='log',
+    xlabel='Runtime',
+    ylabel='Accuracy',
+    title='Toy Example',
+    figname='example_plot_over_time.png',
+    savefig_kwargs={'bbox_inches': 'tight'},
+    show=False  # To show the plot instead, set this to True and figname=None
+)
+
+############################################################################
+# Plot with the Specified Setting Parameters
+# ==========================================
+# _, ax = plt.subplots() <=== You can feed it to post-process the figure.
+
+# You might need to run `export DISPLAY=:0.0` if you are using a non-GUI environment.
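+
+# A possible multipanel variant, sketched here as a commented-out snippet. It assumes that
+# `ax` is forwarded to the underlying visualizer, as the commented line above suggests:
+#
+#     _, axes = plt.subplots(nrows=1, ncols=2)
+#     api.plot_perf_over_time(metric_name=metric_name, plot_setting_params=params, ax=axes[0])
+#     # ... draw something else on axes[1], then show or save the figure yourself.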
+api.plot_perf_over_time(
+    metric_name=metric_name,
+    plot_setting_params=params,
+    marker='*',
+    markersize=10
+)
diff --git a/examples/40_advanced/example_resampling_strategy.py b/examples/40_advanced/example_resampling_strategy.py
index d02859f1b..852375589 100644
--- a/examples/40_advanced/example_resampling_strategy.py
+++ b/examples/40_advanced/example_resampling_strategy.py
@@ -93,7 +93,7 @@
 ############################################################################
 # Search for an ensemble of machine learning algorithms
-# -----------------------------------------------------------------------
+# -----------------------------------------------------
 api.search(
     X_train=X_train,
@@ -107,7 +107,7 @@
 ############################################################################
 # Print the final ensemble performance
-# ------------
+# ------------------------------------
 y_pred = api.predict(X_test)
 score = api.score(y_pred, y_test)
 print(score)
diff --git a/examples/40_advanced/example_single_configuration.py b/examples/40_advanced/example_single_configuration.py
new file mode 100644
index 000000000..7f87c6de3
--- /dev/null
+++ b/examples/40_advanced/example_single_configuration.py
@@ -0,0 +1,81 @@
+# -*- encoding: utf-8 -*-
+"""
+==========================
+Fit a single configuration
+==========================
+*Auto-PyTorch* searches for the best combination of machine learning algorithms
+and their hyper-parameter configuration for a given task.
+This example shows how one can fit one of these pipelines, both with a user-defined
+configuration and with a randomly sampled one from the configuration space.
+The pipelines that Auto-PyTorch fits are compatible with the Scikit-Learn API. You can
+get further documentation about Scikit-Learn models here: _
+"""
+import os
+import tempfile as tmp
+import warnings
+
+os.environ['JOBLIB_TEMP_FOLDER'] = tmp.gettempdir()
+os.environ['OMP_NUM_THREADS'] = '1'
+os.environ['OPENBLAS_NUM_THREADS'] = '1'
+os.environ['MKL_NUM_THREADS'] = '1'
+
+warnings.simplefilter(action='ignore', category=UserWarning)
+warnings.simplefilter(action='ignore', category=FutureWarning)
+
+import sklearn.datasets
+import sklearn.metrics
+
+from autoPyTorch.api.tabular_classification import TabularClassificationTask
+from autoPyTorch.datasets.resampling_strategy import HoldoutValTypes
+
+
+############################################################################
+# Data Loading
+# ============
+
+X, y = sklearn.datasets.fetch_openml(data_id=3, return_X_y=True, as_frame=True)
+X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
+    X, y, test_size=0.5, random_state=3
+)
+
+############################################################################
+# Define an estimator
+# ===================
+
+estimator = TabularClassificationTask(
+    resampling_strategy=HoldoutValTypes.holdout_validation,
+    resampling_strategy_args={'val_share': 0.5},
+)
+
+############################################################################
+# Get a configuration of the pipeline for the current dataset
+# ============================================================
+
+dataset = estimator.get_dataset(X_train=X_train,
+                                y_train=y_train,
+                                X_test=X_test,
+                                y_test=y_test,
+                                dataset_name='kr-vs-kp')
+configuration = estimator.get_search_space(dataset).get_default_configuration()
+
+print("Passed Configuration:", configuration)
+###########################################################################
+# Fit the configuration
+# =====================
+
+pipeline, run_info, 
run_value, dataset = estimator.fit_pipeline(dataset=dataset, + configuration=configuration, + budget_type='epochs', + budget=5, + run_time_limit_secs=75 + ) + +# The fit_pipeline command also returns a named tuple with the pipeline constraints +print(run_info) + +# The fit_pipeline command also returns a named tuple with train/test performance +print(run_value) + +# This object complies with Scikit-Learn Pipeline API. +# https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html +print(pipeline.named_steps) diff --git a/examples/40_advanced/example_visualization.py b/examples/40_advanced/example_visualization.py index 37c1c6dc3..a88899e81 100644 --- a/examples/40_advanced/example_visualization.py +++ b/examples/40_advanced/example_visualization.py @@ -149,18 +149,3 @@ grid=True, ) plt.show() - -# We then can understand the importance of each input feature using -# a permutation importance analysis. This is done as a proof of concept, to -# showcase that we can leverage of scikit-learn API. -result = permutation_importance(estimator, X_train, y_train, n_repeats=5, - scoring='accuracy', - random_state=seed) -sorted_idx = result.importances_mean.argsort() - -fig, ax = plt.subplots() -ax.boxplot(result.importances[sorted_idx].T, - vert=False, labels=X_test.columns[sorted_idx]) -ax.set_title("Permutation Importances (Train set)") -fig.tight_layout() -plt.show() diff --git a/requirements.txt b/requirements.txt index 6f81bfcb7..5582e1793 100755 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ imgaug>=0.4.0 ConfigSpace>=0.4.14,<0.5 pynisher>=0.6.3 pyrfr>=0.7,<0.9 -smac==0.14.0 +smac>=1.2 dask distributed>=2.2.0 catboost diff --git a/setup.py b/setup.py index 96cafefe9..e1e3d47e2 100755 --- a/setup.py +++ b/setup.py @@ -32,6 +32,7 @@ keywords="machine learning algorithm configuration hyperparameter" "optimization tuning neural architecture deep learning", packages=setuptools.find_packages(), + package_data={"autoPyTorch": ['py.typed']}, classifiers=[ "Development Status :: 3 - Alpha", "Topic :: Utilities", diff --git a/test/test_api/.tmp_api/runhistory.json b/test/test_api/.tmp_api/runhistory.json index 6f61e1395..28c0cbd32 100644 --- a/test/test_api/.tmp_api/runhistory.json +++ b/test/test_api/.tmp_api/runhistory.json @@ -705,6 +705,7 @@ "1": { "data_loader:batch_size": 64, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "ReduceLROnPlateau", @@ -737,6 +738,7 @@ "2": { "data_loader:batch_size": 101, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "PowerTransformer", "imputer:numerical_strategy": "most_frequent", "lr_scheduler:__choice__": "CyclicLR", @@ -801,6 +803,7 @@ "3": { "data_loader:batch_size": 242, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "RandomKitchenSinks", "imputer:numerical_strategy": "median", "lr_scheduler:__choice__": "NoScheduler", @@ -831,6 +834,7 @@ "4": { "data_loader:batch_size": 115, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "Nystroem", "imputer:numerical_strategy": "median", "lr_scheduler:__choice__": "CosineAnnealingLR", @@ -864,6 +868,7 @@ "5": { "data_loader:batch_size": 185, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": 
"RandomKitchenSinks", "imputer:numerical_strategy": "median", "lr_scheduler:__choice__": "ReduceLROnPlateau", @@ -904,6 +909,7 @@ "6": { "data_loader:batch_size": 95, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "RandomKitchenSinks", "imputer:numerical_strategy": "most_frequent", "lr_scheduler:__choice__": "ExponentialLR", @@ -937,6 +943,7 @@ "7": { "data_loader:batch_size": 119, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "Nystroem", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "StepLR", @@ -979,6 +986,7 @@ "8": { "data_loader:batch_size": 130, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "PolynomialFeatures", "imputer:numerical_strategy": "median", "lr_scheduler:__choice__": "CyclicLR", @@ -1032,6 +1040,7 @@ "9": { "data_loader:batch_size": 137, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "Nystroem", "imputer:numerical_strategy": "mean", "lr_scheduler:__choice__": "CosineAnnealingLR", diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index 5cb271eb0..4346ff2b6 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -2,8 +2,9 @@ import os import pathlib import pickle +import tempfile import unittest -from test.test_api.utils import dummy_do_dummy_prediction, dummy_eval_function +from test.test_api.utils import dummy_do_dummy_prediction, dummy_eval_train_function import ConfigSpace as CS from ConfigSpace.configuration_space import Configuration @@ -17,17 +18,18 @@ import sklearn import sklearn.datasets -from sklearn.base import BaseEstimator -from sklearn.base import clone +from sklearn.base import BaseEstimator, clone from sklearn.ensemble import VotingClassifier, VotingRegressor -from smac.runhistory.runhistory import RunHistory +from smac.runhistory.runhistory import RunHistory, RunInfo, RunValue from autoPyTorch.api.tabular_classification import TabularClassificationTask from autoPyTorch.api.tabular_regression import TabularRegressionTask +from autoPyTorch.datasets.base_dataset import BaseDataset from autoPyTorch.datasets.resampling_strategy import ( CrossValTypes, HoldoutValTypes, + NoResamplingStrategyTypes, ) from autoPyTorch.optimizer.smbo import AutoMLSMBO from autoPyTorch.pipeline.base_pipeline import BasePipeline @@ -41,8 +43,8 @@ # Test # ==== -@unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_function', - new=dummy_eval_function) +@unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_train_function', + new=dummy_eval_train_function) @pytest.mark.parametrize('openml_id', (40981, )) @pytest.mark.parametrize('resampling_strategy,resampling_strategy_args', ((HoldoutValTypes.holdout_validation, None), @@ -216,13 +218,10 @@ def test_tabular_classification(openml_id, resampling_strategy, backend, resampl # Make sure that a configuration space is stored in the estimator assert isinstance(estimator.get_search_space(), CS.ConfigurationSpace) - # test fit on dummy data - assert isinstance(estimator.fit(dataset=backend.load_datamanager()), BasePipeline) - @pytest.mark.parametrize('openml_name', ("boston", )) -@unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_function', - new=dummy_eval_function) +@unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_train_function', + new=dummy_eval_train_function) 
@pytest.mark.parametrize('resampling_strategy,resampling_strategy_args', ((HoldoutValTypes.holdout_validation, None), (CrossValTypes.k_fold_cross_validation, {'num_splits': CV_NUM_SPLITS}) @@ -467,7 +466,7 @@ def test_do_dummy_prediction(dask_client, fit_dictionary_tabular): estimator._all_supported_metrics = False with pytest.raises(ValueError, match=r".*Dummy prediction failed with run state.*"): - with unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_function') as dummy: + with unittest.mock.patch('autoPyTorch.evaluation.tae.eval_train_function') as dummy: dummy.side_effect = MemoryError estimator._do_dummy_prediction() @@ -498,8 +497,8 @@ def test_do_dummy_prediction(dask_client, fit_dictionary_tabular): del estimator -@unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_function', - new=dummy_eval_function) +@unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_train_function', + new=dummy_eval_train_function) @pytest.mark.parametrize('openml_id', (40981, )) def test_portfolio_selection(openml_id, backend, n_samples): @@ -540,8 +539,8 @@ def test_portfolio_selection(openml_id, backend, n_samples): assert any(successful_config in portfolio_configs for successful_config in successful_configs) -@unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_function', - new=dummy_eval_function) +@unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_train_function', + new=dummy_eval_train_function) @pytest.mark.parametrize('openml_id', (40981, )) def test_portfolio_selection_failure(openml_id, backend, n_samples): @@ -645,3 +644,289 @@ def test_build_pipeline(api_type, fit_dictionary_tabular): pipeline = api.build_pipeline(fit_dictionary_tabular['dataset_properties']) assert isinstance(pipeline, BaseEstimator) assert len(pipeline.steps) > 0 + + +@pytest.mark.parametrize("disable_file_output", [['all'], None]) +@pytest.mark.parametrize('openml_id', (40984,)) +@pytest.mark.parametrize('resampling_strategy,resampling_strategy_args', + ((HoldoutValTypes.holdout_validation, {'val_share': 0.8}), + (CrossValTypes.k_fold_cross_validation, {'num_splits': 2}), + (NoResamplingStrategyTypes.no_resampling, {}) + ) + ) +@pytest.mark.parametrize("budget", [15, 20]) +def test_pipeline_fit(openml_id, + resampling_strategy, + resampling_strategy_args, + backend, + disable_file_output, + budget, + n_samples): + # Get the data and check that contents of data-manager make sense + X, y = sklearn.datasets.fetch_openml( + data_id=int(openml_id), + return_X_y=True, as_frame=True + ) + X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X[:n_samples], y[:n_samples], random_state=1) + + # Search for a good configuration + estimator = TabularClassificationTask( + backend=backend, + resampling_strategy=resampling_strategy, + ensemble_size=0 + ) + + dataset = estimator.get_dataset(X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args) + + configuration = estimator.get_search_space(dataset).get_default_configuration() + pipeline, run_info, run_value, dataset = estimator.fit_pipeline(dataset=dataset, + configuration=configuration, + run_time_limit_secs=50, + disable_file_output=disable_file_output, + budget_type='epochs', + budget=budget + ) + assert isinstance(dataset, BaseDataset) + assert isinstance(run_info, RunInfo) + assert isinstance(run_info.config, Configuration) + + assert isinstance(run_value, RunValue) + assert 
'SUCCESS' in str(run_value.status) + + if disable_file_output is None: + if resampling_strategy in CrossValTypes: + assert isinstance(pipeline, BaseEstimator) + X_test = dataset.test_tensors[0] + preds = pipeline.predict_proba(X_test) + assert isinstance(preds, np.ndarray) + + score = accuracy(dataset.test_tensors[1], preds) + assert isinstance(score, float) + assert score > 0.65 + else: + assert isinstance(pipeline, BasePipeline) + # To make sure we fitted the model, there should be a + # run summary object with accuracy + run_summary = pipeline.named_steps['trainer'].run_summary + assert run_summary is not None + X_test = dataset.test_tensors[0] + preds = pipeline.predict(X_test) + assert isinstance(preds, np.ndarray) + + score = accuracy(dataset.test_tensors[1], preds) + assert isinstance(score, float) + assert score > 0.65 + else: + assert pipeline is None + assert run_value.cost < 0.35 + + # Make sure that the pipeline can be pickled + dump_file = os.path.join(tempfile.gettempdir(), 'automl.dump.pkl') + with open(dump_file, 'wb') as f: + pickle.dump(pipeline, f) + + num_run_dir = estimator._backend.get_numrun_directory( + run_info.seed, run_value.additional_info['num_run'], budget=float(budget)) + + cv_model_path = os.path.join(num_run_dir, estimator._backend.get_cv_model_filename( + run_info.seed, run_value.additional_info['num_run'], budget=float(budget))) + model_path = os.path.join(num_run_dir, estimator._backend.get_model_filename( + run_info.seed, run_value.additional_info['num_run'], budget=float(budget))) + + if disable_file_output: + # No file output is expected + assert not os.path.exists(num_run_dir) + else: + # We expect the model path always + # And the cv model only on 'cv' + assert os.path.exists(model_path) + if resampling_strategy in CrossValTypes: + assert os.path.exists(cv_model_path) + elif resampling_strategy in HoldoutValTypes: + assert not os.path.exists(cv_model_path) + + +@pytest.mark.parametrize('openml_id', (40984,)) +@pytest.mark.parametrize('resampling_strategy,resampling_strategy_args', + ((HoldoutValTypes.holdout_validation, {'val_share': 0.8}), + ) + ) +def test_pipeline_fit_error( + openml_id, + resampling_strategy, + resampling_strategy_args, + backend, + n_samples +): + # Get the data and check that contents of data-manager make sense + X, y = sklearn.datasets.fetch_openml( + data_id=int(openml_id), + return_X_y=True, as_frame=True + ) + X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X[:n_samples], y[:n_samples], random_state=1) + + # Search for a good configuration + estimator = TabularClassificationTask( + backend=backend, + resampling_strategy=resampling_strategy, + ) + + dataset = estimator.get_dataset(X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + resampling_strategy=resampling_strategy, + resampling_strategy_args=resampling_strategy_args) + + configuration = estimator.get_search_space(dataset).get_default_configuration() + pipeline, run_info, run_value, dataset = estimator.fit_pipeline(dataset=dataset, + configuration=configuration, + run_time_limit_secs=7, + ) + + assert 'TIMEOUT' in str(run_value.status) + assert pipeline is None + + +@pytest.mark.parametrize('openml_id', (40981, )) +def test_tabular_classification_test_evaluator(openml_id, backend, n_samples): + + # Get the data and check that contents of data-manager make sense + X, y = sklearn.datasets.fetch_openml( + data_id=int(openml_id), + return_X_y=True, as_frame=True + ) + X, y = X.iloc[:n_samples], y.iloc[:n_samples] + + 
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=42) + + # Search for a good configuration + estimator = TabularClassificationTask( + backend=backend, + resampling_strategy=NoResamplingStrategyTypes.no_resampling, + seed=42, + ensemble_size=0 + ) + + with unittest.mock.patch.object(estimator, '_do_dummy_prediction', new=dummy_do_dummy_prediction): + estimator.search( + X_train=X_train, y_train=y_train, + X_test=X_test, y_test=y_test, + optimize_metric='accuracy', + total_walltime_limit=50, + func_eval_time_limit_secs=20, + enable_traditional_pipeline=False, + ) + + # Internal dataset has expected settings + assert estimator.dataset.task_type == 'tabular_classification' + + assert estimator.resampling_strategy == NoResamplingStrategyTypes.no_resampling + assert estimator.dataset.resampling_strategy == NoResamplingStrategyTypes.no_resampling + # Check for the created files + tmp_dir = estimator._backend.temporary_directory + loaded_datamanager = estimator._backend.load_datamanager() + assert len(loaded_datamanager.train_tensors) == len(estimator.dataset.train_tensors) + + expected_files = [ + 'smac3-output/run_42/configspace.json', + 'smac3-output/run_42/runhistory.json', + 'smac3-output/run_42/scenario.txt', + 'smac3-output/run_42/stats.json', + 'smac3-output/run_42/train_insts.txt', + 'smac3-output/run_42/trajectory.json', + '.autoPyTorch/datamanager.pkl', + '.autoPyTorch/start_time_42', + ] + for expected_file in expected_files: + assert os.path.exists(os.path.join(tmp_dir, expected_file)), "{}/{}/{}".format( + tmp_dir, + [data for data in pathlib.Path(tmp_dir).glob('*')], + expected_file, + ) + + # Check that smac was able to find proper models + succesful_runs = [run_value.status for run_value in estimator.run_history.data.values( + ) if 'SUCCESS' in str(run_value.status)] + assert len(succesful_runs) > 1, [(k, v) for k, v in estimator.run_history.data.items()] + + # Search for an existing run key in disc. 
A individual model might have + # a timeout and hence was not written to disc + successful_num_run = None + SUCCESS = False + for i, (run_key, value) in enumerate(estimator.run_history.data.items()): + if 'SUCCESS' in str(value.status): + run_key_model_run_dir = estimator._backend.get_numrun_directory( + estimator.seed, run_key.config_id + 1, run_key.budget) + successful_num_run = run_key.config_id + 1 + if os.path.exists(run_key_model_run_dir): + # Runkey config id is different from the num_run + # more specifically num_run = config_id + 1(dummy) + SUCCESS = True + break + + assert SUCCESS, f"Successful run was not properly saved for num_run: {successful_num_run}" + + model_file = os.path.join(run_key_model_run_dir, + f"{estimator.seed}.{successful_num_run}.{run_key.budget}.model") + assert os.path.exists(model_file), model_file + + # Make sure that predictions on the test data are printed and make sense + test_prediction = os.path.join(run_key_model_run_dir, + estimator._backend.get_prediction_filename( + 'test', estimator.seed, successful_num_run, + run_key.budget)) + assert os.path.exists(test_prediction), test_prediction + assert np.shape(np.load(test_prediction, allow_pickle=True))[0] == np.shape(X_test)[0] + + y_pred = estimator.predict(X_test) + assert np.shape(y_pred)[0] == np.shape(X_test)[0] + + # Make sure that predict proba has the expected shape + probabilites = estimator.predict_proba(X_test) + assert np.shape(probabilites) == (np.shape(X_test)[0], 2) + + score = estimator.score(y_pred, y_test) + assert 'accuracy' in score + + # check incumbent config and results + incumbent_config, incumbent_results = estimator.get_incumbent_results() + assert isinstance(incumbent_config, Configuration) + assert isinstance(incumbent_results, dict) + assert 'opt_loss' in incumbent_results, "run history: {}, successful_num_run: {}".format(estimator.run_history.data, + successful_num_run) + assert 'train_loss' in incumbent_results + + +@pytest.mark.parametrize("ans,task_class", ( + ("continuous", TabularRegressionTask), + ("multiclass", TabularClassificationTask)) +) +def test_task_inference(ans, task_class, backend): + # Get the data and check that contents of data-manager make sense + X = np.random.random((6, 1)) + y = np.array([-10 ** 12, 0, 1, 2, 3, 4], dtype=np.int64) + 10 ** 12 + + estimator = task_class( + backend=backend, + resampling_strategy=HoldoutValTypes.holdout_validation, + resampling_strategy_args=None, + seed=42, + ) + dataset = estimator.get_dataset(X, y) + assert dataset.output_type == ans + + y += 10 ** 12 + 10 # Check if the function catches overflow possibilities + if ans == 'continuous': + with pytest.raises(ValueError): # ValueError due to `Too large value` + estimator.get_dataset(X, y) + else: + estimator.get_dataset(X, y) diff --git a/test/test_api/test_base_api.py b/test/test_api/test_base_api.py index 126b702e6..f487ad5ea 100644 --- a/test/test_api/test_base_api.py +++ b/test/test_api/test_base_api.py @@ -12,6 +12,7 @@ from autoPyTorch.api.base_task import BaseTask, _pipeline_predict from autoPyTorch.constants import TABULAR_CLASSIFICATION, TABULAR_REGRESSION +from autoPyTorch.datasets.resampling_strategy import NoResamplingStrategyTypes from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline @@ -20,6 +21,7 @@ # ==== @pytest.mark.parametrize("fit_dictionary_tabular", ['classification_categorical_only'], indirect=True) def test_nonsupported_arguments(fit_dictionary_tabular): + BaseTask.__abstractmethods__ = set() with 
pytest.raises(ValueError, match=r".*Expected search space updates to be of instance.*"): api = BaseTask(search_space_updates='None') @@ -82,6 +84,7 @@ def test_pipeline_predict_function(): @pytest.mark.parametrize("fit_dictionary_tabular", ['classification_categorical_only'], indirect=True) def test_show_models(fit_dictionary_tabular): + BaseTask.__abstractmethods__ = set() api = BaseTask() api.ensemble_ = MagicMock() api.models_ = [TabularClassificationPipeline(dataset_properties=fit_dictionary_tabular['dataset_properties'])] @@ -94,6 +97,7 @@ def test_show_models(fit_dictionary_tabular): def test_set_pipeline_config(): # checks if we can correctly change the pipeline options + BaseTask.__abstractmethods__ = set() estimator = BaseTask() pipeline_options = {"device": "cuda", "budget_type": "epochs", @@ -110,6 +114,7 @@ def test_set_pipeline_config(): (3, 50, 'runtime', {'budget_type': 'runtime', 'runtime': 50}), ]) def test_pipeline_get_budget(fit_dictionary_tabular, min_budget, max_budget, budget_type, expected): + BaseTask.__abstractmethods__ = set() estimator = BaseTask(task_type='tabular_classification', ensemble_size=0) # Fixture pipeline config @@ -139,3 +144,19 @@ def test_pipeline_get_budget(fit_dictionary_tabular, min_budget, max_budget, bud assert list(smac_mock.call_args)[1]['ta_kwargs']['pipeline_config'] == default_pipeline_config assert list(smac_mock.call_args)[1]['max_budget'] == max_budget assert list(smac_mock.call_args)[1]['initial_budget'] == min_budget + + +def test_no_resampling_error(backend): + """ + Checks if an error is raised when trying to construct ensemble + using `NoResamplingStrategy`. + """ + BaseTask.__abstractmethods__ = set() + + with pytest.raises(ValueError, match=r"`NoResamplingStrategy` cannot be used for ensemble construction"): + BaseTask( + backend=backend, + resampling_strategy=NoResamplingStrategyTypes.no_resampling, + seed=42, + ensemble_size=1 + ) diff --git a/test/test_api/test_results_manager.py b/test/test_api/test_results_manager.py deleted file mode 100644 index 4c6e7a7ae..000000000 --- a/test/test_api/test_results_manager.py +++ /dev/null @@ -1,232 +0,0 @@ -import json -import os -from test.test_api.utils import make_dict_run_history_data -from unittest.mock import MagicMock - -import ConfigSpace.hyperparameters as CSH -from ConfigSpace.configuration_space import Configuration, ConfigurationSpace - -import numpy as np - -import pytest - -from smac.runhistory.runhistory import RunHistory, StatusType - -from autoPyTorch.api.base_task import BaseTask -from autoPyTorch.api.results_manager import ResultsManager, STATUS2MSG, SearchResults, cost2metric -from autoPyTorch.metrics import accuracy, balanced_accuracy, log_loss - - -def _check_status(status): - """ Based on runhistory_B.json """ - ans = [ - STATUS2MSG[StatusType.SUCCESS], STATUS2MSG[StatusType.SUCCESS], - STATUS2MSG[StatusType.SUCCESS], STATUS2MSG[StatusType.SUCCESS], - STATUS2MSG[StatusType.SUCCESS], STATUS2MSG[StatusType.SUCCESS], - STATUS2MSG[StatusType.CRASHED], STATUS2MSG[StatusType.SUCCESS], - STATUS2MSG[StatusType.SUCCESS], STATUS2MSG[StatusType.SUCCESS], - STATUS2MSG[StatusType.SUCCESS], STATUS2MSG[StatusType.SUCCESS], - STATUS2MSG[StatusType.SUCCESS], STATUS2MSG[StatusType.SUCCESS], - STATUS2MSG[StatusType.TIMEOUT], STATUS2MSG[StatusType.TIMEOUT], - ] - assert isinstance(status, list) - assert isinstance(status[0], str) - assert status == ans - - -def _check_costs(costs): - """ Based on runhistory_B.json """ - ans = [0.15204678362573099, 0.4444444444444444, 
0.5555555555555556, 0.29824561403508776, - 0.4444444444444444, 0.4444444444444444, 1.0, 0.5555555555555556, 0.4444444444444444, - 0.15204678362573099, 0.15204678362573099, 0.4035087719298246, 0.4444444444444444, - 0.4444444444444444, 1.0, 1.0] - assert np.allclose(1 - np.array(costs), ans) - assert isinstance(costs, np.ndarray) - assert costs.dtype is np.dtype(np.float) - - -def _check_fit_times(fit_times): - """ Based on runhistory_B.json """ - ans = [3.154788017272949, 3.2763524055480957, 22.723600149154663, 4.990685224533081, 10.684926509857178, - 9.947429180145264, 11.687273979187012, 8.478890419006348, 5.485020637512207, 11.514830589294434, - 15.370736837387085, 23.846530199050903, 6.757539510726929, 15.061991930007935, 50.010520696640015, - 22.011935234069824] - - assert np.allclose(fit_times, ans) - assert isinstance(fit_times, np.ndarray) - assert fit_times.dtype is np.dtype(np.float) - - -def _check_budgets(budgets): - """ Based on runhistory_B.json """ - ans = [5.555555555555555, 5.555555555555555, 5.555555555555555, 5.555555555555555, - 5.555555555555555, 5.555555555555555, 5.555555555555555, 5.555555555555555, - 5.555555555555555, 16.666666666666664, 50.0, 16.666666666666664, 16.666666666666664, - 16.666666666666664, 50.0, 50.0] - assert np.allclose(budgets, ans) - assert isinstance(budgets, list) - assert isinstance(budgets[0], float) - - -def _check_additional_infos(status_types, additional_infos): - for i, status in enumerate(status_types): - info = additional_infos[i] - if status in (STATUS2MSG[StatusType.SUCCESS], STATUS2MSG[StatusType.DONOTADVANCE]): - metric_info = info.get('opt_loss', None) - assert metric_info is not None - elif info is not None: - metric_info = info.get('opt_loss', None) - assert metric_info is None - - -def _check_metric_dict(metric_dict, status_types): - assert isinstance(metric_dict['accuracy'], list) - assert metric_dict['accuracy'][0] > 0 - assert isinstance(metric_dict['balanced_accuracy'], list) - assert metric_dict['balanced_accuracy'][0] > 0 - - for key, vals in metric_dict.items(): - # ^ is a XOR operator - # True and False / False and True must be fulfilled - assert all([(s == STATUS2MSG[StatusType.SUCCESS]) ^ isnan - for s, isnan in zip(status_types, np.isnan(vals))]) - - -def test_extract_results_from_run_history(): - # test the raise error for the `status_msg is None` - run_history = RunHistory() - cs = ConfigurationSpace() - config = Configuration(cs, {}) - run_history.add( - config=config, - cost=0.0, - time=1.0, - status=StatusType.CAPPED, - ) - with pytest.raises(ValueError) as excinfo: - SearchResults(metric=accuracy, scoring_functions=[], run_history=run_history) - - assert excinfo._excinfo[0] == ValueError - - -def test_search_results_sprint_statistics(): - api = BaseTask() - for method in ['get_search_results', 'sprint_statistics', 'get_incumbent_results']: - with pytest.raises(RuntimeError) as excinfo: - getattr(api, method)() - - assert excinfo._excinfo[0] == RuntimeError - - run_history_data = json.load(open(os.path.join(os.path.dirname(__file__), - '.tmp_api/runhistory_B.json'), - mode='r'))['data'] - api._results_manager.run_history = MagicMock() - api.run_history.empty = MagicMock(return_value=False) - - # The run_history has 16 runs + 1 run interruption ==> 16 runs - api.run_history.data = make_dict_run_history_data(run_history_data) - api._metric = accuracy - api.dataset_name = 'iris' - api._scoring_functions = [accuracy, balanced_accuracy] - api.search_space = MagicMock(spec=ConfigurationSpace) - search_results = 
api.get_search_results() - - _check_status(search_results.status_types) - _check_costs(search_results.opt_scores) - _check_fit_times(search_results.fit_times) - _check_budgets(search_results.budgets) - _check_metric_dict(search_results.metric_dict, search_results.status_types) - _check_additional_infos(status_types=search_results.status_types, - additional_infos=search_results.additional_infos) - - # config_ids can duplicate because of various budget size - config_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 1, 10, 11, 12, 10, 13] - assert config_ids == search_results.config_ids - - # assert that contents of search_results are of expected types - assert isinstance(search_results.rank_test_scores, np.ndarray) - assert search_results.rank_test_scores.dtype is np.dtype(np.int) - assert isinstance(search_results.configs, list) - - n_success, n_timeout, n_memoryout, n_crashed = 13, 2, 0, 1 - msg = ["autoPyTorch results:", f"\tDataset name: {api.dataset_name}", - f"\tOptimisation Metric: {api._metric.name}", - f"\tBest validation score: {max(search_results.opt_scores)}", - "\tNumber of target algorithm runs: 16", f"\tNumber of successful target algorithm runs: {n_success}", - f"\tNumber of crashed target algorithm runs: {n_crashed}", - f"\tNumber of target algorithms that exceeded the time limit: {n_timeout}", - f"\tNumber of target algorithms that exceeded the memory limit: {n_memoryout}"] - - assert isinstance(api.sprint_statistics(), str) - assert all([m1 == m2 for m1, m2 in zip(api.sprint_statistics().split("\n"), msg)]) - - -@pytest.mark.parametrize('run_history', (None, RunHistory())) -def test_check_run_history(run_history): - manager = ResultsManager() - manager.run_history = run_history - - with pytest.raises(RuntimeError) as excinfo: - manager._check_run_history() - - assert excinfo._excinfo[0] == RuntimeError - - -T, NT = 'traditional', 'non-traditional' -SCORES = [0.1 * (i + 1) for i in range(10)] - - -@pytest.mark.parametrize('include_traditional', (True, False)) -@pytest.mark.parametrize('metric', (accuracy, log_loss)) -@pytest.mark.parametrize('origins', ([T] * 5 + [NT] * 5, [T, NT] * 5, [NT] * 5 + [T] * 5)) -@pytest.mark.parametrize('scores', (SCORES, SCORES[::-1])) -def test_get_incumbent_results(include_traditional, metric, origins, scores): - manager = ResultsManager() - cs = ConfigurationSpace() - cs.add_hyperparameter(CSH.UniformFloatHyperparameter('a', lower=0, upper=1)) - - configs = [0.1 * (i + 1) for i in range(len(scores))] - if metric.name == "log_loss": - # This is to detect mis-computation in reversion - metric._optimum = 0.1 - - best_cost, best_idx = np.inf, -1 - for idx, (a, origin, score) in enumerate(zip(configs, origins, scores)): - config = Configuration(cs, {'a': a}) - - # conversion defined in: - # autoPyTorch/pipeline/components/training/metrics/utils.py::calculate_loss - cost = metric._optimum - metric._sign * score - manager.run_history.add( - config=config, - cost=cost, - time=1.0, - status=StatusType.SUCCESS, - additional_info={'opt_loss': {metric.name: score}, - 'configuration_origin': origin} - ) - if cost > best_cost: - continue - - if include_traditional: - best_cost, best_idx = cost, idx - elif origin != T: - best_cost, best_idx = cost, idx - - incumbent_config, incumbent_results = manager.get_incumbent_results( - metric=metric, - include_traditional=include_traditional - ) - - assert isinstance(incumbent_config, Configuration) - assert isinstance(incumbent_results, dict) - best_score, best_a = scores[best_idx], configs[best_idx] - assert np.allclose( - 
[best_score, best_score, best_a], - [cost2metric(best_cost, metric), - incumbent_results['opt_loss'][metric.name], - incumbent_config['a']] - ) - - if not include_traditional: - assert incumbent_results['configuration_origin'] != T diff --git a/test/test_api/utils.py b/test/test_api/utils.py index a8c258fe9..f8a11db88 100644 --- a/test/test_api/utils.py +++ b/test/test_api/utils.py @@ -69,7 +69,7 @@ def _fit_and_predict(self, pipeline, fold: int, train_indices, # create closure for evaluating an algorithm -def dummy_eval_function( +def dummy_eval_train_function( backend, queue, metric, diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py index 7f2ff2507..3d352d765 100644 --- a/test/test_data/test_feature_validator.py +++ b/test/test_data/test_feature_validator.py @@ -13,6 +13,7 @@ import sklearn.model_selection from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator +from autoPyTorch.data.utils import megabytes # Fixtures to be used in this class. By default all elements have 100 datapoints @@ -557,3 +558,47 @@ def test_comparator(): key=functools.cmp_to_key(validator._comparator) ) assert ans == feat_type + + +# Actual checks for the features +@pytest.mark.parametrize( + 'input_data_featuretest', + ( + 'numpy_numericalonly_nonan', + 'numpy_numericalonly_nan', + 'numpy_mixed_nan', + 'pandas_numericalonly_nan', + 'sparse_bsr_nonan', + 'sparse_bsr_nan', + 'sparse_coo_nonan', + 'sparse_coo_nan', + 'sparse_csc_nonan', + 'sparse_csc_nan', + 'sparse_csr_nonan', + 'sparse_csr_nan', + 'sparse_dia_nonan', + 'sparse_dia_nan', + 'sparse_dok_nonan', + 'sparse_dok_nan', + 'openml_40981', # Australian + ), + indirect=True +) +def test_featurevalidator_reduce_precision(input_data_featuretest): + X_train, X_test = sklearn.model_selection.train_test_split( + input_data_featuretest, test_size=0.1, random_state=1) + validator = TabularFeatureValidator(dataset_compression={'memory_allocation': 0, 'methods': ['precision']}) + validator.fit(X_train=X_train) + transformed_X_train = validator.transform(X_train.copy()) + + assert validator._reduced_dtype is not None + assert megabytes(transformed_X_train) < megabytes(X_train) + + transformed_X_test = validator.transform(X_test.copy()) + assert megabytes(transformed_X_test) < megabytes(X_test) + if hasattr(transformed_X_train, 'iloc'): + assert all(transformed_X_train.dtypes == transformed_X_test.dtypes) + assert all(transformed_X_train.dtypes == validator._precision) + else: + assert transformed_X_train.dtype == transformed_X_test.dtype + assert transformed_X_test.dtype == validator._reduced_dtype diff --git a/test/test_data/test_target_validator.py b/test/test_data/test_target_validator.py index aadc73416..8fd4527d9 100644 --- a/test/test_data/test_target_validator.py +++ b/test/test_data/test_target_validator.py @@ -150,17 +150,17 @@ def test_targetvalidator_supported_types_noclassification(input_data_targettest) assert validator.encoder is None if hasattr(input_data_targettest, "iloc"): - np.testing.assert_array_equal( + assert np.allclose( np.ravel(input_data_targettest.to_numpy()), np.ravel(transformed_y) ) elif sparse.issparse(input_data_targettest): - np.testing.assert_array_equal( + assert np.allclose( np.ravel(input_data_targettest.todense()), np.ravel(transformed_y.todense()) ) else: - np.testing.assert_array_equal( + assert np.allclose( np.ravel(np.array(input_data_targettest)), np.ravel(transformed_y) ) diff --git a/test/test_data/test_utils.py b/test/test_data/test_utils.py new file 
mode 100644 index 000000000..505860a94 --- /dev/null +++ b/test/test_data/test_utils.py @@ -0,0 +1,127 @@ +from typing import Mapping + +import numpy as np + +from pandas.testing import assert_frame_equal + +import pytest + +from sklearn.datasets import fetch_openml + +from autoPyTorch.data.utils import ( + default_dataset_compression_arg, + get_dataset_compression_mapping, + megabytes, + reduce_dataset_size_if_too_large, + reduce_precision, + validate_dataset_compression_arg +) +from autoPyTorch.utils.common import subsampler + + +@pytest.mark.parametrize('openmlid', [2, 40984]) +@pytest.mark.parametrize('as_frame', [True, False]) +def test_reduce_dataset_if_too_large(openmlid, as_frame, n_samples): + X, _ = fetch_openml(data_id=openmlid, return_X_y=True, as_frame=as_frame) + X = subsampler(data=X, x=range(n_samples)) + + X_converted = reduce_dataset_size_if_too_large(X.copy(), memory_allocation=0) + np.allclose(X, X_converted) if not as_frame else assert_frame_equal(X, X_converted, check_dtype=False) + assert megabytes(X_converted) < megabytes(X) + + +def test_validate_dataset_compression_arg(): + + data_compression_args = validate_dataset_compression_arg({}, 10) + # check whether the function uses default args + # to fill in case args is empty + assert data_compression_args is not None + + # assert memory allocation is an integer after validation + assert isinstance(data_compression_args['memory_allocation'], int) + + # check whether the function raises an error + # in case an unknown key is in args + with pytest.raises(ValueError, match=r'Unknown key in dataset_compression, .*'): + validate_dataset_compression_arg({'not_there': 1}, 1) + + # check whether the function raises an error + # in case memory_allocation is not int or float is in args + with pytest.raises(ValueError, match=r"key 'memory_allocation' must be an `int` or `float`.*"): + validate_dataset_compression_arg({'memory_allocation': 'not int'}, 1) + + # check whether the function raises an error + # in case memory_allocation is an int greater than memory limit + with pytest.raises(ValueError, match=r"key 'memory_allocation' if int must be in.*"): + validate_dataset_compression_arg({'memory_allocation': 1}, 0) + + # check whether the function raises an error + # in case memory_allocation is a float greater than 1 + with pytest.raises(ValueError, match=r"key 'memory_allocation' if float must be in.*"): + validate_dataset_compression_arg({'memory_allocation': 1.5}, 0) + + # check whether the function raises an error + # in case an unknown method is passed in args + with pytest.raises(ValueError, match=r"key 'methods' can only contain .*"): + validate_dataset_compression_arg({'methods': 'unknown'}, 1) + + # check whether the function raises an error + # in case an unknown key is in args + with pytest.raises(ValueError, match=r'Unknown type for `dataset_compression` .*'): + validate_dataset_compression_arg(1, 1) + + +def test_error_raised_reduce_precision(): + # check whether the function raises an error + # in case X is not an expected type + with pytest.raises(ValueError, match=r'Unrecognised data type of X, expected data type to .*'): + reduce_precision(X='not expected') + + +def _verify_dataset_compression_mapping(mapping, expected_mapping): + assert isinstance(mapping, Mapping) + assert 'methods' in mapping + assert 'memory_allocation' in mapping + assert mapping == expected_mapping + + +@pytest.mark.parametrize('memory_limit', [2048]) +def test_get_dataset_compression_mapping(memory_limit): + """ + Tests the 
functionalities of `get_dataset_compression_mapping` + """ + dataset_compression_mapping = get_dataset_compression_mapping( + dataset_compression=True, + memory_limit=memory_limit) + # validation converts the memory allocation from float to integer based on the memory limit + expected_mapping = validate_dataset_compression_arg(default_dataset_compression_arg, memory_limit) + _verify_dataset_compression_mapping(dataset_compression_mapping, expected_mapping) + + mapping = {'memory_allocation': 0.01, 'methods': ['precision']} + dataset_compression_mapping = get_dataset_compression_mapping( + dataset_compression=mapping, + memory_limit=memory_limit + ) + expected_mapping = validate_dataset_compression_arg(mapping, memory_limit) + _verify_dataset_compression_mapping(dataset_compression_mapping, expected_mapping) + + dataset_compression_mapping = get_dataset_compression_mapping( + dataset_compression=False, + memory_limit=memory_limit + ) + assert dataset_compression_mapping is None + + +def test_unsupported_errors(): + """ + Checks if errors are raised when unsupported data is passed to reduce + """ + X = np.array([ + ['a', 'b', 'c', 'a', 'b', 'c'], + ['a', 'b', 'd', 'r', 'b', 'c']]) + with pytest.raises(ValueError, match=r'X.dtype = .*'): + reduce_dataset_size_if_too_large(X, 0) + + X = [[1, 2], [2, 3]] + with pytest.raises(ValueError, match=r'Unrecognised data type of X, expected data type to be in .*'): + reduce_dataset_size_if_too_large(X, 0) diff --git a/test/test_data/test_validation.py b/test/test_data/test_validation.py index 482c99769..cc89f5276 100644 --- a/test/test_data/test_validation.py +++ b/test/test_data/test_validation.py @@ -103,7 +103,7 @@ def test_sparse_data_validation_for_regression(): validator.fit(X_train=X_sp, y_train=y) - X_t, y_t = validator.transform(X, y) + X_t, y_t = validator.transform(X_sp, y) assert np.shape(X) == np.shape(X_t) # make sure everything was encoded to number diff --git a/test/test_datasets/test_base_dataset.py b/test/test_datasets/test_base_dataset.py new file mode 100644 index 000000000..52b2fa9a5 --- /dev/null +++ b/test/test_datasets/test_base_dataset.py @@ -0,0 +1,19 @@ +import numpy as np + +import pytest + +from autoPyTorch.datasets.base_dataset import _get_output_properties + + +@pytest.mark.parametrize( + "target_labels,dim,task_type", ( + (np.arange(5), 5, "multiclass"), + (np.linspace(0, 1, 3), 1, "continuous"), + (np.linspace(0, 1, 3)[:, np.newaxis], 1, "continuous") + ) +) +def test_get_output_properties(target_labels, dim, task_type): + train_tensors = np.array([np.empty_like(target_labels), target_labels]) + output_dim, output_type = _get_output_properties(train_tensors) + assert output_dim == dim + assert output_type == task_type diff --git a/test/test_datasets/test_tabular_dataset.py b/test/test_datasets/test_tabular_dataset.py index 409e6bdec..2ee8b608e 100644 --- a/test/test_datasets/test_tabular_dataset.py +++ b/test/test_datasets/test_tabular_dataset.py @@ -2,6 +2,9 @@ import pytest +from autoPyTorch.data.tabular_validator import TabularInputValidator +from autoPyTorch.datasets.base_dataset import TransformSubset +from autoPyTorch.datasets.resampling_strategy import CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes from autoPyTorch.datasets.tabular_dataset import TabularDataset from autoPyTorch.utils.pipeline import get_dataset_requirements @@ -46,3 +49,34 @@ def test_get_dataset_properties(backend, fit_dictionary_tabular): def test_not_supported(): with pytest.raises(ValueError, match=r".*A feature validator is 
required to build.*"): TabularDataset(np.ones(10), np.ones(10)) + + +@pytest.mark.parametrize('resampling_strategy', + (HoldoutValTypes.holdout_validation, + CrossValTypes.k_fold_cross_validation, + NoResamplingStrategyTypes.no_resampling + )) +def test_get_dataset(resampling_strategy, n_samples): + """ + Checks the functionality of get_dataset function of the TabularDataset + gives an error when trying to get training and validation subset + """ + X = np.zeros(shape=(n_samples, 4)) + Y = np.ones(n_samples) + validator = TabularInputValidator(is_classification=True) + validator.fit(X, Y) + dataset = TabularDataset( + resampling_strategy=resampling_strategy, + X=X, + Y=Y, + validator=validator + ) + transform_subset = dataset.get_dataset(split_id=0, train=True) + assert isinstance(transform_subset, TransformSubset) + + if isinstance(resampling_strategy, NoResamplingStrategyTypes): + with pytest.raises(ValueError): + dataset.get_dataset(split_id=0, train=False) + else: + transform_subset = dataset.get_dataset(split_id=0, train=False) + assert isinstance(transform_subset, TransformSubset) diff --git a/test/test_evaluation/test_abstract_evaluator.py b/test/test_evaluation/test_abstract_evaluator.py index 6cec57fb4..a0be2c3f3 100644 --- a/test/test_evaluation/test_abstract_evaluator.py +++ b/test/test_evaluation/test_abstract_evaluator.py @@ -13,6 +13,7 @@ from autoPyTorch.automl_common.common.utils.backend import Backend, BackendContext from autoPyTorch.evaluation.abstract_evaluator import AbstractEvaluator +from autoPyTorch.evaluation.utils import DisableFileOutputParameters from autoPyTorch.pipeline.components.training.metrics.metrics import accuracy this_directory = os.path.dirname(__file__) @@ -129,7 +130,7 @@ def test_disable_file_output(self): ae = AbstractEvaluator( backend=self.backend_mock, queue=queue_mock, - disable_file_output=True, + disable_file_output=[DisableFileOutputParameters.all], metric=accuracy, logger_port=unittest.mock.Mock(), budget=0, @@ -314,3 +315,35 @@ def test_error_unsupported_budget_type(self): self.assertIsInstance(e, ValueError) shutil.rmtree(self.working_directory, ignore_errors=True) + + def test_error_unsupported_disable_file_output_parameters(self): + shutil.rmtree(self.working_directory, ignore_errors=True) + os.mkdir(self.working_directory) + + queue_mock = unittest.mock.Mock() + + context = BackendContext( + prefix='autoPyTorch', + temporary_directory=os.path.join(self.working_directory, 'tmp'), + output_directory=os.path.join(self.working_directory, 'out'), + delete_tmp_folder_after_terminate=True, + delete_output_folder_after_terminate=True, + ) + with unittest.mock.patch.object(Backend, 'load_datamanager') as load_datamanager_mock: + load_datamanager_mock.return_value = get_multiclass_classification_datamanager() + + backend = Backend(context, prefix='autoPyTorch') + + try: + AbstractEvaluator( + backend=backend, + output_y_hat_optimization=False, + queue=queue_mock, + metric=accuracy, + budget=0, + configuration=1, + disable_file_output=['model']) + except Exception as e: + self.assertIsInstance(e, ValueError) + + shutil.rmtree(self.working_directory, ignore_errors=True) diff --git a/test/test_evaluation/test_evaluation.py b/test/test_evaluation/test_evaluation.py index 222755b6e..2cabb6a73 100644 --- a/test/test_evaluation/test_evaluation.py +++ b/test/test_evaluation/test_evaluation.py @@ -92,13 +92,14 @@ def run_over_time(): ############################################################################ # Test ExecuteTaFuncWithQueue.run_wrapper() - 
@unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_function') + @unittest.mock.patch('autoPyTorch.evaluation.tae.eval_train_function') def test_eval_with_limits_holdout(self, pynisher_mock): pynisher_mock.side_effect = safe_eval_success_mock config = unittest.mock.Mock() config.config_id = 198 ta = ExecuteTaFuncWithQueue(backend=BackendMock(), seed=1, stats=self.stats, + multi_objectives=["cost"], memory_limit=3072, metric=accuracy, cost_for_crash=get_cost_of_crash(accuracy), @@ -106,7 +107,7 @@ def test_eval_with_limits_holdout(self, pynisher_mock): logger_port=self.logger_port, pynisher_context='fork', ) - info = ta.run_wrapper(RunInfo(config=config, cutoff=30, instance=None, + info = ta.run_wrapper(RunInfo(config=config, cutoff=2000000, instance=None, instance_specific=None, seed=1, capped=False)) self.assertEqual(info[0].config.config_id, 198) self.assertEqual(info[1].status, StatusType.SUCCESS, info) @@ -120,6 +121,7 @@ def test_cutoff_lower_than_remaining_time(self, pynisher_mock): ta = ExecuteTaFuncWithQueue(backend=BackendMock(), seed=1, stats=self.stats, memory_limit=3072, + multi_objectives=["cost"], metric=accuracy, cost_for_crash=get_cost_of_crash(accuracy), abort_on_first_run_crash=False, @@ -146,6 +148,7 @@ def test_eval_with_limits_holdout_fail_timeout(self, pynisher_mock): ta = ExecuteTaFuncWithQueue(backend=BackendMock(), seed=1, stats=self.stats, memory_limit=3072, + multi_objectives=["cost"], metric=accuracy, cost_for_crash=get_cost_of_crash(accuracy), abort_on_first_run_crash=False, @@ -166,6 +169,7 @@ def test_zero_or_negative_cutoff(self, pynisher_mock): ta = ExecuteTaFuncWithQueue(backend=BackendMock(), seed=1, stats=self.stats, memory_limit=3072, + multi_objectives=["cost"], metric=accuracy, cost_for_crash=get_cost_of_crash(accuracy), abort_on_first_run_crash=False, @@ -178,7 +182,7 @@ def test_zero_or_negative_cutoff(self, pynisher_mock): instance_specific=None, seed=1, capped=False)) self.assertEqual(run_value.status, StatusType.STOP) - @unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_function') + @unittest.mock.patch('autoPyTorch.evaluation.tae.eval_train_function') def test_eval_with_limits_holdout_fail_silent(self, pynisher_mock): pynisher_mock.return_value = None config = unittest.mock.Mock() @@ -187,6 +191,7 @@ def test_eval_with_limits_holdout_fail_silent(self, pynisher_mock): ta = ExecuteTaFuncWithQueue(backend=BackendMock(), seed=1, stats=self.stats, memory_limit=3072, + multi_objectives=["cost"], metric=accuracy, cost_for_crash=get_cost_of_crash(accuracy), abort_on_first_run_crash=False, @@ -220,7 +225,7 @@ def test_eval_with_limits_holdout_fail_silent(self, pynisher_mock): 'subprocess_stdout': '', 'subprocess_stderr': ''}) - @unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_function') + @unittest.mock.patch('autoPyTorch.evaluation.tae.eval_train_function') def test_eval_with_limits_holdout_fail_memory_error(self, pynisher_mock): pynisher_mock.side_effect = MemoryError config = unittest.mock.Mock() @@ -228,6 +233,7 @@ def test_eval_with_limits_holdout_fail_memory_error(self, pynisher_mock): ta = ExecuteTaFuncWithQueue(backend=BackendMock(), seed=1, stats=self.stats, memory_limit=3072, + multi_objectives=["cost"], metric=accuracy, cost_for_crash=get_cost_of_crash(accuracy), abort_on_first_run_crash=False, @@ -266,6 +272,7 @@ def side_effect(**kwargs): ta = ExecuteTaFuncWithQueue(backend=BackendMock(), seed=1, stats=self.stats, memory_limit=3072, + multi_objectives=["cost"], metric=accuracy, 
cost_for_crash=get_cost_of_crash(accuracy), abort_on_first_run_crash=False, @@ -289,6 +296,7 @@ def side_effect(**kwargs): ta = ExecuteTaFuncWithQueue(backend=BackendMock(), seed=1, stats=self.stats, memory_limit=3072, + multi_objectives=["cost"], metric=accuracy, cost_for_crash=get_cost_of_crash(accuracy), abort_on_first_run_crash=False, @@ -302,7 +310,7 @@ def side_effect(**kwargs): self.assertIsInstance(info[1].time, float) self.assertNotIn('exitcode', info[1].additional_info) - @unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_function') + @unittest.mock.patch('autoPyTorch.evaluation.tae.eval_train_function') def test_eval_with_limits_holdout_2(self, eval_houldout_mock): config = unittest.mock.Mock() config.config_id = 198 @@ -316,6 +324,7 @@ def side_effect(*args, **kwargs): ta = ExecuteTaFuncWithQueue(backend=BackendMock(), seed=1, stats=self.stats, memory_limit=3072, + multi_objectives=["cost"], metric=accuracy, cost_for_crash=get_cost_of_crash(accuracy), abort_on_first_run_crash=False, @@ -331,7 +340,7 @@ def side_effect(*args, **kwargs): self.assertIn('configuration_origin', info[1].additional_info) self.assertEqual(info[1].additional_info['message'], "{'subsample': 30}") - @unittest.mock.patch('autoPyTorch.evaluation.train_evaluator.eval_function') + @unittest.mock.patch('autoPyTorch.evaluation.tae.eval_train_function') def test_exception_in_target_function(self, eval_holdout_mock): config = unittest.mock.Mock() config.config_id = 198 @@ -340,6 +349,7 @@ def test_exception_in_target_function(self, eval_holdout_mock): ta = ExecuteTaFuncWithQueue(backend=BackendMock(), seed=1, stats=self.stats, memory_limit=3072, + multi_objectives=["cost"], metric=accuracy, cost_for_crash=get_cost_of_crash(accuracy), abort_on_first_run_crash=False, @@ -363,6 +373,7 @@ def test_silent_exception_in_target_function(self): ta = ExecuteTaFuncWithQueue(backend=BackendMock(), seed=1, stats=self.stats, memory_limit=3072, + multi_objectives=["cost"], metric=accuracy, cost_for_crash=get_cost_of_crash(accuracy), abort_on_first_run_crash=False, @@ -394,6 +405,33 @@ def test_silent_exception_in_target_function(self): self.assertNotIn('exit_status', info[1].additional_info) self.assertNotIn('traceback', info[1]) + def test_eval_with_simple_intensification(self): + config = unittest.mock.Mock(spec=int) + config.config_id = 198 + + ta = ExecuteTaFuncWithQueue(backend=BackendMock(), seed=1, + stats=self.stats, + memory_limit=3072, + multi_objectives=["cost"], + metric=accuracy, + cost_for_crash=get_cost_of_crash(accuracy), + abort_on_first_run_crash=False, + logger_port=self.logger_port, + pynisher_context='fork', + budget_type='runtime' + ) + ta.pynisher_logger = unittest.mock.Mock() + run_info = RunInfo(config=config, cutoff=3000, instance=None, + instance_specific=None, seed=1, capped=False) + + for budget in [0.0, 50.0]: + # Simple intensification always returns budget = 0 + # Other intensifications return a non-zero value + self.stats.submitted_ta_runs += 1 + run_info = run_info._replace(budget=budget) + run_info_out, _ = ta.run_wrapper(run_info) + self.assertEqual(run_info_out.budget, budget) + @pytest.mark.parametrize("metric,expected", [(accuracy, 1.0), (log_loss, MAXINT)]) def test_get_cost_of_crash(metric, expected): diff --git a/test/test_evaluation/test_train_evaluator.py b/test/test_evaluation/test_evaluators.py similarity index 65% rename from test/test_evaluation/test_train_evaluator.py rename to test/test_evaluation/test_evaluators.py index a3ff067f1..2ca32af10 100644 --- 
a/test/test_evaluation/test_train_evaluator.py +++ b/test/test_evaluation/test_evaluators.py @@ -15,7 +15,8 @@ from smac.tae import StatusType from autoPyTorch.automl_common.common.utils.backend import create -from autoPyTorch.datasets.resampling_strategy import CrossValTypes +from autoPyTorch.datasets.resampling_strategy import CrossValTypes, NoResamplingStrategyTypes +from autoPyTorch.evaluation.test_evaluator import TestEvaluator from autoPyTorch.evaluation.train_evaluator import TrainEvaluator from autoPyTorch.evaluation.utils import read_queue from autoPyTorch.pipeline.base_pipeline import BasePipeline @@ -294,3 +295,155 @@ def test_additional_metrics_during_training(self, pipeline_mock): self.assertIn('additional_run_info', result) self.assertIn('opt_loss', result['additional_run_info']) self.assertGreater(len(result['additional_run_info']['opt_loss'].keys()), 1) + + +class TestTestEvaluator(BaseEvaluatorTest, unittest.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + """ + Creates a backend mock + """ + tmp_dir_name = self.id() + self.ev_path = os.path.join(this_directory, '.tmp_evaluations', tmp_dir_name) + if os.path.exists(self.ev_path): + shutil.rmtree(self.ev_path) + os.makedirs(self.ev_path, exist_ok=False) + dummy_model_files = [os.path.join(self.ev_path, str(n)) for n in range(100)] + dummy_pred_files = [os.path.join(self.ev_path, str(n)) for n in range(100, 200)] + dummy_cv_model_files = [os.path.join(self.ev_path, str(n)) for n in range(200, 300)] + backend_mock = unittest.mock.Mock() + backend_mock.get_model_dir.return_value = self.ev_path + backend_mock.get_cv_model_dir.return_value = self.ev_path + backend_mock.get_model_path.side_effect = dummy_model_files + backend_mock.get_cv_model_path.side_effect = dummy_cv_model_files + backend_mock.get_prediction_output_path.side_effect = dummy_pred_files + backend_mock.temporary_directory = self.ev_path + self.backend_mock = backend_mock + + self.tmp_dir = os.path.join(self.ev_path, 'tmp_dir') + self.output_dir = os.path.join(self.ev_path, 'out_dir') + + def tearDown(self): + if os.path.exists(self.ev_path): + shutil.rmtree(self.ev_path) + + @unittest.mock.patch('autoPyTorch.pipeline.tabular_classification.TabularClassificationPipeline') + def test_no_resampling(self, pipeline_mock): + # Binary iris, contains 69 train samples, 31 test samples + D = get_binary_classification_datamanager(NoResamplingStrategyTypes.no_resampling) + pipeline_mock.predict_proba.side_effect = \ + lambda X, batch_size=None: np.tile([0.6, 0.4], (len(X), 1)) + pipeline_mock.side_effect = lambda **kwargs: pipeline_mock + pipeline_mock.get_additional_run_info.return_value = None + pipeline_mock.get_default_pipeline_options.return_value = {'budget_type': 'epochs', 'epochs': 10} + + configuration = unittest.mock.Mock(spec=Configuration) + backend_api = create(self.tmp_dir, self.output_dir, 'autoPyTorch') + backend_api.load_datamanager = lambda: D + queue_ = multiprocessing.Queue() + + evaluator = TestEvaluator(backend_api, queue_, configuration=configuration, metric=accuracy, budget=0) + evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) + evaluator.file_output.return_value = (None, {}) + + evaluator.fit_predict_and_loss() + + rval = read_queue(evaluator.queue) + self.assertEqual(len(rval), 1) + result = rval[0]['loss'] + self.assertEqual(len(rval[0]), 3) + self.assertRaises(queue.Empty, evaluator.queue.get, timeout=1) + + self.assertEqual(evaluator.file_output.call_count, 1) + self.assertEqual(result, 0.5806451612903225) 
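# A quick sanity check of the expected loss just above: the mocked predict_proba
# always returns [0.6, 0.4], i.e. class 0 for every sample, and the comment at the
# top of the test notes that the no-resampling split leaves 31 test samples.
# A loss of 0.5806451612903225 is 18/31 (up to floating-point rounding), which
# corresponds to an accuracy of 13/31 -- presumably 13 of the 31 test samples
# belong to class 0:
#
#   1 - 13 / 31   # roughly 0.58065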
+ self.assertEqual(pipeline_mock.fit.call_count, 1) + # 2 calls because of train and test set + self.assertEqual(pipeline_mock.predict_proba.call_count, 2) + self.assertEqual(evaluator.file_output.call_count, 1) + # Should be none as no val preds are mentioned + self.assertIsNone(evaluator.file_output.call_args[0][1]) + # Number of y_test_preds and Y_test should be the same + self.assertEqual(evaluator.file_output.call_args[0][0].shape[0], + D.test_tensors[1].shape[0]) + self.assertEqual(evaluator.pipeline.fit.call_count, 1) + + @unittest.mock.patch.object(TestEvaluator, '_loss') + def test_file_output(self, loss_mock): + + D = get_regression_datamanager(NoResamplingStrategyTypes.no_resampling) + D.name = 'test' + self.backend_mock.load_datamanager.return_value = D + configuration = unittest.mock.Mock(spec=Configuration) + queue_ = multiprocessing.Queue() + loss_mock.return_value = None + + evaluator = TestEvaluator(self.backend_mock, queue_, configuration=configuration, metric=accuracy, budget=0) + + self.backend_mock.get_model_dir.return_value = True + evaluator.pipeline = 'model' + evaluator.Y_optimization = D.train_tensors[1] + rval = evaluator.file_output( + D.train_tensors[1], + None, + D.test_tensors[1], + ) + + self.assertEqual(rval, (None, {})) + # These targets are not saved as Fit evaluator is not used to make an ensemble + self.assertEqual(self.backend_mock.save_targets_ensemble.call_count, 0) + self.assertEqual(self.backend_mock.save_numrun_to_dir.call_count, 1) + self.assertEqual(self.backend_mock.save_numrun_to_dir.call_args_list[-1][1].keys(), + {'seed', 'idx', 'budget', 'model', 'cv_model', + 'ensemble_predictions', 'valid_predictions', 'test_predictions'}) + self.assertIsNotNone(self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]['model']) + self.assertIsNone(self.backend_mock.save_numrun_to_dir.call_args_list[-1][1]['cv_model']) + + # Check for not containing NaNs - that the models don't predict nonsense + # for unseen data + D.test_tensors[1][0] = np.NaN + rval = evaluator.file_output( + D.train_tensors[1], + None, + D.test_tensors[1], + ) + self.assertEqual( + rval, + ( + 1.0, + { + 'error': + 'Model predictions for test set contains NaNs.' 
+ }, + ) + ) + + @unittest.mock.patch('autoPyTorch.pipeline.tabular_classification.TabularClassificationPipeline') + def test_predict_proba_binary_classification(self, mock): + D = get_binary_classification_datamanager(NoResamplingStrategyTypes.no_resampling) + self.backend_mock.load_datamanager.return_value = D + mock.predict_proba.side_effect = lambda y, batch_size=None: np.array( + [[0.1, 0.9]] * y.shape[0] + ) + mock.side_effect = lambda **kwargs: mock + mock.get_default_pipeline_options.return_value = {'budget_type': 'epochs', 'epochs': 10} + configuration = unittest.mock.Mock(spec=Configuration) + queue_ = multiprocessing.Queue() + + evaluator = TestEvaluator(self.backend_mock, queue_, configuration=configuration, metric=accuracy, budget=0) + + evaluator.fit_predict_and_loss() + Y_test_pred = self.backend_mock.save_numrun_to_dir.call_args_list[0][-1][ + 'ensemble_predictions'] + + for i in range(7): + self.assertEqual(0.9, Y_test_pred[i][1]) + + def test_get_results(self): + queue_ = multiprocessing.Queue() + for i in range(5): + queue_.put((i * 1, 1 - (i * 0.2), 0, "", StatusType.SUCCESS)) + result = read_queue(queue_) + self.assertEqual(len(result), 5) + self.assertEqual(result[0][0], 0) + self.assertAlmostEqual(result[0][1], 1.0) diff --git a/test/test_evaluation/test_utils.py b/test/test_evaluation/test_utils.py new file mode 100644 index 000000000..e81eea38b --- /dev/null +++ b/test/test_evaluation/test_utils.py @@ -0,0 +1,35 @@ +""" +Tests the functionality in autoPyTorch.evaluation.utils +""" +import pytest + +from autoPyTorch.evaluation.utils import DisableFileOutputParameters + + +@pytest.mark.parametrize('disable_file_output', + [['pipeline', 'pipelines'], + [DisableFileOutputParameters.pipelines, DisableFileOutputParameters.pipeline]]) +def test_disable_file_output_no_error(disable_file_output): + """ + Checks that `DisableFileOutputParameters.check_compatibility` + does not raise an error for the parameterized values of `disable_file_output`. + + Args: + disable_file_output ([List[Union[str, DisableFileOutputParameters]]]): + Options that should be compatible with the `DisableFileOutputParameters` + defined in `autoPyTorch`. + """ + DisableFileOutputParameters.check_compatibility(disable_file_output=disable_file_output) + + +def test_disable_file_output_error(): + """ + Checks that `DisableFileOutputParameters.check_compatibility` raises an error + for a value not present in `DisableFileOutputParameters` and ensures that the + expected error is raised. + """ + disable_file_output = ['model'] + with pytest.raises(ValueError, match=r"Expected .*? to be in the members (.*?) 
of" + r" DisableFileOutputParameters or as string value" + r" of a member."): + DisableFileOutputParameters.check_compatibility(disable_file_output=disable_file_output) diff --git a/test/test_pipeline/components/preprocessing/base.py b/test/test_pipeline/components/preprocessing/base.py index ac16e286a..a2705e19b 100644 --- a/test/test_pipeline/components/preprocessing/base.py +++ b/test/test_pipeline/components/preprocessing/base.py @@ -3,9 +3,12 @@ from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import \ TabularColumnTransformer +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import CoalescerChoice from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import EncoderChoice from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. \ + VarianceThreshold import VarianceThreshold from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline @@ -28,6 +31,8 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]], steps.extend([ ("imputer", SimpleImputer()), + ("variance_threshold", VarianceThreshold()), + ("coalescer", CoalescerChoice(default_dataset_properties)), ("encoder", EncoderChoice(default_dataset_properties)), ("scaler", ScalerChoice(default_dataset_properties)), ("tabular_transformer", TabularColumnTransformer()), diff --git a/test/test_pipeline/components/preprocessing/test_coalescer.py b/test/test_pipeline/components/preprocessing/test_coalescer.py new file mode 100644 index 000000000..811cf8b6e --- /dev/null +++ b/test/test_pipeline/components/preprocessing/test_coalescer.py @@ -0,0 +1,86 @@ +import copy +import unittest + +import numpy as np + +import pytest + +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import ( + CoalescerChoice +) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer.MinorityCoalescer import ( + MinorityCoalescer +) + + +def test_transform_before_fit(): + with pytest.raises(RuntimeError): + mc = MinorityCoalescer(min_frac=None, random_state=np.random.RandomState()) + mc.transform(np.random.random((4, 4))) + + +class TestCoalescerChoice(unittest.TestCase): + def test_raise_error_in_check_update_compatiblity(self): + dataset_properties = {'numerical_columns': [], 'categorical_columns': []} + cc = CoalescerChoice(dataset_properties) + choices = ["NoCoescer"] # component name with typo + with pytest.raises(ValueError): + # raise error because no categorical columns, but choices do not have no coalescer + cc._check_update_compatiblity(choices_in_update=choices, dataset_properties=dataset_properties) + + def test_raise_error_in_get_component_without_updates(self): + dataset_properties = {'numerical_columns': [], 'categorical_columns': []} + cc = CoalescerChoice(dataset_properties) + with pytest.raises(ValueError): + # raise error because no categorical columns, but choices do not have no coalescer + cc._get_component_without_updates( + avail_components={}, + dataset_properties=dataset_properties, + default="", + include=[] + ) + + def test_get_set_config_space(self): + """Make sure that we can setup a valid choice 
in the Coalescer + choice""" + dataset_properties = {'numerical_columns': list(range(4)), 'categorical_columns': [5]} + coalescer_choice = CoalescerChoice(dataset_properties) + cs = coalescer_choice.get_hyperparameter_search_space() + + # Make sure that all hyperparameters are part of the search space + self.assertListEqual( + sorted(cs.get_hyperparameter('__choice__').choices), + sorted(list(coalescer_choice.get_components().keys())) + ) + + # Make sure we can properly set some random configs + # Whereas just one iteration will make sure the algorithm works, + # doing five iterations increase the confidence. We will be able to + # catch component specific crashes + for _ in range(5): + config = cs.sample_configuration() + config_dict = copy.deepcopy(config.get_dictionary()) + coalescer_choice.set_hyperparameters(config) + + self.assertEqual(coalescer_choice.choice.__class__, + coalescer_choice.get_components()[config_dict['__choice__']]) + + # Then check the choice configuration + selected_choice = config_dict.pop('__choice__', None) + for key, value in config_dict.items(): + # Remove the selected_choice string from the parameter + # so we can query in the object for it + key = key.replace(selected_choice + ':', '') + self.assertIn(key, vars(coalescer_choice.choice)) + self.assertEqual(value, coalescer_choice.choice.__dict__[key]) + + def test_only_numerical(self): + dataset_properties = {'numerical_columns': list(range(4)), 'categorical_columns': []} + + chooser = CoalescerChoice(dataset_properties) + configspace = chooser.get_hyperparameter_search_space().sample_configuration().get_dictionary() + self.assertEqual(configspace['__choice__'], 'NoCoalescer') + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_pipeline/components/preprocessing/test_feature_preprocessor.py b/test/test_pipeline/components/preprocessing/test_feature_preprocessor.py index 99fad6b1f..dd0a08d26 100644 --- a/test/test_pipeline/components/preprocessing/test_feature_preprocessor.py +++ b/test/test_pipeline/components/preprocessing/test_feature_preprocessor.py @@ -7,6 +7,7 @@ from sklearn.base import BaseEstimator from sklearn.compose import make_column_transformer +from autoPyTorch.constants import CLASSIFICATION_TASKS, REGRESSION_TASKS, STRING_TO_TASK_TYPES from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing import ( FeatureProprocessorChoice ) @@ -20,24 +21,49 @@ def random_state(): return 11 -@pytest.fixture(params=['TruncatedSVD', 'PolynomialFeatures', 'PowerTransformer', - 'Nystroem', 'KernelPCA', 'RandomKitchenSinks']) +@pytest.fixture(params=['NoFeaturePreprocessor', + 'FastICA', + 'KernelPCA', + 'RandomKitchenSinks', + 'Nystroem', + 'PolynomialFeatures', + 'TruncatedSVD', + 'ExtraTreesPreprocessorClassification', + 'ExtraTreesPreprocessorRegression', + 'FeatureAgglomeration', + 'RandomTreesEmbedding', + 'SelectPercentileClassification', + 'SelectPercentileRegression', + 'SelectRatesClassification', + 'SelectRatesRegression', + 'LibLinearSVCPreprocessor' + ]) def preprocessor(request): return request.param @pytest.mark.parametrize("fit_dictionary_tabular", ['classification_numerical_only', - 'classification_numerical_and_categorical'], indirect=True) + 'classification_numerical_and_categorical', + 'regression_numerical_only'], indirect=True) class TestFeaturePreprocessors: def test_feature_preprocessor(self, fit_dictionary_tabular, preprocessor, random_state): + task_type = str(fit_dictionary_tabular['dataset_properties']['task_type']) + if ( + 
("Classification" in preprocessor or preprocessor == "LibLinearSVCPreprocessor") + and STRING_TO_TASK_TYPES[task_type] not in CLASSIFICATION_TASKS + ): + pytest.skip("Tests not relevant for {}".format(preprocessor.__class__.__name__)) + elif "Regression" in preprocessor and STRING_TO_TASK_TYPES[task_type] not in REGRESSION_TASKS: + pytest.skip("Tests not relevant for {}".format(preprocessor.__class__.__name__)) preprocessor = FeatureProprocessorChoice( dataset_properties=fit_dictionary_tabular['dataset_properties'] - ).get_components()[preprocessor](random_state=random_state) + ).get_components()[preprocessor] + configuration = preprocessor. \ get_hyperparameter_search_space(dataset_properties=fit_dictionary_tabular["dataset_properties"]) \ .get_default_configuration().get_dictionary() - preprocessor = preprocessor.set_params(**configuration) + preprocessor = preprocessor(**configuration, random_state=random_state) preprocessor.fit(fit_dictionary_tabular) X = preprocessor.transform(fit_dictionary_tabular) sklearn_preprocessor = X['feature_preprocessor']['numerical'] @@ -54,7 +80,7 @@ def test_feature_preprocessor(self, fit_dictionary_tabular, preprocessor, random column_transformer = make_column_transformer((sklearn_preprocessor, X['dataset_properties']['numerical_columns']), remainder='passthrough') - column_transformer.fit(X['X_train']) + column_transformer.fit(X['X_train'], X['y_train']) transformed = column_transformer.transform(X['X_train']) assert isinstance(transformed, np.ndarray) @@ -67,6 +93,14 @@ def test_pipeline_fit_include(self, fit_dictionary_tabular, preprocessor): in the include """ + task_type = str(fit_dictionary_tabular['dataset_properties']['task_type']) + if ( + ("Classification" in preprocessor or preprocessor == "LibLinearSVCPreprocessor") + and STRING_TO_TASK_TYPES[task_type] not in CLASSIFICATION_TASKS + ): + pytest.skip("Tests not relevant for {}".format(preprocessor.__class__.__name__)) + elif "Regression" in preprocessor and STRING_TO_TASK_TYPES[task_type] not in REGRESSION_TASKS: + pytest.skip("Tests not relevant for {}".format(preprocessor.__class__.__name__)) fit_dictionary_tabular['epochs'] = 1 pipeline = TabularClassificationPipeline( @@ -78,6 +112,11 @@ def test_pipeline_fit_include(self, fit_dictionary_tabular, preprocessor): try: pipeline.fit(fit_dictionary_tabular) except Exception as e: + if ( + ("must be non-negative" or "contains negative values") in e.args[0] + and not fit_dictionary_tabular['dataset_properties']['issigned'] + ): + pytest.skip("Failure because scaler made data nonnegative.") pytest.fail(f"For config {config} failed with {e}") # To make sure we fitted the model, there should be a diff --git a/test/test_pipeline/components/preprocessing/test_feature_preprocessor_choice.py b/test/test_pipeline/components/preprocessing/test_feature_preprocessor_choice.py index 57841aef0..d159b70e5 100644 --- a/test/test_pipeline/components/preprocessing/test_feature_preprocessor_choice.py +++ b/test/test_pipeline/components/preprocessing/test_feature_preprocessor_choice.py @@ -10,14 +10,17 @@ class TestFeaturePreprocessorChoice(unittest.TestCase): def test_get_set_config_space(self): """Make sure that we can setup a valid choice in the feature preprocessor choice""" - dataset_properties = {'numerical_columns': list(range(4)), 'categorical_columns': [5]} + dataset_properties = {'numerical_columns': list(range(4)), + 'categorical_columns': [5], + 'task_type': 'tabular_classification'} feature_preprocessor_choice = 
FeatureProprocessorChoice(dataset_properties) cs = feature_preprocessor_choice.get_hyperparameter_search_space() # Make sure that all hyperparameters are part of the search space self.assertListEqual( sorted(cs.get_hyperparameter('__choice__').choices), - sorted(list(feature_preprocessor_choice.get_components().keys())) + sorted(list(feature_preprocessor_choice.get_available_components( + dataset_properties=dataset_properties).keys())) ) # Make sure we can properly set some random configs @@ -39,10 +42,16 @@ def test_get_set_config_space(self): # so we can query in the object for it key = key.replace(selected_choice + ':', '') self.assertIn(key, vars(feature_preprocessor_choice.choice)) + # for score function in some feature preprocessors + # this will fail + if 'score_func' or 'pooling_func' in key: + continue self.assertEqual(value, feature_preprocessor_choice.choice.__dict__[key]) def test_only_categorical(self): - dataset_properties = {'numerical_columns': [], 'categorical_columns': list(range(4))} + dataset_properties = {'numerical_columns': [], + 'categorical_columns': [5], + 'task_type': 'tabular_classification'} chooser = FeatureProprocessorChoice(dataset_properties) configspace = chooser.get_hyperparameter_search_space().sample_configuration().get_dictionary() diff --git a/test/test_pipeline/components/preprocessing/test_imputers.py b/test/test_pipeline/components/preprocessing/test_imputers.py index 983737dfe..0db460b77 100644 --- a/test/test_pipeline/components/preprocessing/test_imputers.py +++ b/test/test_pipeline/components/preprocessing/test_imputers.py @@ -3,6 +3,8 @@ import numpy as np from numpy.testing import assert_array_equal +import pytest + from sklearn.base import BaseEstimator, clone from sklearn.compose import make_column_transformer @@ -37,14 +39,14 @@ def test_get_config_space(self): self.assertEqual(param1, param2) def test_mean_imputation(self): - data = np.array([['1.0', np.nan, 3], + data = np.array([[1.0, np.nan, 3], [np.nan, 8, 9], - ['4.0', 5, np.nan], + [4.0, 5, np.nan], [np.nan, 2, 3], - ['7.0', np.nan, 9], - ['4.0', np.nan, np.nan]], dtype=object) - numerical_columns = [1, 2] - categorical_columns = [0] + [7.0, np.nan, 9], + [4.0, np.nan, np.nan]]) + numerical_columns = [0, 1, 2] + categorical_columns = [] train_indices = np.array([0, 2, 3]) test_indices = np.array([1, 4, 5]) dataset_properties = { @@ -64,33 +66,33 @@ def test_mean_imputation(self): # check if the fit dictionary X is modified as expected self.assertIsInstance(X['imputer'], dict) - self.assertIsInstance(categorical_imputer, BaseEstimator) + self.assertIsNone(categorical_imputer) self.assertIsInstance(numerical_imputer, BaseEstimator) # make column transformer with returned encoder to fit on data - column_transformer = make_column_transformer((categorical_imputer, - X['dataset_properties']['categorical_columns']), - (numerical_imputer, + column_transformer = make_column_transformer((numerical_imputer, X['dataset_properties']['numerical_columns']), remainder='passthrough') column_transformer = column_transformer.fit(X['X_train']) transformed = column_transformer.transform(data[test_indices]) - assert_array_equal(transformed.astype(str), np.array([[1.0, 8.0, 9.0], - [7.0, 3.5, 9.0], - [4.0, 3.5, 3.0]], dtype=str)) + assert_array_equal(transformed, np.array([[2.5, 8, 9], + [7, 3.5, 9], + [4, 3.5, 3]])) def test_median_imputation(self): - data = np.array([['1.0', np.nan, 3], - [np.nan, 8, 9], - ['4.0', 5, np.nan], - [np.nan, 2, 3], - ['7.0', np.nan, 9], - ['4.0', np.nan, np.nan]], 
dtype=object) - numerical_columns = [1, 2] - categorical_columns = [0] - train_indices = np.array([0, 2, 3]) - test_indices = np.array([1, 4, 5]) + data = np.array([[1.0, np.nan, 7], + [np.nan, 9, 10], + [10.0, 7, 7], + [9.0, np.nan, 11], + [9.0, 9, np.nan], + [np.nan, 5, 6], + [12.0, np.nan, 8], + [9.0, np.nan, np.nan]]) + numerical_columns = [0, 1, 2] + categorical_columns = [] + train_indices = np.array([0, 2, 3, 4, 7]) + test_indices = np.array([1, 5, 6]) dataset_properties = { 'categorical_columns': categorical_columns, 'numerical_columns': numerical_columns, @@ -108,33 +110,33 @@ def test_median_imputation(self): # check if the fit dictionary X is modified as expected self.assertIsInstance(X['imputer'], dict) - self.assertIsInstance(categorical_imputer, BaseEstimator) + self.assertIsNone(categorical_imputer) self.assertIsInstance(numerical_imputer, BaseEstimator) # make column transformer with returned encoder to fit on data - column_transformer = make_column_transformer( - (categorical_imputer, X['dataset_properties']['categorical_columns']), - (numerical_imputer, X['dataset_properties']['numerical_columns']), - remainder='passthrough' - ) + column_transformer = make_column_transformer((numerical_imputer, + X['dataset_properties']['numerical_columns']), + remainder='passthrough') column_transformer = column_transformer.fit(X['X_train']) transformed = column_transformer.transform(data[test_indices]) - assert_array_equal(transformed.astype(str), np.array([[1.0, 8.0, 9.0], - [7.0, 3.5, 9.0], - [4.0, 3.5, 3.0]], dtype=str)) + assert_array_equal(transformed, np.array([[9, 9, 10], + [9, 5, 6], + [12, 8, 8]])) def test_frequent_imputation(self): - data = np.array([['1.0', np.nan, 3], - [np.nan, 8, 9], - ['4.0', 5, np.nan], - [np.nan, 2, 3], - ['7.0', np.nan, 9], - ['4.0', np.nan, np.nan]], dtype=object) - numerical_columns = [1, 2] - categorical_columns = [0] - train_indices = np.array([0, 2, 3]) - test_indices = np.array([1, 4, 5]) + data = np.array([[1.0, np.nan, 7], + [np.nan, 9, 10], + [10.0, 7, 7], + [9.0, np.nan, 11], + [9.0, 9, np.nan], + [np.nan, 5, 6], + [12.0, np.nan, 8], + [9.0, np.nan, np.nan]]) + numerical_columns = [0, 1, 2] + categorical_columns = [] + train_indices = np.array([0, 2, 4, 5, 7]) + test_indices = np.array([1, 3, 6]) dataset_properties = { 'categorical_columns': categorical_columns, 'numerical_columns': numerical_columns, @@ -143,8 +145,7 @@ def test_frequent_imputation(self): 'X_train': data[train_indices], 'dataset_properties': dataset_properties } - imputer_component = SimpleImputer(numerical_strategy='most_frequent', - categorical_strategy='most_frequent') + imputer_component = SimpleImputer(numerical_strategy='most_frequent') imputer_component = imputer_component.fit(X) X = imputer_component.transform(X) @@ -153,31 +154,29 @@ def test_frequent_imputation(self): # check if the fit dictionary X is modified as expected self.assertIsInstance(X['imputer'], dict) - self.assertIsInstance(categorical_imputer, BaseEstimator) + self.assertIsNone(categorical_imputer) self.assertIsInstance(numerical_imputer, BaseEstimator) # make column transformer with returned encoder to fit on data - column_transformer = make_column_transformer( - (categorical_imputer, X['dataset_properties']['categorical_columns']), - (numerical_imputer, X['dataset_properties']['numerical_columns']), - remainder='passthrough' - ) + column_transformer = make_column_transformer((numerical_imputer, + X['dataset_properties']['numerical_columns']), + remainder='passthrough') column_transformer = 
column_transformer.fit(X['X_train']) transformed = column_transformer.transform(data[test_indices]) - assert_array_equal(transformed.astype(str), np.array([[1.0, 8, 9], - [7.0, 2, 9], - [4.0, 2, 3]], dtype=str)) + assert_array_equal(transformed, np.array([[9, 9, 10], + [9, 5, 11], + [12, 5, 8]])) def test_constant_imputation(self): - data = np.array([['1.0', np.nan, 3], + data = np.array([[1.0, np.nan, 3], [np.nan, 8, 9], - ['4.0', 5, np.nan], + [4.0, 5, np.nan], [np.nan, 2, 3], - ['7.0', np.nan, 9], - ['4.0', np.nan, np.nan]], dtype=object) - numerical_columns = [1, 2] - categorical_columns = [0] + [7.0, np.nan, 9], + [4.0, np.nan, np.nan]]) + numerical_columns = [0, 1, 2] + categorical_columns = [] train_indices = np.array([0, 2, 3]) test_indices = np.array([1, 4, 5]) dataset_properties = { @@ -188,8 +187,7 @@ def test_constant_imputation(self): 'X_train': data[train_indices], 'dataset_properties': dataset_properties } - imputer_component = SimpleImputer(numerical_strategy='constant_zero', - categorical_strategy='constant_!missing!') + imputer_component = SimpleImputer(numerical_strategy='constant_zero') imputer_component = imputer_component.fit(X) X = imputer_component.transform(X) @@ -198,20 +196,28 @@ def test_constant_imputation(self): # check if the fit dictionary X is modified as expected self.assertIsInstance(X['imputer'], dict) - self.assertIsInstance(categorical_imputer, BaseEstimator) + self.assertIsNone(categorical_imputer) self.assertIsInstance(numerical_imputer, BaseEstimator) # make column transformer with returned encoder to fit on data - column_transformer = make_column_transformer( - (categorical_imputer, X['dataset_properties']['categorical_columns']), - (numerical_imputer, X['dataset_properties']['numerical_columns']), - remainder='passthrough' - ) + column_transformer = make_column_transformer((numerical_imputer, + X['dataset_properties']['numerical_columns']), + remainder='passthrough') column_transformer = column_transformer.fit(X['X_train']) transformed = column_transformer.transform(data[test_indices]) - assert_array_equal(transformed.astype(str), np.array([['-1', 8, 9], - [7.0, '0', 9], - [4.0, '0', '0']], dtype=str)) + assert_array_equal(transformed, np.array([[0, 8, 9], + [7, 0, 9], + [4, 0, 0]])) + + def test_imputation_without_dataset_properties_raises_error(self): + """Tests SimpleImputer checks for dataset properties when querying for + HyperparameterSearchSpace, even though the arg is marked `Optional`. 
+ + Expects: + * Should raise a ValueError that no dataset_properties were passed + """ + with pytest.raises(ValueError): + SimpleImputer.get_hyperparameter_search_space() if __name__ == '__main__': diff --git a/test/test_pipeline/components/preprocessing/test_scalers.py b/test/test_pipeline/components/preprocessing/test_scalers.py index 94ba0f2dc..7cbc12b07 100644 --- a/test/test_pipeline/components/preprocessing/test_scalers.py +++ b/test/test_pipeline/components/preprocessing/test_scalers.py @@ -9,6 +9,11 @@ from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.MinMaxScaler import MinMaxScaler from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.NoScaler import NoScaler from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.Normalizer import Normalizer +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.PowerTransformer import \ + PowerTransformer +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.QuantileTransformer import \ + QuantileTransformer +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.RobustScaler import RobustScaler from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.StandardScaler import StandardScaler @@ -239,3 +244,163 @@ def test_none_scaler(self): self.assertIsInstance(X['scaler'], dict) self.assertIsNone(X['scaler']['categorical']) self.assertIsNone(X['scaler']['numerical']) + + +def test_power_transformer(): + data = np.array([[1, 2, 3], + [7, 8, 9], + [4, 5, 6], + [11, 12, 13], + [17, 18, 19], + [14, 15, 16]]) + train_indices = np.array([0, 2, 5]) + test_indices = np.array([1, 4, 3]) + categorical_columns = list() + numerical_columns = [0, 1, 2] + dataset_properties = {'categorical_columns': categorical_columns, + 'numerical_columns': numerical_columns, + 'issparse': False} + X = { + 'X_train': data[train_indices], + 'dataset_properties': dataset_properties + } + scaler_component = PowerTransformer() + + scaler_component = scaler_component.fit(X) + X = scaler_component.transform(X) + scaler = X['scaler']['numerical'] + + # check if the fit dictionary X is modified as expected + assert isinstance(X['scaler'], dict) + assert isinstance(scaler, BaseEstimator) + assert X['scaler']['categorical'] is None + + # make column transformer with returned encoder to fit on data + column_transformer = make_column_transformer((scaler, X['dataset_properties']['numerical_columns']), + remainder='passthrough') + column_transformer = column_transformer.fit(X['X_train']) + transformed = column_transformer.transform(data[test_indices]) + + assert_allclose(transformed, np.array([[0.531648, 0.522782, 0.515394], + [1.435794, 1.451064, 1.461685], + [0.993609, 1.001055, 1.005734]]), rtol=1e-06) + + +def test_robust_scaler(): + data = np.array([[1, 2, 3], + [7, 8, 9], + [4, 5, 6], + [11, 12, 13], + [17, 18, 19], + [14, 15, 16]]) + train_indices = np.array([0, 2, 5]) + test_indices = np.array([1, 4, 3]) + categorical_columns = list() + numerical_columns = [0, 1, 2] + dataset_properties = {'categorical_columns': categorical_columns, + 'numerical_columns': numerical_columns, + 'issparse': False} + X = { + 'X_train': data[train_indices], + 'dataset_properties': dataset_properties + } + scaler_component = RobustScaler() + + scaler_component = scaler_component.fit(X) + X = scaler_component.transform(X) + scaler = X['scaler']['numerical'] + + # check if the fit dictionary X is modified as expected + 
assert isinstance(X['scaler'], dict) + assert isinstance(scaler, BaseEstimator) + assert X['scaler']['categorical'] is None + + # make column transformer with returned encoder to fit on data + column_transformer = make_column_transformer((scaler, X['dataset_properties']['numerical_columns']), + remainder='passthrough') + column_transformer = column_transformer.fit(X['X_train']) + transformed = column_transformer.transform(data[test_indices]) + + assert_allclose(transformed, np.array([[100, 100, 100], + [433.33333333, 433.33333333, 433.33333333], + [233.33333333, 233.33333333, 233.33333333]])) + + +class TestQuantileTransformer(): + def test_quantile_transformer_uniform(self): + data = np.array([[1, 2, 3], + [7, 8, 9], + [4, 5, 6], + [11, 12, 13], + [17, 18, 19], + [14, 15, 16]]) + train_indices = np.array([0, 2, 5]) + test_indices = np.array([1, 4, 3]) + categorical_columns = list() + numerical_columns = [0, 1, 2] + dataset_properties = {'categorical_columns': categorical_columns, + 'numerical_columns': numerical_columns, + 'issparse': False} + X = { + 'X_train': data[train_indices], + 'dataset_properties': dataset_properties + } + scaler_component = QuantileTransformer(output_distribution='uniform') + + scaler_component = scaler_component.fit(X) + X = scaler_component.transform(X) + scaler = X['scaler']['numerical'] + + # check if the fit dictionary X is modified as expected + assert isinstance(X['scaler'], dict) + assert isinstance(scaler, BaseEstimator) + assert X['scaler']['categorical'] is None + + # make column transformer with returned encoder to fit on data + column_transformer = make_column_transformer((scaler, X['dataset_properties']['numerical_columns']), + remainder='passthrough') + column_transformer = column_transformer.fit(X['X_train']) + transformed = column_transformer.transform(data[test_indices]) + + assert_allclose(transformed, np.array([[0.65, 0.65, 0.65], + [1, 1, 1], + [0.85, 0.85, 0.85]]), rtol=1e-06) + + def test_quantile_transformer_normal(self): + data = np.array([[1, 2, 3], + [7, 8, 9], + [4, 5, 6], + [11, 12, 13], + [17, 18, 19], + [14, 15, 16]]) + train_indices = np.array([0, 2, 5]) + test_indices = np.array([1, 4, 3]) + categorical_columns = list() + numerical_columns = [0, 1, 2] + dataset_properties = {'categorical_columns': categorical_columns, + 'numerical_columns': numerical_columns, + 'issparse': False} + X = { + 'X_train': data[train_indices], + 'dataset_properties': dataset_properties + } + scaler_component = QuantileTransformer(output_distribution='normal') + + scaler_component = scaler_component.fit(X) + X = scaler_component.transform(X) + scaler = X['scaler']['numerical'] + + # check if the fit dictionary X is modified as expected + assert isinstance(X['scaler'], dict) + assert isinstance(scaler, BaseEstimator) + assert X['scaler']['categorical'] is None + + # make column transformer with returned encoder to fit on data + column_transformer = make_column_transformer((scaler, X['dataset_properties']['numerical_columns']), + remainder='passthrough') + column_transformer = column_transformer.fit(X['X_train']) + transformed = column_transformer.transform(data[test_indices]) + + assert_allclose(transformed, np.array([[0.38532, 0.38532, 0.38532], + [5.199338, 5.199338, 5.199338], + [1.036433, 1.036433, 1.036433]]), rtol=1e-05) diff --git a/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py b/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py index 66a96f27f..36de9f275 100644 --- 
a/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py +++ b/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py @@ -47,6 +47,7 @@ def test_sparse_data(self, fit_dictionary_tabular): X = np.random.binomial(1, 0.1, (100, 2000)) sparse_X = csr_matrix(X) + y = np.random.randint(0, 1, 100) numerical_columns = list(range(2000)) categorical_columns = [] train_indices = np.array(range(50)) @@ -56,6 +57,7 @@ def test_sparse_data(self, fit_dictionary_tabular): issparse=True) X = { 'X_train': sparse_X, + 'y_train': y, 'train_indices': train_indices, 'dataset_properties': dataset_properties } diff --git a/test/test_pipeline/components/preprocessing/test_variance_thresholding.py b/test/test_pipeline/components/preprocessing/test_variance_thresholding.py new file mode 100644 index 000000000..3f22835b3 --- /dev/null +++ b/test/test_pipeline/components/preprocessing/test_variance_thresholding.py @@ -0,0 +1,49 @@ +import numpy as np +from numpy.testing import assert_array_equal + + +from sklearn.base import BaseEstimator +from sklearn.compose import make_column_transformer + +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. \ + VarianceThreshold import VarianceThreshold + + +def test_variance_threshold(): + data = np.array([[1, 2, 1], + [7, 8, 9], + [4, 5, 1], + [11, 12, 1], + [17, 18, 19], + [14, 15, 16]]) + numerical_columns = [0, 1, 2] + train_indices = np.array([0, 2, 3]) + test_indices = np.array([1, 4, 5]) + dataset_properties = { + 'categorical_columns': [], + 'numerical_columns': numerical_columns, + } + X = { + 'X_train': data[train_indices], + 'dataset_properties': dataset_properties + } + component = VarianceThreshold() + + component = component.fit(X) + X = component.transform(X) + variance_threshold = X['variance_threshold']['numerical'] + + # check if the fit dictionary X is modified as expected + assert isinstance(X['variance_threshold'], dict) + assert isinstance(variance_threshold, BaseEstimator) + + # make column transformer with returned encoder to fit on data + column_transformer = make_column_transformer((variance_threshold, + X['dataset_properties']['numerical_columns']), + remainder='passthrough') + column_transformer = column_transformer.fit(X['X_train']) + transformed = column_transformer.transform(data[test_indices]) + + assert_array_equal(transformed, np.array([[7, 8], + [17, 18], + [14, 15]])) diff --git a/test/test_pipeline/components/setup/test_setup_preprocessing_node.py b/test/test_pipeline/components/setup/test_setup_preprocessing_node.py index 0fc0bb4c0..1ec858864 100644 --- a/test/test_pipeline/components/setup/test_setup_preprocessing_node.py +++ b/test/test_pipeline/components/setup/test_setup_preprocessing_node.py @@ -23,7 +23,7 @@ def setUp(self): dataset = mock.MagicMock() dataset.__len__.return_value = 1 datamanager = mock.MagicMock() - datamanager.get_dataset_for_training.return_value = (dataset, dataset) + datamanager.get_dataset.return_value = (dataset, dataset) datamanager.train_tensors = (np.random.random((10, 15)), np.random.random(10)) datamanager.test_tensors = None self.backend.load_datamanager.return_value = datamanager diff --git a/test/test_pipeline/components/training/test_training.py b/test/test_pipeline/components/training/test_training.py index 8ae2759db..6b277d36d 100644 --- a/test/test_pipeline/components/training/test_training.py +++ b/test/test_pipeline/components/training/test_training.py @@ -108,7 +108,7 @@ def test_fit_transform(self): dataset 
= unittest.mock.MagicMock() dataset.__len__.return_value = 1 datamanager = unittest.mock.MagicMock() - datamanager.get_dataset_for_training.return_value = (dataset, dataset) + datamanager.get_dataset.return_value = (dataset, dataset) fit_dictionary['backend'].load_datamanager.return_value = datamanager # Mock child classes requirements @@ -236,6 +236,43 @@ def test_train_step(self): lr = optimizer.param_groups[0]['lr'] assert lr == target_lr + def test_train_epoch_no_step(self): + """ + This test checks if max runtime is reached + for an epoch before any train_step has been + completed. In this case we would like to + return None for train_loss and an empty + dictionary for the metrics. + """ + device = torch.device('cpu') + model = torch.nn.Linear(1, 1).to(device) + optimizer = torch.optim.Adam(model.parameters(), lr=1) + data_loader = unittest.mock.MagicMock(spec=torch.utils.data.DataLoader) + ms = [3, 5, 6] + params = { + 'metrics': [], + 'device': device, + 'task_type': constants.TABULAR_REGRESSION, + 'labels': torch.Tensor([]), + 'metrics_during_training': False, + 'budget_tracker': BudgetTracker(budget_type='runtime', max_runtime=0), + 'criterion': torch.nn.MSELoss, + 'optimizer': optimizer, + 'scheduler': torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=ms, gamma=2), + 'model': model, + 'step_interval': StepIntervalUnit.epoch + } + trainer = StandardTrainer() + trainer.prepare(**params) + + loss, metrics = trainer.train_epoch( + train_loader=data_loader, + epoch=0, + writer=None + ) + assert loss is None + assert metrics == {} + class TestStandardTrainer(BaseTraining): def test_regression_epoch_training(self, n_samples): diff --git a/test/test_pipeline/test_tabular_classification.py b/test/test_pipeline/test_tabular_classification.py index 52288b199..ab21897b8 100644 --- a/test/test_pipeline/test_tabular_classification.py +++ b/test/test_pipeline/test_tabular_classification.py @@ -1,6 +1,7 @@ import os import re import unittest +import unittest.mock from ConfigSpace.hyperparameters import ( CategoricalHyperparameter, @@ -23,6 +24,11 @@ parse_hyperparameter_search_space_updates +@pytest.fixture +def exclude(): + return {'feature_preprocessor': ['SelectRatesClassification', 'SelectPercentileClassification']} + + @pytest.mark.parametrize("fit_dictionary_tabular", ['classification_categorical_only', 'classification_numerical_only', 'classification_numerical_and_categorical'], indirect=True) @@ -53,12 +59,13 @@ def _assert_pipeline_search_space(self, pipeline, search_space_updates): elif isinstance(hyperparameter, CategoricalHyperparameter): assert update.value_range == hyperparameter.choices - def test_pipeline_fit(self, fit_dictionary_tabular): + def test_pipeline_fit(self, fit_dictionary_tabular, exclude): """This test makes sure that the pipeline is able to fit given random combinations of hyperparameters across the pipeline""" pipeline = TabularClassificationPipeline( - dataset_properties=fit_dictionary_tabular['dataset_properties']) + dataset_properties=fit_dictionary_tabular['dataset_properties'], + exclude=exclude) cs = pipeline.get_hyperparameter_search_space() config = cs.sample_configuration() pipeline.set_hyperparameters(config) @@ -80,12 +87,13 @@ def test_pipeline_fit(self, fit_dictionary_tabular): # Make sure a network was fit assert isinstance(pipeline.named_steps['network'].get_network(), torch.nn.Module) - def test_pipeline_predict(self, fit_dictionary_tabular): + def test_pipeline_predict(self, fit_dictionary_tabular, exclude): """This test makes sure that 
the pipeline is able to predict given a random configuration""" X = fit_dictionary_tabular['X_train'].copy() pipeline = TabularClassificationPipeline( - dataset_properties=fit_dictionary_tabular['dataset_properties']) + dataset_properties=fit_dictionary_tabular['dataset_properties'], + exclude=exclude) cs = pipeline.get_hyperparameter_search_space() config = cs.sample_configuration() @@ -104,14 +112,15 @@ def test_pipeline_predict(self, fit_dictionary_tabular): assert isinstance(prediction, np.ndarray) assert prediction.shape == expected_output_shape - def test_pipeline_predict_proba(self, fit_dictionary_tabular): + def test_pipeline_predict_proba(self, fit_dictionary_tabular, exclude): """This test makes sure that the pipeline is able to fit given random combinations of hyperparameters across the pipeline And then predict using predict probability """ X = fit_dictionary_tabular['X_train'].copy() pipeline = TabularClassificationPipeline( - dataset_properties=fit_dictionary_tabular['dataset_properties']) + dataset_properties=fit_dictionary_tabular['dataset_properties'], + exclude=exclude) cs = pipeline.get_hyperparameter_search_space() config = cs.sample_configuration() @@ -133,7 +142,7 @@ def test_pipeline_predict_proba(self, fit_dictionary_tabular): assert isinstance(prediction, np.ndarray) assert prediction.shape == expected_output_shape - def test_pipeline_transform(self, fit_dictionary_tabular): + def test_pipeline_transform(self, fit_dictionary_tabular, exclude): """ In the context of autopytorch, transform expands a fit dictionary with components that where previously fit. We can use this as a nice way to make sure @@ -142,7 +151,8 @@ def test_pipeline_transform(self, fit_dictionary_tabular): """ pipeline = TabularClassificationPipeline( - dataset_properties=fit_dictionary_tabular['dataset_properties']) + dataset_properties=fit_dictionary_tabular['dataset_properties'], + exclude=exclude) cs = pipeline.get_hyperparameter_search_space() config = cs.sample_configuration() pipeline.set_hyperparameters(config) @@ -171,14 +181,15 @@ def test_pipeline_transform(self, fit_dictionary_tabular): assert 'preprocess_transforms' in transformed_fit_dictionary_tabular.keys() @pytest.mark.parametrize("is_small_preprocess", [True, False]) - def test_default_configuration(self, fit_dictionary_tabular, is_small_preprocess): + def test_default_configuration(self, fit_dictionary_tabular, is_small_preprocess, exclude): """Makes sure that when no config is set, we can trust the default configuration from the space""" fit_dictionary_tabular['is_small_preprocess'] = is_small_preprocess pipeline = TabularClassificationPipeline( - dataset_properties=fit_dictionary_tabular['dataset_properties']) + dataset_properties=fit_dictionary_tabular['dataset_properties'], + exclude=exclude) with unittest.mock.patch.object(pipeline.named_steps['trainer'].choice, 'train_epoch') \ as patch_train: patch_train.return_value = 1, {} @@ -258,7 +269,8 @@ def test_get_fit_requirements(self, fit_dictionary_tabular): def test_apply_search_space_updates(self, fit_dictionary_tabular, search_space_updates): dataset_properties = {'numerical_columns': [1], 'categorical_columns': [2], - 'task_type': 'tabular_classification'} + 'task_type': 'tabular_classification', 'issparse': False, + 'issigned': False} pipeline = TabularClassificationPipeline(dataset_properties=dataset_properties, search_space_updates=search_space_updates) self._assert_pipeline_search_space(pipeline, search_space_updates) @@ -275,14 +287,16 @@ def 
test_read_and_update_search_space(self, fit_dictionary_tabular, search_space file_search_space_updates = parse_hyperparameter_search_space_updates(updates_file=path) assert isinstance(file_search_space_updates, HyperparameterSearchSpaceUpdates) dataset_properties = {'numerical_columns': [1], 'categorical_columns': [2], - 'task_type': 'tabular_classification'} + 'task_type': 'tabular_classification', 'issparse': False, + 'issigned': False} pipeline = TabularClassificationPipeline(dataset_properties=dataset_properties, search_space_updates=file_search_space_updates) assert file_search_space_updates == pipeline.search_space_updates def test_error_search_space_updates(self, fit_dictionary_tabular, error_search_space_updates): dataset_properties = {'numerical_columns': [1], 'categorical_columns': [2], - 'task_type': 'tabular_classification'} + 'task_type': 'tabular_classification', 'issparse': False, + 'issigned': False} try: _ = TabularClassificationPipeline(dataset_properties=dataset_properties, search_space_updates=error_search_space_updates) @@ -293,7 +307,8 @@ def test_error_search_space_updates(self, fit_dictionary_tabular, error_search_s def test_set_range_search_space_updates(self, fit_dictionary_tabular): dataset_properties = {'numerical_columns': [1], 'categorical_columns': [2], - 'task_type': 'tabular_classification'} + 'task_type': 'tabular_classification', 'issparse': False, + 'issigned': False} config_dict = TabularClassificationPipeline(dataset_properties=dataset_properties). \ get_hyperparameter_search_space()._hyperparameters updates = HyperparameterSearchSpaceUpdates() @@ -325,7 +340,8 @@ def test_set_range_search_space_updates(self, fit_dictionary_tabular): def test_set_choices_updates(self, fit_dictionary_tabular): dataset_properties = {'numerical_columns': [1], 'categorical_columns': [2], - 'task_type': 'tabular_classification'} + 'task_type': 'tabular_classification', 'issparse': False, + 'issigned': False} config_dict = TabularClassificationPipeline(dataset_properties=dataset_properties). 
\ get_hyperparameter_search_space()._hyperparameters updates = HyperparameterSearchSpaceUpdates() @@ -491,3 +507,30 @@ def test_train_pipeline_with_runtime(fit_dictionary_tabular_dummy): # More than 200 epochs would have pass in 5 seconds for this dataset assert len(run_summary.performance_tracker['start_time']) > 100 + + +@pytest.mark.parametrize("fit_dictionary_tabular_dummy", ["classification"], indirect=True) +def test_train_pipeline_with_runtime_max_reached(fit_dictionary_tabular_dummy): + """ + This test makes sure that the pipeline raises an + error in case no epoch has finished successfully + due to max runtime reached + """ + + # Convert the training to runtime + fit_dictionary_tabular_dummy.pop('epochs', None) + fit_dictionary_tabular_dummy['budget_type'] = 'runtime' + fit_dictionary_tabular_dummy['runtime'] = 5 + fit_dictionary_tabular_dummy['early_stopping'] = -1 + + pipeline = TabularClassificationPipeline( + dataset_properties=fit_dictionary_tabular_dummy['dataset_properties']) + + cs = pipeline.get_hyperparameter_search_space() + config = cs.get_default_configuration() + pipeline.set_hyperparameters(config) + + with unittest.mock.patch('autoPyTorch.pipeline.components.training.trainer.BudgetTracker') as patch: + patch.is_max_time_reached.return_value = True + with pytest.raises(RuntimeError): + pipeline.fit(fit_dictionary_tabular_dummy) diff --git a/test/test_api/.tmp_api/runhistory_B.json b/test/test_utils/runhistory.json similarity index 99% rename from test/test_api/.tmp_api/runhistory_B.json rename to test/test_utils/runhistory.json index 37e499664..a2c3658a8 100755 --- a/test/test_api/.tmp_api/runhistory_B.json +++ b/test/test_utils/runhistory.json @@ -1133,6 +1133,7 @@ "1": { "data_loader:batch_size": 64, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", @@ -1166,6 +1167,7 @@ "2": { "data_loader:batch_size": 142, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "PowerTransformer", "imputer:categorical_strategy": "constant_!missing!", "imputer:numerical_strategy": "median", @@ -1203,6 +1205,7 @@ "3": { "data_loader:batch_size": 246, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "PowerTransformer", "imputer:categorical_strategy": "constant_!missing!", "imputer:numerical_strategy": "most_frequent", @@ -1281,6 +1284,7 @@ "4": { "data_loader:batch_size": 269, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "PowerTransformer", "imputer:categorical_strategy": "constant_!missing!", "imputer:numerical_strategy": "median", @@ -1324,6 +1328,7 @@ "5": { "data_loader:batch_size": 191, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "RandomKitchenSinks", "imputer:categorical_strategy": "constant_!missing!", "imputer:numerical_strategy": "most_frequent", @@ -1373,6 +1378,7 @@ "6": { "data_loader:batch_size": 53, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "PowerTransformer", "imputer:categorical_strategy": "constant_!missing!", "imputer:numerical_strategy": "median", @@ -1429,6 +1435,7 @@ "7": { "data_loader:batch_size": 232, "encoder:__choice__": "NoEncoder", + 
"coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "RandomKitchenSinks", "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "most_frequent", @@ -1506,6 +1513,7 @@ "8": { "data_loader:batch_size": 164, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", @@ -1540,6 +1548,7 @@ "9": { "data_loader:batch_size": 94, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "PolynomialFeatures", "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", @@ -1589,6 +1598,7 @@ "10": { "data_loader:batch_size": 70, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "PowerTransformer", "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "constant_zero", @@ -1637,6 +1647,7 @@ "11": { "data_loader:batch_size": 274, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "RandomKitchenSinks", "imputer:categorical_strategy": "constant_!missing!", "imputer:numerical_strategy": "mean", @@ -1675,6 +1686,7 @@ "12": { "data_loader:batch_size": 191, "encoder:__choice__": "NoEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "NoFeaturePreprocessor", "imputer:categorical_strategy": "constant_!missing!", "imputer:numerical_strategy": "median", @@ -1730,6 +1742,7 @@ "13": { "data_loader:batch_size": 35, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "PowerTransformer", "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "most_frequent", @@ -1766,6 +1779,7 @@ "14": { "data_loader:batch_size": 154, "encoder:__choice__": "OneHotEncoder", + "coalescer:__choice__": "NoCoalescer", "feature_preprocessor:__choice__": "KernelPCA", "imputer:categorical_strategy": "most_frequent", "imputer:numerical_strategy": "mean", diff --git a/test/test_utils/test_coalescer_transformer.py b/test/test_utils/test_coalescer_transformer.py new file mode 100644 index 000000000..eccd6b7bd --- /dev/null +++ b/test/test_utils/test_coalescer_transformer.py @@ -0,0 +1,101 @@ +import numpy as np + +import pytest + +import scipy.sparse + +from autoPyTorch.utils.implementations import MinorityCoalesceTransformer + + +@pytest.fixture +def X1(): + # Generates an array with categories 3, 4, 5, 6, 7 and occurences of 30%, + # 30%, 30%, 5% and 5% respectively + X = np.vstack(( + np.ones((30, 10)) * 3, + np.ones((30, 10)) * 4, + np.ones((30, 10)) * 5, + np.ones((5, 10)) * 6, + np.ones((5, 10)) * 7, + )) + for col in range(X.shape[1]): + np.random.shuffle(X[:, col]) + return X + + +@pytest.fixture +def X2(): + # Generates an array with categories 3, 4, 5, 6, 7 and occurences of 5%, + # 5%, 5%, 35% and 50% respectively + X = np.vstack(( + np.ones((5, 10)) * 3, + np.ones((5, 10)) * 4, + np.ones((5, 10)) * 5, + np.ones((35, 10)) * 6, + np.ones((50, 10)) * 7, + )) + for col in range(X.shape[1]): + np.random.shuffle(X[:, col]) + return X + + +def test_default(X1): + X = X1 + X_copy = np.copy(X) + Y = MinorityCoalesceTransformer().fit_transform(X) + np.testing.assert_array_almost_equal(Y, X_copy) + # Assert no copies were made + assert id(X) == id(Y) + + +def test_coalesce_10_percent(X1): + X = 
X1 + Y = MinorityCoalesceTransformer(min_frac=.1).fit_transform(X) + for col in range(Y.shape[1]): + hist = np.histogram(Y[:, col], bins=np.arange(-2, 7)) + np.testing.assert_array_almost_equal(hist[0], [10, 0, 0, 0, 0, 30, 30, 30]) + # Assert no copies were made + assert id(X) == id(Y) + + +def test_coalesce_10_percent_sparse(X1): + X = scipy.sparse.csc_matrix(X1) + Y = MinorityCoalesceTransformer(min_frac=.1).fit_transform(X) + # Assert no copies were made + assert id(X) == id(Y) + Y = Y.todense() + for col in range(Y.shape[1]): + hist = np.histogram(Y[:, col], bins=np.arange(-2, 7)) + np.testing.assert_array_almost_equal(hist[0], [10, 0, 0, 0, 0, 30, 30, 30]) + + +def test_invalid_X(X1): + X = X1 - 5 + with pytest.raises(ValueError): + MinorityCoalesceTransformer().fit_transform(X) + + +@pytest.mark.parametrize("min_frac", [-0.1, 1.1]) +def test_invalid_min_frac(min_frac): + with pytest.raises(ValueError): + MinorityCoalesceTransformer(min_frac=min_frac) + + +def test_transform_before_fit(X1): + with pytest.raises(RuntimeError): + MinorityCoalesceTransformer().transform(X1) + + +def test_transform_after_fit(X1, X2): + # On both X_fit and X_transf, the categories 3, 4, 5, 6, 7 are present. + X_fit = X1 # Here categories 3, 4, 5 have an occurrence above 10% + X_transf = X2 # Here it is the opposite, only categories 6 and 7 are above 10% + + mc = MinorityCoalesceTransformer(min_frac=.1).fit(X_fit) + + # transform() should coalesce categories as learned during fit. + # Category distribution in X_transf should be irrelevant. + Y = mc.transform(X_transf) + for col in range(Y.shape[1]): + hist = np.histogram(Y[:, col], bins=np.arange(-2, 7)) + np.testing.assert_array_almost_equal(hist[0], [85, 0, 0, 0, 0, 5, 5, 5]) diff --git a/test/test_utils/test_common.py b/test/test_utils/test_common.py new file mode 100644 index 000000000..ea3dec563 --- /dev/null +++ b/test/test_utils/test_common.py @@ -0,0 +1,72 @@ +""" +This tests the functionality in autoPyTorch/utils/common. +""" +from enum import Enum + +import pytest + +from autoPyTorch.utils.common import autoPyTorchEnum + + +class SubEnum(autoPyTorchEnum): + x = "x" + y = "y" + + +class DummyEnum(Enum): # defined at module level so it can be referenced in the parametrize below + x = "x" + + +@pytest.mark.parametrize('iter', + ([SubEnum.x], + ["x"], + {SubEnum.x: "hello"}, + {'x': 'hello'}, + SubEnum, + ["x", "y"])) +def test_autopytorch_enum(iter): + """ + This test ensures that a subclass of `autoPyTorchEnum` + can be used with strings. + + Args: + iter (Iterable): + iterable to check for compatibility + """ + + e = SubEnum.x + + assert e in iter + + +@pytest.mark.parametrize('iter', + [[SubEnum.y], + ["y"], + {SubEnum.y: "hello"}, + {'y': 'hello'}]) +def test_autopytorch_enum_false(iter): + """ + This test ensures that a subclass of `autoPyTorchEnum` + does not match enum members or strings other than its own value. + Args: + iter (Iterable): + iterable to check for compatibility + """ + + e = SubEnum.x + + assert e not in iter + + +@pytest.mark.parametrize('others', (1, 2.0, SubEnum, DummyEnum.x)) +def test_raise_errors_autopytorch_enum(others): + """ + This test ensures that a subclass of `autoPyTorchEnum` + raises an error when compared with unsupported types. + Args: + others (Any): + Variable to compare with SubEnum. 
+ """ + + with pytest.raises(RuntimeError): + SubEnum.x == others diff --git a/test/test_utils/test_results_manager.py b/test/test_utils/test_results_manager.py new file mode 100644 index 000000000..496aec7fa --- /dev/null +++ b/test/test_utils/test_results_manager.py @@ -0,0 +1,471 @@ +import json +import os +from datetime import datetime +from test.test_api.utils import make_dict_run_history_data +from unittest.mock import MagicMock + +import ConfigSpace.hyperparameters as CSH +from ConfigSpace.configuration_space import Configuration, ConfigurationSpace + +import numpy as np + +import pytest + +from smac.runhistory.runhistory import RunHistory, RunKey, RunValue, StatusType + +from autoPyTorch.api.base_task import BaseTask +from autoPyTorch.metrics import accuracy, balanced_accuracy, log_loss +from autoPyTorch.utils.results_manager import ( + EnsembleResults, + MetricResults, + ResultsManager, + SearchResults, + cost2metric, + get_start_time +) + + +T, NT = 'traditional', 'non-traditional' +SCORES = [0.1 * (i + 1) for i in range(10)] +END_TIMES = [8, 4, 3, 6, 0, 7, 1, 9, 2, 5] + + +def _check_status(status): + """ Based on runhistory.json """ + ans = [ + StatusType.SUCCESS, StatusType.SUCCESS, + StatusType.SUCCESS, StatusType.SUCCESS, + StatusType.SUCCESS, StatusType.SUCCESS, + StatusType.CRASHED, StatusType.SUCCESS, + StatusType.SUCCESS, StatusType.SUCCESS, + StatusType.SUCCESS, StatusType.SUCCESS, + StatusType.SUCCESS, StatusType.SUCCESS, + StatusType.TIMEOUT, StatusType.TIMEOUT, + ] + assert isinstance(status, list) + assert isinstance(status[0], StatusType) + assert status == ans + + +def _check_costs(costs): + """ Based on runhistory.json """ + ans = [0.15204678362573099, 0.4444444444444444, 0.5555555555555556, 0.29824561403508776, + 0.4444444444444444, 0.4444444444444444, 1.0, 0.5555555555555556, 0.4444444444444444, + 0.15204678362573099, 0.15204678362573099, 0.4035087719298246, 0.4444444444444444, + 0.4444444444444444, 1.0, 1.0] + assert np.allclose(1 - np.array(costs), ans) + assert isinstance(costs, np.ndarray) + assert costs.dtype is np.dtype(np.float) + + +def _check_end_times(end_times): + """ Based on runhistory.json """ + ans = [1637342642.7887495, 1637342647.2651122, 1637342675.2555833, 1637342681.334954, + 1637342693.2717755, 1637342704.341065, 1637342726.1866672, 1637342743.3274522, + 1637342749.9442234, 1637342762.5487585, 1637342779.192385, 1637342804.3368232, + 1637342820.8067145, 1637342846.0210106, 1637342897.1205413, 1637342928.7456856] + + assert np.allclose(end_times, ans) + assert isinstance(end_times, np.ndarray) + assert end_times.dtype is np.dtype(np.float) + + +def _check_fit_times(fit_times): + """ Based on runhistory.json """ + ans = [3.154788017272949, 3.2763524055480957, 22.723600149154663, 4.990685224533081, 10.684926509857178, + 9.947429180145264, 11.687273979187012, 8.478890419006348, 5.485020637512207, 11.514830589294434, + 15.370736837387085, 23.846530199050903, 6.757539510726929, 15.061991930007935, 50.010520696640015, + 22.011935234069824] + + assert np.allclose(fit_times, ans) + assert isinstance(fit_times, np.ndarray) + assert fit_times.dtype is np.dtype(np.float) + + +def _check_budgets(budgets): + """ Based on runhistory.json """ + ans = [5.555555555555555, 5.555555555555555, 5.555555555555555, 5.555555555555555, + 5.555555555555555, 5.555555555555555, 5.555555555555555, 5.555555555555555, + 5.555555555555555, 16.666666666666664, 50.0, 16.666666666666664, 16.666666666666664, + 16.666666666666664, 50.0, 50.0] + assert np.allclose(budgets, ans) 
+ assert isinstance(budgets, list) + assert isinstance(budgets[0], float) + + +def _check_additional_infos(status_types, additional_infos): + for i, status in enumerate(status_types): + info = additional_infos[i] + if status in (StatusType.SUCCESS, StatusType.DONOTADVANCE): + metric_info = info.get('opt_loss', None) + assert metric_info is not None + elif info is not None: + metric_info = info.get('opt_loss', None) + assert metric_info is None + + +def _check_metric_dict(metric_dict, status_types, worst_val): + assert isinstance(metric_dict['accuracy'], list) + assert metric_dict['accuracy'][0] > 0 + assert isinstance(metric_dict['balanced_accuracy'], list) + assert metric_dict['balanced_accuracy'][0] > 0 + + for key, vals in metric_dict.items(): + # ^ is the XOR operator + # exactly one of the two conditions (run succeeded / value is the worst value) must hold + assert all([(s == StatusType.SUCCESS) ^ np.isclose([val], [worst_val]) + for s, val in zip(status_types, vals)]) + + +def _check_metric_results(scores, metric, run_history, ensemble_performance_history): + if metric.name == 'accuracy': # Check the case when the ensemble history does not contain the metric name + dummy_history = [{'Timestamp': datetime(2000, 1, 1), 'train_log_loss': 1, 'test_log_loss': 1}] + mr = MetricResults(metric, run_history, dummy_history) + # ensemble_results should be empty because the ensemble evaluated log_loss + assert mr.ensemble_results.empty() + data = mr.get_ensemble_merged_data() + # since ensemble_results is empty, merged_data must be identical to the run_history data + assert all(np.allclose(data[key], mr.data[key]) for key in data.keys()) + + mr = MetricResults(metric, run_history, ensemble_performance_history) + perfs = np.array([cost2metric(s, metric) for s in scores]) + modified_scores = scores[::2] + [0] + modified_scores.insert(2, 0) + ens_perfs = np.array([s for s in modified_scores]) + assert np.allclose(mr.data[f'single::train::{metric.name}'], perfs) + assert np.allclose(mr.data[f'single::opt::{metric.name}'], perfs) + assert np.allclose(mr.data[f'single::test::{metric.name}'], perfs) + assert np.allclose(mr.data[f'ensemble::train::{metric.name}'], ens_perfs) + assert np.allclose(mr.data[f'ensemble::test::{metric.name}'], ens_perfs) + + # the end times of the synthetic ensemble are [0.25, 0.45, 0.45, 0.65, 0.85, 0.85] + # the end times of the synthetic run history are 0.1 * np.arange(1, 9) or 0.1 * np.arange(2, 10) + ensemble_ends_later = mr.search_results.end_times[-1] < mr.ensemble_results.end_times[-1] + indices = [2, 4, 4, 6, 8, 8] if ensemble_ends_later else [1, 3, 3, 5, 7, 7] + + merged_data = mr.get_ensemble_merged_data() + worst_val = metric._worst_possible_result + minimize = metric._sign == -1 + ans = np.full_like(mr.cum_times, worst_val) + for idx, s in zip(indices, mr.ensemble_results.train_scores): + ans[idx] = min(ans[idx], s) if minimize else max(ans[idx], s) + + assert np.allclose(ans, merged_data[f'ensemble::train::{metric.name}']) + assert np.allclose(ans, merged_data[f'ensemble::test::{metric.name}']) + + +def test_extract_results_from_run_history(): + # Test that an error is raised when `status_msg` is None + run_history = RunHistory() + cs = ConfigurationSpace() + config = Configuration(cs, {}) + run_history.add( + config=config, + cost=0.0, + time=1.0, + status=StatusType.CAPPED, + ) + with pytest.raises(ValueError): + SearchResults(metric=accuracy, scoring_functions=[], run_history=run_history) + + +def test_raise_error_in_update_and_sort_by_time(): + cs = ConfigurationSpace() + cs.add_hyperparameter(CSH.UniformFloatHyperparameter('a', 
lower=0, upper=1)) + config = Configuration(cs, {'a': 0.1}) + + sr = SearchResults(metric=accuracy, scoring_functions=[], run_history=RunHistory()) + er = EnsembleResults(metric=accuracy, ensemble_performance_history=[]) + + with pytest.raises(RuntimeError): + sr._update( + config=config, + run_key=RunKey(config_id=0, instance_id=0, seed=0), + run_value=RunValue( + cost=0, time=1, status=StatusType.SUCCESS, + starttime=0, endtime=1, additional_info={} + ) + ) + + with pytest.raises(RuntimeError): + sr._sort_by_endtime() + + with pytest.raises(RuntimeError): + er._update(data={}) + + with pytest.raises(RuntimeError): + er._sort_by_endtime() + + +@pytest.mark.parametrize('starttimes', (list(range(10)), list(range(10))[::-1])) +@pytest.mark.parametrize('status_types', ( + [StatusType.SUCCESS] * 9 + [StatusType.STOP], + [StatusType.RUNNING] + [StatusType.SUCCESS] * 9 +)) +def test_get_start_time(starttimes, status_types): + run_history = RunHistory() + cs = ConfigurationSpace() + cs.add_hyperparameter(CSH.UniformFloatHyperparameter('a', lower=0, upper=1)) + endtime = 1e9 + kwargs = dict(cost=1.0, endtime=endtime) + for starttime, status_type in zip(starttimes, status_types): + config = Configuration(cs, {'a': 0.1 * starttime}) + run_history.add( + config=config, + starttime=starttime, + time=endtime - starttime, + status=status_type, + **kwargs + ) + starttime = get_start_time(run_history) + + # this check relies on the specific inputs parametrized above + ans = min(t for s, t in zip(status_types, starttimes) if s == StatusType.SUCCESS) + assert starttime == ans + + +def test_raise_error_in_get_start_time(): + # Test that an error is raised when `status_msg` is None + run_history = RunHistory() + cs = ConfigurationSpace() + config = Configuration(cs, {}) + run_history.add( + config=config, + cost=0.0, + time=1.0, + status=StatusType.CAPPED, + ) + + with pytest.raises(ValueError): + get_start_time(run_history) + + +def test_search_results_sort_by_endtime(): + run_history = RunHistory() + n_configs = len(SCORES) + cs = ConfigurationSpace() + cs.add_hyperparameter(CSH.UniformFloatHyperparameter('a', lower=0, upper=1)) + order = np.argsort(END_TIMES) + ans = np.array(SCORES)[order].tolist() + status_types = [StatusType.SUCCESS, StatusType.DONOTADVANCE] * (n_configs // 2) + + for i, (fixed_val, et, status) in enumerate(zip(SCORES, END_TIMES, status_types)): + config = Configuration(cs, {'a': fixed_val}) + run_history.add( + config=config, cost=fixed_val, + status=status, budget=fixed_val, + time=et - fixed_val, starttime=fixed_val, endtime=et, + additional_info={ + 'a': fixed_val, + 'configuration_origin': [T, NT][i % 2], + 'train_loss': {accuracy.name: fixed_val - 0.1}, + 'opt_loss': {accuracy.name: fixed_val}, + 'test_loss': {accuracy.name: fixed_val + 0.1} + } + ) + + sr = SearchResults(accuracy, scoring_functions=[], run_history=run_history, order_by_endtime=True) + assert sr.budgets == ans + assert np.allclose(accuracy._optimum - accuracy._sign * sr.opt_scores, ans) + assert np.allclose(accuracy._optimum - accuracy._sign * sr.train_scores, np.array(ans) - accuracy._sign * 0.1) + assert np.allclose(accuracy._optimum - accuracy._sign * sr.test_scores, np.array(ans) + accuracy._sign * 0.1) + assert np.allclose(1 - sr.opt_scores, ans) + assert sr._end_times == list(range(n_configs)) + assert all(c.get('a') == val for val, c in zip(ans, sr.configs)) + assert all(info['a'] == val for val, info in zip(ans, sr.additional_infos)) + assert np.all(np.array([s for s in status_types])[order] == 
np.array(sr.status_types)) + assert sr.is_traditionals == np.array([True, False] * 5)[order].tolist() + assert np.allclose(sr.fit_times, np.subtract(np.arange(n_configs), ans)) + + +def test_ensemble_results(): + order = np.argsort(END_TIMES) + end_times = [datetime.timestamp(datetime(2000, et + 1, 1)) for et in END_TIMES] + ensemble_performance_history = [ + {'Timestamp': datetime(2000, et + 1, 1), 'train_accuracy': s1, 'test_accuracy': s2} + for et, s1, s2 in zip(END_TIMES, SCORES, SCORES[::-1]) + ] + + er = EnsembleResults(log_loss, ensemble_performance_history) + assert er.empty() + + er = EnsembleResults(accuracy, ensemble_performance_history) + assert er._train_scores == SCORES + assert np.allclose(er.train_scores, SCORES) + assert er._test_scores == SCORES[::-1] + assert np.allclose(er.test_scores, SCORES[::-1]) + assert np.allclose(er.end_times, end_times) + + er = EnsembleResults(accuracy, ensemble_performance_history, order_by_endtime=True) + assert np.allclose(er.train_scores, np.array(SCORES)[order]) + assert np.allclose(er.test_scores, np.array(SCORES[::-1])[order]) + assert np.allclose(er.end_times, np.array(end_times)[order]) + + +@pytest.mark.parametrize('metric', (accuracy, log_loss)) +@pytest.mark.parametrize('scores', (SCORES[:8], SCORES[:8][::-1])) +@pytest.mark.parametrize('ensemble_ends_later', (True, False)) +def test_metric_results(metric, scores, ensemble_ends_later): + # since the datetime --> timestamp conversion varies between machines and float64 might not + # be able to handle time precisely enough, we might need to change t0 in the future. + # Basically, this is because the test checks timestamps at millisecond precision + t0, ms_unit = (1970, 1, 1, 9, 0, 0), 100000 + ensemble_performance_history = [ + {'Timestamp': datetime(*t0, ms_unit * 2 * (i + 1) + ms_unit // 2), + f'train_{metric.name}': s, + f'test_{metric.name}': s} + for i, s in enumerate(scores[::2]) + ] + # Add a record with the exact same stamp as the last one + ensemble_performance_history.append( + {'Timestamp': datetime(*t0, ms_unit * 8 + ms_unit // 2), + f'train_{metric.name}': 0, + f'test_{metric.name}': 0} + ) + # Add a record with the exact same stamp as a middle one + ensemble_performance_history.append( + {'Timestamp': datetime(*t0, ms_unit * 4 + ms_unit // 2), + f'train_{metric.name}': 0, + f'test_{metric.name}': 0} + ) + + run_history = RunHistory() + cs = ConfigurationSpace() + cs.add_hyperparameter(CSH.UniformFloatHyperparameter('a', lower=0, upper=1)) + + for i, fixed_val in enumerate(scores): + config = Configuration(cs, {'a': fixed_val}) + st = datetime.timestamp(datetime(*t0, ms_unit * (i + 1 - ensemble_ends_later))) + et = datetime.timestamp(datetime(*t0, ms_unit * (i + 2 - ensemble_ends_later))) + run_history.add( + config=config, cost=1, budget=0, + time=0.1, starttime=st, endtime=et, + status=StatusType.SUCCESS, + additional_info={ + 'configuration_origin': T, + 'train_loss': {f'{metric.name}': fixed_val}, + 'opt_loss': {f'{metric.name}': fixed_val}, + 'test_loss': {f'{metric.name}': fixed_val} + } + ) + _check_metric_results(scores, metric, run_history, ensemble_performance_history) + + +def test_search_results_sprint_statistics(): + BaseTask.__abstractmethods__ = set() + api = BaseTask() + for method in ['get_search_results', 'sprint_statistics', 'get_incumbent_results']: + with pytest.raises(RuntimeError): + getattr(api, method)() + + run_history_data = json.load(open(os.path.join(os.path.dirname(__file__), + 'runhistory.json'), + mode='r'))['data'] + 
api._results_manager.run_history = MagicMock() + api.run_history.empty = MagicMock(return_value=False) + + # The run_history has 16 runs + 1 run interruption ==> 16 runs + api.run_history.data = make_dict_run_history_data(run_history_data) + api._metric = accuracy + api.dataset_name = 'iris' + api._scoring_functions = [accuracy, balanced_accuracy] + api.search_space = MagicMock(spec=ConfigurationSpace) + worst_val = api._metric._worst_possible_result + search_results = api.get_search_results() + + _check_status(search_results.status_types) + _check_costs(search_results.opt_scores) + _check_end_times(search_results.end_times) + _check_fit_times(search_results.fit_times) + _check_budgets(search_results.budgets) + _check_metric_dict(search_results.opt_metric_dict, search_results.status_types, worst_val) + _check_additional_infos(status_types=search_results.status_types, + additional_infos=search_results.additional_infos) + + # config_ids can duplicate because of various budget size + config_ids = [1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 1, 10, 11, 12, 10, 13] + assert config_ids == search_results.config_ids + + # assert that contents of search_results are of expected types + assert isinstance(search_results.rank_opt_scores, np.ndarray) + assert search_results.rank_opt_scores.dtype is np.dtype(np.int) + assert isinstance(search_results.configs, list) + + n_success, n_timeout, n_memoryout, n_crashed = 13, 2, 0, 1 + msg = ["autoPyTorch results:", f"\tDataset name: {api.dataset_name}", + f"\tOptimisation Metric: {api._metric.name}", + f"\tBest validation score: {max(search_results.opt_scores)}", + "\tNumber of target algorithm runs: 16", f"\tNumber of successful target algorithm runs: {n_success}", + f"\tNumber of crashed target algorithm runs: {n_crashed}", + f"\tNumber of target algorithms that exceeded the time limit: {n_timeout}", + f"\tNumber of target algorithms that exceeded the memory limit: {n_memoryout}"] + + assert isinstance(api.sprint_statistics(), str) + assert all([m1 == m2 for m1, m2 in zip(api.sprint_statistics().split("\n"), msg)]) + + +@pytest.mark.parametrize('run_history', (None, RunHistory())) +def test_check_run_history(run_history): + manager = ResultsManager() + manager.run_history = run_history + + with pytest.raises(RuntimeError): + manager._check_run_history() + + +@pytest.mark.parametrize('include_traditional', (True, False)) +@pytest.mark.parametrize('metric', (accuracy, log_loss)) +@pytest.mark.parametrize('origins', ([T] * 5 + [NT] * 5, [T, NT] * 5, [NT] * 5 + [T] * 5)) +@pytest.mark.parametrize('scores', (SCORES, SCORES[::-1])) +def test_get_incumbent_results(include_traditional, metric, origins, scores): + manager = ResultsManager() + cs = ConfigurationSpace() + cs.add_hyperparameter(CSH.UniformFloatHyperparameter('a', lower=0, upper=1)) + + configs = [0.1 * (i + 1) for i in range(len(scores))] + if metric.name == "log_loss": + # This is to detect mis-computation in reversion + metric._optimum = 0.1 + + best_cost, best_idx = np.inf, -1 + for idx, (a, origin, score) in enumerate(zip(configs, origins, scores)): + config = Configuration(cs, {'a': a}) + + # conversion defined in: + # autoPyTorch/pipeline/components/training/metrics/utils.py::calculate_loss + cost = metric._optimum - metric._sign * score + manager.run_history.add( + config=config, + cost=cost, + time=1.0, + status=StatusType.SUCCESS, + additional_info={'train_loss': {metric.name: cost}, + 'opt_loss': {metric.name: cost}, + 'test_loss': {metric.name: cost}, + 'configuration_origin': origin} + ) + if cost > 
best_cost: + continue + + if include_traditional: + best_cost, best_idx = cost, idx + elif origin != T: + best_cost, best_idx = cost, idx + + incumbent_config, incumbent_results = manager.get_incumbent_results( + metric=metric, + include_traditional=include_traditional + ) + + assert isinstance(incumbent_config, Configuration) + assert isinstance(incumbent_results, dict) + best_score, best_a = scores[best_idx], configs[best_idx] + assert np.allclose( + [best_score, best_score, best_a], + [cost2metric(best_cost, metric), + cost2metric(incumbent_results['opt_loss'][metric.name], metric), + incumbent_config['a']] + ) + + if not include_traditional: + assert incumbent_results['configuration_origin'] != T diff --git a/test/test_utils/test_results_visualizer.py b/test/test_utils/test_results_visualizer.py new file mode 100644 index 000000000..e31571ef0 --- /dev/null +++ b/test/test_utils/test_results_visualizer.py @@ -0,0 +1,302 @@ +import json +import os +from datetime import datetime +from test.test_api.utils import make_dict_run_history_data +from unittest.mock import MagicMock + +from ConfigSpace import ConfigurationSpace + +import matplotlib.pyplot as plt + +import numpy as np + +import pytest + +from autoPyTorch.api.base_task import BaseTask +from autoPyTorch.metrics import accuracy, balanced_accuracy +from autoPyTorch.utils.results_visualizer import ( + ColorLabelSettings, + PlotSettingParams, + ResultsVisualizer, + _get_perf_and_time +) + + +TEST_CL = ('test color', 'test label') + + +@pytest.mark.parametrize('cl_settings', ( + ColorLabelSettings(single_opt=TEST_CL), + ColorLabelSettings(single_opt=TEST_CL, single_test=None, single_train=None) +)) +@pytest.mark.parametrize('with_ensemble', (True, False)) +def test_extract_dicts(cl_settings, with_ensemble): + dummy_keys = [name for name in [ + 'single::train::dummy', + 'single::opt::dummy', + 'single::test::dummy', + 'ensemble::train::dummy', + 'ensemble::test::dummy' + ] if ( + (with_ensemble or not name.startswith('ensemble')) + and getattr(cl_settings, "_".join(name.split('::')[:2])) is not None + ) + ] + + results = MagicMock() + results.data.keys = MagicMock(return_value=dummy_keys) + cd, ld = cl_settings.extract_dicts(results) + assert set(dummy_keys) == set(cd.keys()) + assert set(dummy_keys) == set(ld.keys()) + + opt_key = 'single::opt::dummy' + assert TEST_CL == (cd[opt_key], ld[opt_key]) + + +@pytest.mark.parametrize('params', ( + PlotSettingParams(show=True), + PlotSettingParams(show=False), + PlotSettingParams(show=True, figname='dummy') +)) +def test_plt_show_in_set_plot_args(params): # TODO + plt.show = MagicMock() + plt.savefig = MagicMock() + _, ax = plt.subplots(nrows=1, ncols=1) + viz = ResultsVisualizer() + + viz._set_plot_args(ax, params) + # if figname is not None, show will not be called. 
(due to the matplotlib design) + assert plt.show._mock_called == (params.figname is None and params.show) + plt.close() + + +@pytest.mark.parametrize('params', ( + PlotSettingParams(), + PlotSettingParams(figname='fig') +)) +def test_plt_savefig_in_set_plot_args(params): # TODO + plt.savefig = MagicMock() + _, ax = plt.subplots(nrows=1, ncols=1) + viz = ResultsVisualizer() + + viz._set_plot_args(ax, params) + assert plt.savefig._mock_called == (params.figname is not None) + plt.close() + + +@pytest.mark.parametrize('params', ( + PlotSettingParams(grid=True), + PlotSettingParams(grid=False) +)) +def test_ax_grid_in_set_plot_args(params): # TODO + _, ax = plt.subplots(nrows=1, ncols=1) + ax.grid = MagicMock() + viz = ResultsVisualizer() + + viz._set_plot_args(ax, params) + assert ax.grid._mock_called == params.grid + plt.close() + + +@pytest.mark.parametrize('params', ( + PlotSettingParams(xscale='none', yscale='none'), + PlotSettingParams(xscale='none', yscale='log'), + PlotSettingParams(xscale='none', yscale='none'), + PlotSettingParams(xscale='none', yscale='log') +)) +def test_raise_value_error_in_set_plot_args(params): # TODO + _, ax = plt.subplots(nrows=1, ncols=1) + viz = ResultsVisualizer() + + with pytest.raises(ValueError): + viz._set_plot_args(ax, params) + + plt.close() + + +@pytest.mark.parametrize('params', ( + PlotSettingParams(xlim=(-100, 100), ylim=(-200, 200)), + PlotSettingParams(xlabel='x label', ylabel='y label'), + PlotSettingParams(xscale='log', yscale='log'), + PlotSettingParams(legend=False, title='Title') +)) +def test_set_plot_args(params): # TODO + _, ax = plt.subplots(nrows=1, ncols=1) + viz = ResultsVisualizer() + viz._set_plot_args(ax, params) + + if params.xlim is not None: + assert ax.get_xlim() == params.xlim + if params.ylim is not None: + assert ax.get_ylim() == params.ylim + + assert ax.xaxis.get_label()._text == ('' if params.xlabel is None else params.xlabel) + assert ax.yaxis.get_label()._text == ('' if params.ylabel is None else params.ylabel) + assert ax.get_title() == ('' if params.title is None else params.title) + assert params.xscale == ax.get_xscale() + assert params.yscale == ax.get_yscale() + + if params.legend: + assert ax.get_legend() is not None + else: + assert ax.get_legend() is None + + plt.close() + + +@pytest.mark.parametrize('metric_name', ('unknown', 'accuracy')) +def test_raise_error_in_plot_perf_over_time_in_base_task(metric_name): + BaseTask.__abstractmethods__ = set() + api = BaseTask() + + if metric_name == 'unknown': + with pytest.raises(ValueError): + api.plot_perf_over_time(metric_name) + else: + with pytest.raises(RuntimeError): + api.plot_perf_over_time(metric_name) + + +@pytest.mark.parametrize('metric_name', ('balanced_accuracy', 'accuracy')) +def test_plot_perf_over_time(metric_name): # TODO + dummy_history = [{'Timestamp': datetime(2022, 1, 1), 'train_accuracy': 1, 'test_accuracy': 1}] + BaseTask.__abstractmethods__ = set() + api = BaseTask() + run_history_data = json.load(open(os.path.join(os.path.dirname(__file__), + 'runhistory.json'), + mode='r'))['data'] + api._results_manager.run_history = MagicMock() + api.run_history.empty = MagicMock(return_value=False) + + # The run_history has 16 runs + 1 run interruption ==> 16 runs + api.run_history.data = make_dict_run_history_data(run_history_data) + api._results_manager.ensemble_performance_history = dummy_history + api._metric = accuracy + api.dataset_name = 'iris' + api._scoring_functions = [accuracy, balanced_accuracy] + api.search_space = 
MagicMock(spec=ConfigurationSpace) + + api.plot_perf_over_time(metric_name=metric_name) + _, ax = plt.subplots(nrows=1, ncols=1) + api.plot_perf_over_time(metric_name=metric_name, ax=ax) + + # remove ensemble keys if metric name is not for the opt score + ans = set([ + name + for name in [f'single train {metric_name}', + f'single test {metric_name}', + f'single opt {metric_name}', + f'ensemble train {metric_name}', + f'ensemble test {metric_name}'] + if metric_name == api._metric.name or not name.startswith('ensemble') + ]) + legend_set = set([txt._text for txt in ax.get_legend().texts]) + assert ans == legend_set + plt.close() + + +@pytest.mark.parametrize('params', ( + PlotSettingParams(xscale='none', yscale='none'), + PlotSettingParams(xscale='none', yscale='log'), + PlotSettingParams(xscale='log', yscale='none'), + PlotSettingParams(yscale='log') +)) +def test_raise_error_get_perf_and_time(params): + results = np.linspace(-1, 1, 10) + cum_times = np.linspace(0, 1, 10) + + with pytest.raises(ValueError): + _get_perf_and_time( + cum_results=results, + cum_times=cum_times, + plot_setting_params=params, + worst_val=np.inf + ) + + +@pytest.mark.parametrize('params', ( + PlotSettingParams(n_points=20, xscale='linear', yscale='linear'), + PlotSettingParams(n_points=20, xscale='log', yscale='log') +)) +def test_get_perf_and_time(params): + y_min, y_max = 1e-5, 1 + results = np.linspace(y_min, y_max, 10) + cum_times = np.linspace(y_min, y_max, 10) + + check_points, perf_by_time_step = _get_perf_and_time( + cum_results=results, + cum_times=cum_times, + plot_setting_params=params, + worst_val=np.inf + ) + + times_ans = np.linspace( + y_min if params.xscale == 'linear' else np.log(y_min), + y_max if params.xscale == 'linear' else np.log(y_max), + params.n_points + ) + times_ans = times_ans if params.xscale == 'linear' else np.exp(times_ans) + assert np.allclose(check_points, times_ans) + + if params.xscale == 'linear': + """ + each time step to record the result + [1.00000000e-05, 5.26410526e-02, 1.05272105e-01, 1.57903158e-01, + 2.10534211e-01, 2.63165263e-01, 3.15796316e-01, 3.68427368e-01, + 4.21058421e-01, 4.73689474e-01, 5.26320526e-01, 5.78951579e-01, + 6.31582632e-01, 6.84213684e-01, 7.36844737e-01, 7.89475789e-01, + 8.42106842e-01, 8.94737895e-01, 9.47368947e-01, 1.00000000e+00] + + The time steps when each result was recorded + [ + 1.0000e-05, # cover index 0 ~ 2 + 1.1112e-01, # cover index 3, 4 + 2.2223e-01, # cover index 5, 6 + 3.3334e-01, # cover index 7, 8 + 4.4445e-01, # cover index 9, 10 + 5.5556e-01, # cover index 11, 12 + 6.6667e-01, # cover index 13, 14 + 7.7778e-01, # cover index 15, 16 + 8.8889e-01, # cover index 17, 18 + 1.0000e+00 # cover index 19 + ] + Since the sequence is monotonically increasing, + if multiple elements cover the same index, take the best. 
+ """ + results_ans = [r for r in results] + results_ans = [results[0]] + results_ans + results_ans[:-1] + results_ans = np.sort(results_ans) + else: + """ + each time step to record the result + [1.00000000e-05, 1.83298071e-05, 3.35981829e-05, 6.15848211e-05, + 1.12883789e-04, 2.06913808e-04, 3.79269019e-04, 6.95192796e-04, + 1.27427499e-03, 2.33572147e-03, 4.28133240e-03, 7.84759970e-03, + 1.43844989e-02, 2.63665090e-02, 4.83293024e-02, 8.85866790e-02, + 1.62377674e-01, 2.97635144e-01, 5.45559478e-01, 1.00000000e+00] + + The time steps when each result was recorded + [ + 1.0000e-05, # cover index 0 ~ 15 + 1.1112e-01, # cover index 16 + 2.2223e-01, # cover index 17 + 3.3334e-01, # cover index 18 + 4.4445e-01, # cover index 18 + 5.5556e-01, # cover index 19 + 6.6667e-01, # cover index 19 + 7.7778e-01, # cover index 19 + 8.8889e-01, # cover index 19 + 1.0000e+00 # cover index 19 + ] + Since the sequence is monotonically increasing, + if multiple elements cover the same index, take the best. + """ + results_ans = [ + *([results[0]] * 16), + results[1], + results[2], + results[4], + results[-1] + ] + + assert np.allclose(perf_by_time_step, results_ans)