From 0bd7d5f7e5f17af2166683a4d9663a00038dc988 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 10 Mar 2022 17:39:12 +0100 Subject: [PATCH 01/22] update eta for experiments --- autoPyTorch/optimizer/smbo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autoPyTorch/optimizer/smbo.py b/autoPyTorch/optimizer/smbo.py index 43b2c80c8..bbdb154f9 100644 --- a/autoPyTorch/optimizer/smbo.py +++ b/autoPyTorch/optimizer/smbo.py @@ -93,7 +93,8 @@ def get_smac_object( initial_design=None, run_id=seed, intensifier=intensifier, - intensifier_kwargs=intensifier_kwargs, + intensifier_kwargs={'initial_budget': initial_budget, 'max_budget': max_budget, + 'eta': 2, 'min_chall': 1, 'instance_order': 'shuffle_once'}, dask_client=dask_client, n_jobs=n_jobs, ) From 76dae5489f725d4de20924fe50472a7b41f13bb2 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Fri, 11 Mar 2022 11:38:13 +0100 Subject: [PATCH 02/22] add check if True is in value range --- autoPyTorch/pipeline/base_pipeline.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/autoPyTorch/pipeline/base_pipeline.py b/autoPyTorch/pipeline/base_pipeline.py index 6ded2adf6..910d435d1 100644 --- a/autoPyTorch/pipeline/base_pipeline.py +++ b/autoPyTorch/pipeline/base_pipeline.py @@ -350,16 +350,18 @@ def _add_forbidden_conditions(self, cs: ConfigurationSpace) -> ConfigurationSpac cyclic_lr_name = 'CyclicLR' if cyclic_lr_name in available_schedulers: # disable snapshot ensembles and stochastic weight averaging - cs.add_forbidden_clause(ForbiddenAndConjunction( - ForbiddenEqualsClause(cs.get_hyperparameter( - f'trainer:{trainer}:use_snapshot_ensemble'), True), - ForbiddenEqualsClause(cs.get_hyperparameter('lr_scheduler:__choice__'), cyclic_lr_name) - )) - cs.add_forbidden_clause(ForbiddenAndConjunction( - ForbiddenEqualsClause(cs.get_hyperparameter( - f'trainer:{trainer}:use_stochastic_weight_averaging'), True), - ForbiddenEqualsClause(cs.get_hyperparameter('lr_scheduler:__choice__'), cyclic_lr_name) - )) + snapshot_ensemble_hyperparameter = cs.get_hyperparameter(f'trainer:{trainer}:use_snapshot_ensemble') + if True in snapshot_ensemble_hyperparameter.choices: + cs.add_forbidden_clause(ForbiddenAndConjunction( + ForbiddenEqualsClause(snapshot_ensemble_hyperparameter, True), + ForbiddenEqualsClause(cs.get_hyperparameter('lr_scheduler:__choice__'), cyclic_lr_name) + )) + swa_hyperparameter = cs.get_hyperparameter(f'trainer:{trainer}:use_stochastic_weight_averaging') + if True in swa_hyperparameter.choices: + cs.add_forbidden_clause(ForbiddenAndConjunction( + ForbiddenEqualsClause(swa_hyperparameter, True), + ForbiddenEqualsClause(cs.get_hyperparameter('lr_scheduler:__choice__'), cyclic_lr_name) + )) return cs def __repr__(self) -> str: From d26c611f3b6f43511f7dd5f960efa15443678620 Mon Sep 17 00:00:00 2001 From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Date: Mon, 14 Mar 2022 03:26:31 +0100 Subject: [PATCH 03/22] Reg cocktails common paper modifications 2 (#417) * remove remaining differences * Reg cocktails common paper modifications 5 (#418) * add hasttr * fix run summary --- autoPyTorch/evaluation/train_evaluator.py | 22 +++++++++++++++++++ autoPyTorch/pipeline/base_pipeline.py | 4 ++-- .../encoding/__init__.py | 2 +- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py index f57d5b15a..67185ec2a 100644 --- a/autoPyTorch/evaluation/train_evaluator.py +++ 
b/autoPyTorch/evaluation/train_evaluator.py @@ -1,4 +1,6 @@ +import json from multiprocessing.queues import Queue +import os from typing import Any, Dict, List, Optional, Tuple, Union from ConfigSpace.configuration_space import Configuration @@ -21,6 +23,7 @@ ) from autoPyTorch.evaluation.utils import DisableFileOutputParameters from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric +from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline from autoPyTorch.utils.common import dict_repr, subsampler from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates @@ -193,6 +196,25 @@ def fit_predict_and_loss(self) -> None: additional_run_info = pipeline.get_additional_run_info() if hasattr( pipeline, 'get_additional_run_info') else {} + # add learning curve of configurations to additional_run_info + if isinstance(pipeline, TabularClassificationPipeline): + if hasattr(pipeline.named_steps['trainer'], 'run_summary'): + run_summary = pipeline.named_steps['trainer'].run_summary + split_types = ['train', 'val', 'test'] + run_summary_dict = dict( + run_summary={}, + budget=self.budget, + seed=self.seed, + config_id=self.configuration.config_id, + num_run=self.num_run + ) + for split_type in split_types: + run_summary_dict['run_summary'][f'{split_type}_loss'] = run_summary.performance_tracker.get(f'{split_type}_loss', None) + run_summary_dict['run_summary'][f'{split_type}_metrics'] = run_summary.performance_tracker.get(f'{split_type}_metrics', None) + self.logger.debug(f"run_summary_dict {json.dumps(run_summary_dict)}") + with open(os.path.join(self.backend.temporary_directory, 'run_summary.txt'), 'a') as file: + file.write(f"{json.dumps(run_summary_dict)}\n") + status = StatusType.SUCCESS self.logger.debug("In train evaluator.fit_predict_and_loss, num_run: {} loss:{}," diff --git a/autoPyTorch/pipeline/base_pipeline.py b/autoPyTorch/pipeline/base_pipeline.py index 910d435d1..43890dd7d 100644 --- a/autoPyTorch/pipeline/base_pipeline.py +++ b/autoPyTorch/pipeline/base_pipeline.py @@ -351,13 +351,13 @@ def _add_forbidden_conditions(self, cs: ConfigurationSpace) -> ConfigurationSpac if cyclic_lr_name in available_schedulers: # disable snapshot ensembles and stochastic weight averaging snapshot_ensemble_hyperparameter = cs.get_hyperparameter(f'trainer:{trainer}:use_snapshot_ensemble') - if True in snapshot_ensemble_hyperparameter.choices: + if hasattr(snapshot_ensemble_hyperparameter, 'choices') and True in snapshot_ensemble_hyperparameter.choices: cs.add_forbidden_clause(ForbiddenAndConjunction( ForbiddenEqualsClause(snapshot_ensemble_hyperparameter, True), ForbiddenEqualsClause(cs.get_hyperparameter('lr_scheduler:__choice__'), cyclic_lr_name) )) swa_hyperparameter = cs.get_hyperparameter(f'trainer:{trainer}:use_stochastic_weight_averaging') - if True in swa_hyperparameter.choices: + if hasattr(swa_hyperparameter, 'choices') and True in swa_hyperparameter.choices: cs.add_forbidden_clause(ForbiddenAndConjunction( ForbiddenEqualsClause(swa_hyperparameter, True), ForbiddenEqualsClause(cs.get_hyperparameter('lr_scheduler:__choice__'), cyclic_lr_name) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/__init__.py index bca525781..eca46acb2 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/__init__.py +++ 
b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/__init__.py @@ -86,7 +86,7 @@ def get_hyperparameter_search_space(self, "choices in {} got {}".format(self.__class__.__name__, available_preprocessors, choice_hyperparameter.value_range)) - if len(choice_hyperparameter) == 0: + if len(categorical_columns) == 0: assert len(choice_hyperparameter.value_range) == 1 assert 'NoEncoder' in choice_hyperparameter.value_range, \ "Provided {} in choices, however, the dataset " \ From bc60e31e4c58cc266151fbe11e0a7da14badb728 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Wed, 23 Mar 2022 12:19:16 +0100 Subject: [PATCH 04/22] have working embedding from pytroch --- autoPyTorch/pipeline/base_pipeline.py | 7 ++- .../setup/network_backbone/utils.py | 2 +- .../LearnedEntityEmbedding.py | 49 +++++++++---------- .../setup/network_embedding/NoEmbedding.py | 5 ++ .../base_network_embedding.py | 17 ++++--- 5 files changed, 41 insertions(+), 39 deletions(-) diff --git a/autoPyTorch/pipeline/base_pipeline.py b/autoPyTorch/pipeline/base_pipeline.py index 43890dd7d..ca2ab5533 100644 --- a/autoPyTorch/pipeline/base_pipeline.py +++ b/autoPyTorch/pipeline/base_pipeline.py @@ -322,15 +322,13 @@ def _add_forbidden_conditions(self, cs: ConfigurationSpace) -> ConfigurationSpac possible_default_embeddings = copy(list(embeddings)) del possible_default_embeddings[possible_default_embeddings.index('LearnedEntityEmbedding')] - for encoder in encoders: - if encoder == 'OneHotEncoder': - continue + if 'OneHotEncoder' in encoders: while True: try: cs.add_forbidden_clause(ForbiddenAndConjunction( ForbiddenEqualsClause(cs.get_hyperparameter( 'network_embedding:__choice__'), 'LearnedEntityEmbedding'), - ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__'), encoder) + ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__'), "OneHotEncoder") )) break except ValueError: @@ -340,6 +338,7 @@ def _add_forbidden_conditions(self, cs: ConfigurationSpace) -> ConfigurationSpac except IndexError: raise ValueError("Cannot find a legal default configuration") cs.get_hyperparameter('network_embedding:__choice__').default_value = default + # Disable CyclicLR until todo is completed. 
if 'lr_scheduler' in self.named_steps.keys() and 'trainer' in self.named_steps.keys(): diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py index a3216c7c1..3e2de1887 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py @@ -27,7 +27,7 @@ def get_output_shape(network: torch.nn.Module, input_shape: Tuple[int, ...], has the network will return a Tuple, we will then only consider the first item :return: output_shape """ - placeholder = torch.randn((2, *input_shape), dtype=torch.float) + placeholder = torch.randint(high=2, size=(2, *input_shape), dtype=torch.float) with torch.no_grad(): if has_hidden_states: output = network(placeholder)[0] diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py index fdcf051bd..e43b8f093 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py @@ -1,3 +1,4 @@ +from math import ceil from typing import Any, Dict, List, Optional, Tuple, Union from ConfigSpace.configuration_space import ConfigurationSpace @@ -19,7 +20,7 @@ class _LearnedEntityEmbedding(nn.Module): """ Learned entity embedding module for categorical features""" - def __init__(self, config: Dict[str, Any], num_input_features: np.ndarray, num_numerical_features: int): + def __init__(self, config: Dict[str, Any], num_categories_per_col: np.ndarray, num_numerical_features: int): """ Args: config (Dict[str, Any]): The configuration sampled by the hyperparameter optimizer @@ -33,21 +34,19 @@ def __init__(self, config: Dict[str, Any], num_input_features: np.ndarray, num_n self.num_numerical = num_numerical_features # list of number of categories of categorical data # or 0 for numerical data - self.num_input_features = num_input_features - categorical_features: np.ndarray = self.num_input_features > 0 + self.num_categories_per_col = num_categories_per_col + categorical_features = self.num_categories_per_col > 0 - self.num_categorical_features = self.num_input_features[categorical_features] + self.num_categorical_features = self.num_categories_per_col[categorical_features] self.embed_features = [num_in >= config["min_unique_values_for_embedding"] for num_in in - self.num_input_features] + self.num_categories_per_col] self.num_output_dimensions = [0] * num_numerical_features - self.num_output_dimensions.extend([config["dimension_reduction_" + str(i)] * num_in for i, num_in in + self.num_output_dimensions.extend([ceil(config["dimension_reduction_" + str(i)] * num_in) for i, num_in in enumerate(self.num_categorical_features)]) - self.num_output_dimensions = [int(np.clip(num_out, 1, num_in - 1)) for num_out, num_in in - zip(self.num_output_dimensions, self.num_input_features)] - self.num_output_dimensions = [num_out if embed else num_in for num_out, embed, num_in in - zip(self.num_output_dimensions, self.embed_features, - self.num_input_features)] + + self.num_output_dimensions = [num_out if embed else 1 for num_out, embed in + zip(self.num_output_dimensions, self.embed_features)] self.num_out_feats = self.num_numerical + sum(self.num_output_dimensions) self.ee_layers = self._create_ee_layers() @@ -86,32 +85,30 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # pass the columns of 
each categorical feature through entity embedding layer # before passing it through the model concat_seq = [] - last_concat = 0 + x_pointer = 0 layer_pointer = 0 - for num_in, embed in zip(self.num_input_features, self.embed_features): + for x_pointer, embed in enumerate(self.embed_features): + current_feature_slice = x[:, x_pointer] if not embed: x_pointer += 1 + concat_seq.append(current_feature_slice.view(-1, 1)) continue - if x_pointer > last_concat: - concat_seq.append(x[..., last_concat: x_pointer]) - categorical_feature_slice = x[..., x_pointer: x_pointer + num_in] - concat_seq.append(self.ee_layers[layer_pointer](categorical_feature_slice)) + current_feature_slice = current_feature_slice.to(torch.int) + concat_seq.append(self.ee_layers[layer_pointer](current_feature_slice)) layer_pointer += 1 - x_pointer += num_in - last_concat = x_pointer - concat_seq.append(x[..., last_concat:]) - return torch.cat(concat_seq, dim=-1) + return torch.cat(concat_seq, dim=1) def _create_ee_layers(self) -> nn.ModuleList: # entity embeding layers are Linear Layers layers = nn.ModuleList() - for i, (num_in, embed, num_out) in enumerate(zip(self.num_input_features, self.embed_features, - self.num_output_dimensions)): + for num_cat, embed, num_out in zip(self.num_categories_per_col, + self.embed_features, + self.num_output_dimensions): if not embed: continue - layers.append(nn.Linear(num_in, num_out)) + layers.append(nn.Embedding(num_cat, num_out)) return layers @@ -154,11 +151,11 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None, **kwarg self.config = kwargs def build_embedding(self, - num_input_features: np.ndarray, + num_categories_per_col: np.ndarray, num_numerical_features: int) -> Tuple[nn.Module, List[int]]: embedding = _LearnedEntityEmbedding(config=self.config, - num_input_features=num_input_features, + num_categories_per_col=num_categories_per_col, num_numerical_features=num_numerical_features) return embedding, embedding.num_output_dimensions diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py index 8fa03a65e..741dfad9d 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py @@ -27,10 +27,15 @@ class NoEmbedding(NetworkEmbeddingComponent): def __init__(self, random_state: Optional[np.random.RandomState] = None): super().__init__(random_state=random_state) +<<<<<<< HEAD def build_embedding(self, num_input_features: np.ndarray, num_numerical_features: int) -> Tuple[nn.Module, Optional[List[int]]]: return _NoEmbedding(), None +======= + def build_embedding(self, num_categories_per_col: np.ndarray, num_numerical_features: int) -> nn.Module: + return _NoEmbedding() +>>>>>>> have working embedding from pytroch @staticmethod def get_hyperparameter_search_space( diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index 1ff5df13e..776464862 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -19,10 +19,10 @@ def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: - num_numerical_columns, num_input_features = self._get_args(X) + 
num_numerical_columns, num_categories_per_col = self._get_args(X) self.embedding, num_output_features = self.build_embedding( - num_input_features=num_input_features, + num_categories_per_col=num_categories_per_col, num_numerical_features=num_numerical_columns ) if "feature_shapes" in X['dataset_properties']: @@ -45,7 +45,7 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: return X def build_embedding(self, - num_input_features: np.ndarray, + num_categories_per_col: np.ndarray, num_numerical_features: int) -> Tuple[nn.Module, Optional[List[int]]]: raise NotImplementedError @@ -70,10 +70,11 @@ def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]: else: num_numerical_columns = numerical_column_transformer.transform( X_train[:, X['dataset_properties']['numerical_columns']]).shape[1] - num_input_features = np.zeros((num_numerical_columns + len(X['dataset_properties']['categorical_columns'])), - dtype=np.int32) + num_cols = num_numerical_columns + len(X['dataset_properties']['categorical_columns']) + num_categories_per_col = np.zeros(num_cols, dtype=np.int32) + categories = X['dataset_properties']['categories'] + for idx, cats in enumerate(categories, start=num_numerical_columns): + num_categories_per_col[idx] = len(cats) - for i, category in enumerate(categories): - num_input_features[num_numerical_columns + i, ] = len(category) - return num_numerical_columns, num_input_features + return num_numerical_columns, num_categories_per_col From 13fad760bec6d34075755ab02f9874d3c456c28c Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 31 Mar 2022 14:36:01 +0200 Subject: [PATCH 05/22] divide columns to encode and embed based on threshold --- autoPyTorch/api/tabular_classification.py | 2 + autoPyTorch/data/base_feature_validator.py | 2 +- autoPyTorch/data/tabular_feature_validator.py | 6 +- autoPyTorch/datasets/tabular_dataset.py | 2 +- autoPyTorch/pipeline/base_pipeline.py | 49 +++++------ .../column_splitting/ColumnSplitter.py | 81 +++++++++++++++++++ .../encoding/OneHotEncoder.py | 3 +- .../encoding/base_encoder.py | 2 +- .../early_preprocessor/EarlyPreprocessing.py | 5 +- .../TimeSeriesEarlyPreProcessing.py | 34 ++++---- .../setup/network_backbone/utils.py | 2 + .../LearnedEntityEmbedding.py | 53 ++++++------ .../setup/network_embedding/NoEmbedding.py | 9 +-- .../base_network_embedding.py | 75 +++++++++-------- .../pipeline/tabular_classification.py | 74 ++++++++--------- 15 files changed, 237 insertions(+), 162 deletions(-) create mode 100644 autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index aa6796ae2..358792161 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -447,6 +447,8 @@ def search( dataset_compression=self._dataset_compression, feat_types=feat_types) + # import sys + # sys.exit(0) return self._search( dataset=self.dataset, optimize_metric=optimize_metric, diff --git a/autoPyTorch/data/base_feature_validator.py b/autoPyTorch/data/base_feature_validator.py index 8f65f8607..bf9ad90ed 100644 --- a/autoPyTorch/data/base_feature_validator.py +++ b/autoPyTorch/data/base_feature_validator.py @@ -46,11 +46,11 @@ def __init__( # Required for dataset properties self.num_features: Optional[int] = None - self.categories: List[List[int]] = [] self.categorical_columns: List[int] = [] self.numerical_columns: List[int] = [] self.encode_columns: List[str] = [] + 
self.num_categories_per_col: Optional[List[int]] = [] self.all_nan_columns: Optional[Set[Union[int, str]]] = None self._is_fitted = False diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 3beb19cba..ce040f805 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -202,10 +202,8 @@ def _fit( encoded_categories = self.column_transformer.\ named_transformers_['categorical_pipeline'].\ named_steps['ordinalencoder'].categories_ - self.categories = [ - list(range(len(cat))) - for cat in encoded_categories - ] + + self.num_categories_per_col = [len(cat) for cat in encoded_categories] # differently to categorical_columns and numerical_columns, # this saves the index of the column. diff --git a/autoPyTorch/datasets/tabular_dataset.py b/autoPyTorch/datasets/tabular_dataset.py index 6cabfe525..04a5df96b 100644 --- a/autoPyTorch/datasets/tabular_dataset.py +++ b/autoPyTorch/datasets/tabular_dataset.py @@ -81,7 +81,7 @@ def __init__(self, self.categorical_columns = validator.feature_validator.categorical_columns self.numerical_columns = validator.feature_validator.numerical_columns self.num_features = validator.feature_validator.num_features - self.categories = validator.feature_validator.categories + self.num_categories_per_col = validator.feature_validator.num_categories_per_col super().__init__(train_tensors=(X, Y), test_tensors=(X_test, Y_test), shuffle=shuffle, resampling_strategy=resampling_strategy, diff --git a/autoPyTorch/pipeline/base_pipeline.py b/autoPyTorch/pipeline/base_pipeline.py index ca2ab5533..aa832ea48 100644 --- a/autoPyTorch/pipeline/base_pipeline.py +++ b/autoPyTorch/pipeline/base_pipeline.py @@ -314,30 +314,31 @@ def _add_forbidden_conditions(self, cs: ConfigurationSpace) -> ConfigurationSpac """ - # Learned Entity Embedding is only valid when encoder is one hot encoder - if 'network_embedding' in self.named_steps.keys() and 'encoder' in self.named_steps.keys(): - embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices - if 'LearnedEntityEmbedding' in embeddings: - encoders = cs.get_hyperparameter('encoder:__choice__').choices - possible_default_embeddings = copy(list(embeddings)) - del possible_default_embeddings[possible_default_embeddings.index('LearnedEntityEmbedding')] - - if 'OneHotEncoder' in encoders: - while True: - try: - cs.add_forbidden_clause(ForbiddenAndConjunction( - ForbiddenEqualsClause(cs.get_hyperparameter( - 'network_embedding:__choice__'), 'LearnedEntityEmbedding'), - ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__'), "OneHotEncoder") - )) - break - except ValueError: - # change the default and try again - try: - default = possible_default_embeddings.pop() - except IndexError: - raise ValueError("Cannot find a legal default configuration") - cs.get_hyperparameter('network_embedding:__choice__').default_value = default + # # Learned Entity Embedding is only valid when encoder is one hot encoder + # if 'network_embedding' in self.named_steps.keys() and 'encoder' in self.named_steps.keys(): + # embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices + # if 'LearnedEntityEmbedding' in embeddings: + # encoders = cs.get_hyperparameter('encoder:__choice__').choices + + # if 'OneHotEncoder' in encoders: + # possible_default_encoders = copy(list(encoders)) + # del possible_default_encoders[possible_default_encoders.index('OneHotEncoder')] + + # while True: + # try: + # 
cs.add_forbidden_clause(ForbiddenAndConjunction( + # ForbiddenEqualsClause(cs.get_hyperparameter( + # 'network_embedding:__choice__'), 'LearnedEntityEmbedding'), + # ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__'), "OneHotEncoder") + # )) + # break + # except ValueError: + # # change the default and try again + # try: + # default = possible_default_encoders.pop() + # except IndexError: + # raise ValueError("Cannot find a legal default configuration") + # cs.get_hyperparameter('encoder:__choice__').default_value = default # Disable CyclicLR until todo is completed. diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py new file mode 100644 index 000000000..09363983c --- /dev/null +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py @@ -0,0 +1,81 @@ +from typing import Any, Dict, List, Optional, Union + +from ConfigSpace.configuration_space import ConfigurationSpace +from ConfigSpace.hyperparameters import ( + UniformIntegerHyperparameter, +) + +import numpy as np + + +from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import \ + autoPyTorchTabularPreprocessingComponent +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, ispandas + + +class ColumnSplitter(autoPyTorchTabularPreprocessingComponent): + """ + Removes features that have the same value in the training data. + """ + def __init__( + self, + min_categories_for_embedding: float = 5, + random_state: Optional[np.random.RandomState] = None + ): + self.min_categories_for_embedding = min_categories_for_embedding + + self.special_feature_types = dict(encode_columns=[], embed_columns=[]) + self.num_categories_per_col: Optional[List] = None + super().__init__() + + def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> 'ColumnSplitter': + + self.check_requirements(X, y) + + if len(X['dataset_properties']['categorical_columns']) > 0: + self.num_categories_per_col = [] + for categories_per_column, column in zip(X['dataset_properties']['num_categories_per_col'], X['dataset_properties']['categorical_columns']): + if ( + categories_per_column >= self.min_categories_for_embedding + ): + self.special_feature_types['embed_columns'].append(column) + # we only care about the categories for columns to be embedded + self.num_categories_per_col.append(categories_per_column) + else: + self.special_feature_types['encode_columns'].append(column) + + return self + + def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: + if self.num_categories_per_col is not None: + # update such that only n categories for embedding columns is passed + X['dataset_properties']['num_categories_per_col'] = self.num_categories_per_col + X.update(self.special_feature_types) + return X + + @staticmethod + def get_properties( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None + ) -> Dict[str, Union[str, bool]]: + + return { + 'shortname': 'ColumnSplitter', + 'name': 'Column Splitter', + 'handles_sparse': False, + } + + @staticmethod + def get_hyperparameter_search_space( + dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, + min_categories_for_embedding: HyperparameterSearchSpace = HyperparameterSearchSpace( + 
hyperparameter="min_categories_for_embedding", + value_range=(3, 7), + default_value=3, + log=True), + ) -> ConfigurationSpace: + cs = ConfigurationSpace() + + add_hyperparameter(cs, min_categories_for_embedding, UniformIntegerHyperparameter) + + return cs diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py index 5c9281891..80cf3f748 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py @@ -22,8 +22,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEncoder: self.preprocessor['categorical'] = OHE( # It is safer to have the OHE produce a 0 array than to crash a good configuration - categories=X['dataset_properties']['categories'] - if len(X['dataset_properties']['categories']) > 0 else 'auto', + categories='auto', sparse=False, handle_unknown='ignore') return self diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py index eadc0a188..6ff913ae9 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py @@ -14,7 +14,7 @@ def __init__(self) -> None: super().__init__() self.add_fit_requirements([ FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True), - FitRequirement('categories', (List,), user_defined=True, dataset_property=True)]) + ]) def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """ diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py index 597f14ca6..5b60ff4ed 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py @@ -40,7 +40,10 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X['X_train'] = preprocess(dataset=X_train, transforms=transforms) # We need to also save the preprocess transforms for inference - X.update({'preprocess_transforms': transforms}) + X.update({ + 'preprocess_transforms': transforms, + 'shape_after_preprocessing': X['X_train'].shape[1:] + }) return X @staticmethod diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py index 59035869e..ce6b930d4 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py @@ -19,7 +19,6 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None) -> None super(EarlyPreprocessing, self).__init__() self.random_state = random_state self.add_fit_requirements([ - FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True), FitRequirement('X_train', (pd.DataFrame, ), user_defined=True, dataset_property=False), FitRequirement('feature_names', (tuple,), user_defined=True, dataset_property=True), @@ -44,14 +43,13 @@ def transform(self, X: 
Dict[str, Any]) -> Dict[str, Any]: """ transforms = get_preprocess_transforms(X) - if X['dataset_properties']['is_small_preprocess']: - if 'X_train' in X: - X_train = X['X_train'] - else: - # Incorporate the transform to the dataset - X_train = X['backend'].load_datamanager().train_tensors[0] + if 'X_train' in X: + X_train = X['X_train'] + else: + # Incorporate the transform to the dataset + X_train = X['backend'].load_datamanager().train_tensors[0] - X['X_train'] = time_series_preprocess(dataset=X_train, transforms=transforms) + X['X_train'] = time_series_preprocess(dataset=X_train, transforms=transforms) feature_names = X['dataset_properties']['feature_names'] numerical_columns = X['dataset_properties']['numerical_columns'] @@ -65,7 +63,10 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X['dataset_properties']['feature_names'] = tuple(new_feature_names) # We need to also save the preprocess transforms for inference - X.update({'preprocess_transforms': transforms}) + X.update({ + 'preprocess_transforms': transforms, + 'shape_after_preprocessing': X['X_train'].shape[1:] + }) return X @staticmethod @@ -90,14 +91,13 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None) -> None def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: # TODO consider inverse transformation transforms = get_preprocess_transforms(X, preprocess_type=autoPyTorchTargetPreprocessingComponent) - if X['dataset_properties']['is_small_preprocess']: - if 'y_train' in X: - y_train = X['y_train'] - else: - # Incorporate the transform to the dataset - y_train = X['backend'].load_datamanager().train_tensors[1] - - X['y_train'] = time_series_preprocess(dataset=y_train, transforms=transforms) + if 'y_train' in X: + y_train = X['y_train'] + else: + # Incorporate the transform to the dataset + y_train = X['backend'].load_datamanager().train_tensors[1] + + X['y_train'] = time_series_preprocess(dataset=y_train, transforms=transforms) # We need to also save the preprocess transforms for inference X.update({'preprocess_target_transforms': transforms}) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py index 3e2de1887..ae8ea57e7 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py @@ -27,6 +27,8 @@ def get_output_shape(network: torch.nn.Module, input_shape: Tuple[int, ...], has the network will return a Tuple, we will then only consider the first item :return: output_shape """ + # as we are using nn embedding, 2 is a safe upper limit as 3 + # is the lowest `min_values_for_embedding` can be placeholder = torch.randint(high=2, size=(2, *input_shape), dtype=torch.float) with torch.no_grad(): if has_hidden_states: diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py index e43b8f093..98cbc73e5 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py @@ -4,13 +4,12 @@ from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( UniformFloatHyperparameter, - UniformIntegerHyperparameter ) import numpy as np import torch -from torch import nn +from torch import embedding, nn from autoPyTorch.datasets.base_dataset import 
BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.network_embedding.base_network_embedding import NetworkEmbeddingComponent @@ -20,34 +19,28 @@ class _LearnedEntityEmbedding(nn.Module): """ Learned entity embedding module for categorical features""" - def __init__(self, config: Dict[str, Any], num_categories_per_col: np.ndarray, num_numerical_features: int): + def __init__(self, config: Dict[str, Any], num_categories_per_col: np.ndarray, num_features_excl_embed: int): """ Args: config (Dict[str, Any]): The configuration sampled by the hyperparameter optimizer num_input_features (np.ndarray): column wise information of number of output columns after transformation for each categorical column and 0 for numerical columns - num_numerical_features (int): number of numerical features in X + num_features_excl_embed (int): number of features in X excluding the features that need to be embedded """ super().__init__() self.config = config - - self.num_numerical = num_numerical_features # list of number of categories of categorical data # or 0 for numerical data self.num_categories_per_col = num_categories_per_col - categorical_features = self.num_categories_per_col > 0 + self.embed_features = self.num_categories_per_col > 0 - self.num_categorical_features = self.num_categories_per_col[categorical_features] + self.num_embed_features = self.num_categories_per_col[self.embed_features] - self.embed_features = [num_in >= config["min_unique_values_for_embedding"] for num_in in - self.num_categories_per_col] - self.num_output_dimensions = [0] * num_numerical_features + self.num_output_dimensions = [1] * num_features_excl_embed self.num_output_dimensions.extend([ceil(config["dimension_reduction_" + str(i)] * num_in) for i, num_in in - enumerate(self.num_categorical_features)]) + enumerate(self.num_embed_features)]) - self.num_output_dimensions = [num_out if embed else 1 for num_out, embed in - zip(self.num_output_dimensions, self.embed_features)] - self.num_out_feats = self.num_numerical + sum(self.num_output_dimensions) + self.num_out_feats = num_features_excl_embed + sum(self.num_output_dimensions) self.ee_layers = self._create_ee_layers() @@ -150,33 +143,33 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None, **kwarg super().__init__(random_state=random_state) self.config = kwargs - def build_embedding(self, - num_categories_per_col: np.ndarray, - num_numerical_features: int) -> Tuple[nn.Module, List[int]]: + def build_embedding(self, num_categories_per_col: np.ndarray, num_features_excl_embed: int) -> nn.Module: embedding = _LearnedEntityEmbedding(config=self.config, - num_categories_per_col=num_categories_per_col, - num_numerical_features=num_numerical_features) + num_categories_per_col=num_categories_per_col, + num_features_excl_embed=num_features_excl_embed) + return embedding, embedding.num_output_dimensions @staticmethod def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - min_unique_values_for_embedding: HyperparameterSearchSpace = HyperparameterSearchSpace( - hyperparameter="min_unique_values_for_embedding", - value_range=(3, 7), - default_value=5, - log=True), - dimension_reduction: HyperparameterSearchSpace = HyperparameterSearchSpace( - hyperparameter="dimension_reduction", - value_range=(0, 1), - default_value=0.5), + dimension_reduction: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dimension_reduction", + value_range=(0, 1), + default_value=0.5), ) -> 
ConfigurationSpace: cs = ConfigurationSpace() - add_hyperparameter(cs, min_unique_values_for_embedding, UniformIntegerHyperparameter) if dataset_properties is not None: for i in range(len(dataset_properties['categorical_columns']) if isinstance(dataset_properties['categorical_columns'], List) else 0): + # currently as we dont have information about the embedding columns + # we search for more dimensions than necessary. This can be solved by + # not having `min_unique_values_for_embedding` as a hyperparameter and + # instead passing it as a parameter to the feature validator, which + # allows us to pass embed_columns to the dataset properties. + # TODO: test the trade off + # Another solution is to combine `OneHotEncoding`, `Embedding` and `NoEncoding` in one custom transformer. + # this will also allow users to use this transformer outside the pipeline ee_dimensions_search_space = HyperparameterSearchSpace(hyperparameter="dimension_reduction_" + str(i), value_range=dimension_reduction.value_range, default_value=dimension_reduction.default_value, diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py index 741dfad9d..abe76fdec 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/NoEmbedding.py @@ -27,15 +27,10 @@ class NoEmbedding(NetworkEmbeddingComponent): def __init__(self, random_state: Optional[np.random.RandomState] = None): super().__init__(random_state=random_state) -<<<<<<< HEAD def build_embedding(self, - num_input_features: np.ndarray, - num_numerical_features: int) -> Tuple[nn.Module, Optional[List[int]]]: + num_categories_per_col: np.ndarray, + num_features_excl_embed: int) -> Tuple[nn.Module, Optional[List[int]]]: return _NoEmbedding(), None -======= - def build_embedding(self, num_categories_per_col: np.ndarray, num_numerical_features: int) -> nn.Module: - return _NoEmbedding() ->>>>>>> have working embedding from pytroch @staticmethod def get_hyperparameter_search_space( diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index 776464862..36ffee9c1 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -1,5 +1,4 @@ -import copy -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple import numpy as np @@ -8,30 +7,35 @@ from torch import nn from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent +from autoPyTorch.utils.common import FitRequirement class NetworkEmbeddingComponent(autoPyTorchSetupComponent): - def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None): - super().__init__() + def __init__(self, random_state: Optional[np.random.RandomState] = None): + super().__init__(random_state=random_state) + self.add_fit_requirements([ + FitRequirement('num_categories_per_col', (List,), user_defined=True, dataset_property=True), + FitRequirement('shape_after_preprocessing', (Tuple), user_defined=False, dataset_property=False)]) + self.embedding: Optional[nn.Module] = None self.random_state = random_state self.feature_shapes: Dict[str, int] = {} def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: - 
num_numerical_columns, num_categories_per_col = self._get_args(X) + num_features_excl_embed, num_categories_per_col = self._get_required_info_from_data(X) self.embedding, num_output_features = self.build_embedding( num_categories_per_col=num_categories_per_col, - num_numerical_features=num_numerical_columns + num_features_excl_embed=num_features_excl_embed ) if "feature_shapes" in X['dataset_properties']: if num_output_features is not None: feature_shapes = X['dataset_properties']['feature_shapes'] # forecasting tasks feature_names = X['dataset_properties']['feature_names'] - for idx_cat, n_output_cat in enumerate(num_output_features[num_numerical_columns:]): - cat_feature_name = feature_names[idx_cat + num_numerical_columns] + for idx_cat, n_output_cat in enumerate(num_output_features[num_features_excl_embed:]): + cat_feature_name = feature_names[idx_cat + num_features_excl_embed] feature_shapes[cat_feature_name] = n_output_cat self.feature_shapes = feature_shapes else: @@ -49,32 +53,33 @@ def build_embedding(self, num_numerical_features: int) -> Tuple[nn.Module, Optional[List[int]]]: raise NotImplementedError - def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]: - # Feature preprocessors can alter numerical columns - if len(X['dataset_properties']['numerical_columns']) == 0: - num_numerical_columns = 0 - else: - X_train = copy.deepcopy(X['backend'].load_datamanager().train_tensors[0][:2]) - - if 'tabular_transformer' in X: - numerical_column_transformer = X['tabular_transformer'].preprocessor. \ - named_transformers_['numerical_pipeline'] - elif 'time_series_feature_transformer' in X: - numerical_column_transformer = X['time_series_feature_transformer'].preprocessor. \ - named_transformers_['numerical_pipeline'] - else: - raise ValueError("Either a tabular or time_series transformer must be contained!") - if hasattr(X_train, 'iloc'): - num_numerical_columns = numerical_column_transformer.transform( - X_train.iloc[:, X['dataset_properties']['numerical_columns']]).shape[1] - else: - num_numerical_columns = numerical_column_transformer.transform( - X_train[:, X['dataset_properties']['numerical_columns']]).shape[1] - num_cols = num_numerical_columns + len(X['dataset_properties']['categorical_columns']) - num_categories_per_col = np.zeros(num_cols, dtype=np.int32) + def _get_required_info_from_data(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]: + """ + Returns the number of numerical columns after preprocessing and + an array of size equal to the number of input features + containing zeros for numerical data and number of categories + for categorical data. This is required to build the embedding. 
+ + Args: + X (Dict[str, Any]): + Fit dictionary + + Returns: + Tuple[int, np.ndarray]: + number of numerical columns and array indicating + number of categories for categorical columns and + 0 for numerical columns + """ + num_cols = X['shape_after_preprocessing'] + # only works for 2D(rows, features) tabular data + num_features_excl_embed = num_cols[0] - len(X['embed_columns']) + + num_categories_per_col = np.zeros(num_cols, dtype=np.int16) + + categories_per_embed_col = X['dataset_properties']['num_categories_per_col'] - categories = X['dataset_properties']['categories'] - for idx, cats in enumerate(categories, start=num_numerical_columns): - num_categories_per_col[idx] = len(cats) + # only fill num categories for embedding columns + for idx, cats in enumerate(categories_per_embed_col, start=num_features_excl_embed): + num_categories_per_col[idx] = cats - return num_numerical_columns, num_categories_per_col + return num_features_excl_embed, num_categories_per_col diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py index 09eb47485..91fc1bb5b 100644 --- a/autoPyTorch/pipeline/tabular_classification.py +++ b/autoPyTorch/pipeline/tabular_classification.py @@ -17,6 +17,9 @@ from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import ( TabularColumnTransformer ) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.column_splitting.ColumnSplitter import ( + ColumnSplitter +) from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import ( CoalescerChoice ) @@ -132,21 +135,24 @@ def __init__( # model, so we comply with https://pytorch.org/docs/stable/notes/randomness.html torch.manual_seed(self.random_state.get_state()[1][0]) - def _predict_proba(self, X: np.ndarray) -> np.ndarray: - # Pre-process X - loader = self.named_steps['data_loader'].get_loader(X=X) - pred = self.named_steps['network'].predict(loader) - if isinstance(self.dataset_properties['output_shape'], int): - # The final layer is always softmax now (`pred` already gives pseudo proba) - return pred - else: - raise ValueError("Expected output_shape to be integer, got {}," - "Tabular Classification only supports 'binary' and 'multiclass' outputs" - "got {}".format(type(self.dataset_properties['output_shape']), - self.dataset_properties['output_type'])) + def predict(self, X: np.ndarray, batch_size: Optional[int] = None) -> np.ndarray: + """Predict the output using the selected model. + + Args: + X (np.ndarray): input data to the array + batch_size (Optional[int]): batch_size controls whether the pipeline will be + called on small chunks of the data. Useful when calling the + predict method on the whole array X results in a MemoryError. + + Returns: + np.ndarray: the predicted values given input X + """ + probas = super().predict(X=X, batch_size=batch_size) + return np.argmax(probas, axis=1) + def predict_proba(self, X: np.ndarray, batch_size: Optional[int] = None) -> np.ndarray: - """predict_proba. + """predict probabilities. Args: X (np.ndarray): @@ -160,30 +166,19 @@ def predict_proba(self, X: np.ndarray, batch_size: Optional[int] = None) -> np.n Probabilities of the target being certain class """ if batch_size is None: - y = self._predict_proba(X) - + warnings.warn("Batch size not provided. 
" + "Will predict on the whole data in a single iteration") + batch_size = X.shape[0] + loader = self.named_steps['data_loader'].get_loader(X=X, batch_size=batch_size) + pred = self.named_steps['network'].predict(loader) + if isinstance(self.dataset_properties['output_shape'], int): + # The final layer is always softmax now (`pred` already gives pseudo proba) + return pred else: - if not isinstance(batch_size, int): - raise ValueError("Argument 'batch_size' must be of type int, " - "but is '%s'" % type(batch_size)) - if batch_size <= 0: - raise ValueError("Argument 'batch_size' must be positive, " - "but is %d" % batch_size) - - else: - # Probe for the target array dimensions - target = self.predict_proba(X[0:2].copy()) - - y = np.zeros((X.shape[0], target.shape[1]), - dtype=np.float32) - - for k in range(max(1, int(np.ceil(float(X.shape[0]) / batch_size)))): - batch_from = k * batch_size - batch_to = min([(k + 1) * batch_size, X.shape[0]]) - pred_prob = self.predict_proba(X[batch_from:batch_to], batch_size=None) - y[batch_from:batch_to] = pred_prob.astype(np.float32) - - return y + raise ValueError("Expected output_shape to be integer, got {}," + "Tabular Classification only supports 'binary' and 'multiclass' outputs" + "got {}".format(type(self.dataset_properties['output_shape']), + self.dataset_properties['output_type'])) def score(self, X: np.ndarray, y: np.ndarray, batch_size: Optional[int] = None, @@ -207,7 +202,7 @@ def score(self, X: np.ndarray, y: np.ndarray, """ from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics, calculate_score metrics = get_metrics(self.dataset_properties, [metric_name]) - y_pred = self.predict(X, batch_size=batch_size) + y_pred = self.predict_proba(X, batch_size=batch_size) score = calculate_score(y, y_pred, task_type=STRING_TO_TASK_TYPES[str(self.dataset_properties['task_type'])], metrics=metrics)[metric_name] return score @@ -286,8 +281,9 @@ def _get_pipeline_steps( steps.extend([ ("imputer", SimpleImputer(random_state=self.random_state)), - ("variance_threshold", VarianceThreshold(random_state=self.random_state)), - ("coalescer", CoalescerChoice(default_dataset_properties, random_state=self.random_state)), + # ("variance_threshold", VarianceThreshold(random_state=self.random_state)), + # ("coalescer", CoalescerChoice(default_dataset_properties, random_state=self.random_state)), + ("column_splitter", ColumnSplitter(random_state=self.random_state)), ("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)), ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)), ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties, From cf4fd98da471c777a49acb40b0297a34da440d1f Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 31 Mar 2022 14:44:07 +0200 Subject: [PATCH 06/22] cleanup unwanted changes --- autoPyTorch/api/tabular_classification.py | 2 -- autoPyTorch/pipeline/base_pipeline.py | 27 ----------------------- 2 files changed, 29 deletions(-) diff --git a/autoPyTorch/api/tabular_classification.py b/autoPyTorch/api/tabular_classification.py index 358792161..aa6796ae2 100644 --- a/autoPyTorch/api/tabular_classification.py +++ b/autoPyTorch/api/tabular_classification.py @@ -447,8 +447,6 @@ def search( dataset_compression=self._dataset_compression, feat_types=feat_types) - # import sys - # sys.exit(0) return self._search( dataset=self.dataset, optimize_metric=optimize_metric, diff --git a/autoPyTorch/pipeline/base_pipeline.py 
b/autoPyTorch/pipeline/base_pipeline.py index aa832ea48..b3c2e4158 100644 --- a/autoPyTorch/pipeline/base_pipeline.py +++ b/autoPyTorch/pipeline/base_pipeline.py @@ -314,33 +314,6 @@ def _add_forbidden_conditions(self, cs: ConfigurationSpace) -> ConfigurationSpac """ - # # Learned Entity Embedding is only valid when encoder is one hot encoder - # if 'network_embedding' in self.named_steps.keys() and 'encoder' in self.named_steps.keys(): - # embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices - # if 'LearnedEntityEmbedding' in embeddings: - # encoders = cs.get_hyperparameter('encoder:__choice__').choices - - # if 'OneHotEncoder' in encoders: - # possible_default_encoders = copy(list(encoders)) - # del possible_default_encoders[possible_default_encoders.index('OneHotEncoder')] - - # while True: - # try: - # cs.add_forbidden_clause(ForbiddenAndConjunction( - # ForbiddenEqualsClause(cs.get_hyperparameter( - # 'network_embedding:__choice__'), 'LearnedEntityEmbedding'), - # ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__'), "OneHotEncoder") - # )) - # break - # except ValueError: - # # change the default and try again - # try: - # default = possible_default_encoders.pop() - # except IndexError: - # raise ValueError("Cannot find a legal default configuration") - # cs.get_hyperparameter('encoder:__choice__').default_value = default - - # Disable CyclicLR until todo is completed. if 'lr_scheduler' in self.named_steps.keys() and 'trainer' in self.named_steps.keys(): trainers = cs.get_hyperparameter('trainer:__choice__').choices From af41dd7687c27a25af77012dcb7fd416fb32c25c Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 31 Mar 2022 15:58:07 +0200 Subject: [PATCH 07/22] use shape after preprocessing in base network backbone --- .../setup/network_backbone/base_network_backbone.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py index ef3cc1768..f63ebd578 100644 --- a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py +++ b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py @@ -30,8 +30,7 @@ def __init__(self, self.add_fit_requirements([ FitRequirement('X_train', (np.ndarray, pd.DataFrame, spmatrix), user_defined=True, dataset_property=False), - FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True), - FitRequirement('tabular_transformer', (BaseEstimator,), user_defined=False, dataset_property=False), + FitRequirement('shape_after_preprocessing', (Iterable,), user_defined=False, dataset_property=False), FitRequirement('network_embedding', (nn.Module,), user_defined=False, dataset_property=False) ]) self.backbone: nn.Module = None @@ -49,9 +48,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator: Self """ self.check_requirements(X, y) - X_train = X['X_train'] - input_shape = X_train.shape[1:] + input_shape = X['shape_after_preprocessing'] input_shape = get_output_shape(X['network_embedding'], input_shape=input_shape) self.input_shape = input_shape From 9706875989690fad9f0ff3ae9f15fbc80d9002d8 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 5 Apr 2022 19:21:57 +0200 Subject: [PATCH 08/22] remove redundant call to load datamanager --- autoPyTorch/pipeline/components/training/trainer/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/autoPyTorch/pipeline/components/training/trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/__init__.py index b70467837..8c82e241c 100755 --- a/autoPyTorch/pipeline/components/training/trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/__init__.py @@ -492,7 +492,7 @@ def _get_train_label(self, X: Dict[str, Any]) -> List[int]: Verifies and validates the labels from train split. """ # Ensure that the split is not missing any class. - labels: List[int] = X['y_train'][X['backend'].load_datamanager().splits[X['split_id']][0]] + labels: List[int] = X['y_train'][X['train_indices']] if STRING_TO_TASK_TYPES[X['dataset_properties']['task_type']] in CLASSIFICATION_TASKS: unique_labels = len(np.unique(labels)) if unique_labels < X['dataset_properties']['output_shape']: From def144c716607442d68e5dfdab7b114ce58b617b Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 11 Apr 2022 13:21:36 +0200 Subject: [PATCH 09/22] add init file for column splitting --- .../tabular_preprocessing/column_splitting/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/__init__.py diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/__init__.py new file mode 100644 index 000000000..e69de29bb From 926a7576c84fbd752873274082268fbc81fd92d9 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 14 Jun 2022 21:34:18 +0200 Subject: [PATCH 10/22] fix tests --- autoPyTorch/api/base_task.py | 1 + autoPyTorch/evaluation/train_evaluator.py | 36 +++++++++---------- .../coalescer/base_coalescer.py | 1 - .../feature_preprocessing/utils.py | 14 ++++---- .../components/training/trainer/__init__.py | 2 +- autoPyTorch/pipeline/tabular_regression.py | 8 +++-- 6 files changed, 32 insertions(+), 30 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 8618731f5..57614407d 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -256,6 +256,7 @@ def __init__( self.input_validator: Optional[BaseInputValidator] = None self.search_space_updates = search_space_updates + if search_space_updates is not None: if not isinstance(self.search_space_updates, HyperparameterSearchSpaceUpdates): diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py index 67185ec2a..31e2c324b 100644 --- a/autoPyTorch/evaluation/train_evaluator.py +++ b/autoPyTorch/evaluation/train_evaluator.py @@ -196,24 +196,24 @@ def fit_predict_and_loss(self) -> None: additional_run_info = pipeline.get_additional_run_info() if hasattr( pipeline, 'get_additional_run_info') else {} - # add learning curve of configurations to additional_run_info - if isinstance(pipeline, TabularClassificationPipeline): - if hasattr(pipeline.named_steps['trainer'], 'run_summary'): - run_summary = pipeline.named_steps['trainer'].run_summary - split_types = ['train', 'val', 'test'] - run_summary_dict = dict( - run_summary={}, - budget=self.budget, - seed=self.seed, - config_id=self.configuration.config_id, - num_run=self.num_run - ) - for split_type in split_types: - run_summary_dict['run_summary'][f'{split_type}_loss'] = run_summary.performance_tracker.get(f'{split_type}_loss', None) - run_summary_dict['run_summary'][f'{split_type}_metrics'] = run_summary.performance_tracker.get(f'{split_type}_metrics', None) 
- self.logger.debug(f"run_summary_dict {json.dumps(run_summary_dict)}") - with open(os.path.join(self.backend.temporary_directory, 'run_summary.txt'), 'a') as file: - file.write(f"{json.dumps(run_summary_dict)}\n") + # # add learning curve of configurations to additional_run_info + # if isinstance(pipeline, TabularClassificationPipeline): + # if hasattr(pipeline.named_steps['trainer'], 'run_summary'): + # run_summary = pipeline.named_steps['trainer'].run_summary + # split_types = ['train', 'val', 'test'] + # run_summary_dict = dict( + # run_summary={}, + # budget=self.budget, + # seed=self.seed, + # config_id=self.configuration.config_id, + # num_run=self.num_run + # ) + # for split_type in split_types: + # run_summary_dict['run_summary'][f'{split_type}_loss'] = run_summary.performance_tracker.get(f'{split_type}_loss', None) + # run_summary_dict['run_summary'][f'{split_type}_metrics'] = run_summary.performance_tracker.get(f'{split_type}_metrics', None) + # self.logger.debug(f"run_summary_dict {json.dumps(run_summary_dict)}") + # with open(os.path.join(self.backend.temporary_directory, 'run_summary.txt'), 'a') as file: + # file.write(f"{json.dumps(run_summary_dict)}\n") status = StatusType.SUCCESS diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py index b572f8343..59918f62c 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/coalescer/base_coalescer.py @@ -12,7 +12,6 @@ def __init__(self) -> None: self._processing = True self.add_fit_requirements([ FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True), - FitRequirement('categories', (List,), user_defined=True, dataset_property=True) ]) def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/utils.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/utils.py index a8c57959e..c7485835a 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/utils.py @@ -82,17 +82,15 @@ def percentage_value_range_to_integer_range( else: log = hyperparameter_search_space.log - min_hyperparameter_value = hyperparameter_search_space.value_range[0] - if len(hyperparameter_search_space.value_range) > 1: - max_hyperparameter_value = hyperparameter_search_space.value_range[1] - else: - max_hyperparameter_value = hyperparameter_search_space.value_range[0] + value_range = ( + floor(float(hyperparameter_search_space.value_range[0]) * n_features), + floor(float(hyperparameter_search_space.value_range[-1]) * n_features)) \ + if len(hyperparameter_search_space.value_range) == 2 else \ + (floor(float(hyperparameter_search_space.value_range[0]) * n_features),) hyperparameter_search_space = HyperparameterSearchSpace( hyperparameter=hyperparameter_name, - value_range=( - floor(float(min_hyperparameter_value) * n_features), - floor(float(max_hyperparameter_value) * n_features)), + value_range=value_range, default_value=ceil(float(hyperparameter_search_space.default_value) * n_features), log=log) else: diff --git 
a/autoPyTorch/pipeline/components/training/trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/__init__.py index 8c82e241c..c8f420ac8 100755 --- a/autoPyTorch/pipeline/components/training/trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/__init__.py @@ -453,7 +453,7 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic # change model update_model_state_dict_from_swa(X['network'], self.choice.swa_model.state_dict()) - if self.choice.use_snapshot_ensemble: + if self.choice.use_snapshot_ensemble and len(self.choice.model_snapshots) > 0: # we update only the last network which pertains to the stochastic weight averaging model swa_utils.update_bn(X['train_data_loader'], self.choice.model_snapshots[-1].double()) diff --git a/autoPyTorch/pipeline/tabular_regression.py b/autoPyTorch/pipeline/tabular_regression.py index 4cd67bb9f..2dc4af727 100644 --- a/autoPyTorch/pipeline/tabular_regression.py +++ b/autoPyTorch/pipeline/tabular_regression.py @@ -17,6 +17,9 @@ from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import ( TabularColumnTransformer ) +from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.column_splitting.ColumnSplitter import ( + ColumnSplitter +) from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import ( CoalescerChoice ) @@ -234,8 +237,9 @@ def _get_pipeline_steps( steps.extend([ ("imputer", SimpleImputer(random_state=self.random_state)), - ("variance_threshold", VarianceThreshold(random_state=self.random_state)), - ("coalescer", CoalescerChoice(default_dataset_properties, random_state=self.random_state)), + # ("variance_threshold", VarianceThreshold(random_state=self.random_state)), + # ("coalescer", CoalescerChoice(default_dataset_properties, random_state=self.random_state)), + ("column_splitter", ColumnSplitter(random_state=self.random_state)), ("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)), ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)), ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties, From 7567d2688889c02af74b1542e821e1ca6a782e60 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 14 Jun 2022 22:07:25 +0200 Subject: [PATCH 11/22] fix precommit and add test changes --- autoPyTorch/api/base_task.py | 5 --- autoPyTorch/data/tabular_feature_validator.py | 1 - autoPyTorch/evaluation/train_evaluator.py | 43 +++++++++++-------- autoPyTorch/pipeline/base_pipeline.py | 3 +- .../column_splitting/ColumnSplitter.py | 24 ++++++----- .../encoding/base_encoder.py | 3 +- .../feature_preprocessing/utils.py | 13 +++--- .../early_preprocessor/EarlyPreprocessing.py | 6 +-- .../LearnedEntityEmbedding.py | 5 ++- .../base_network_embedding.py | 4 +- .../pipeline/tabular_classification.py | 6 --- autoPyTorch/pipeline/tabular_regression.py | 5 --- test/test_api/test_api.py | 5 ++- test/test_data/test_feature_validator.py | 10 ++--- .../components/setup/test_setup_networks.py | 2 +- .../setup/test_setup_preprocessing_node.py | 39 +---------------- .../components/training/test_training.py | 2 +- .../test_tabular_classification.py | 8 ++-- test/test_pipeline/test_tabular_regression.py | 5 +-- 19 files changed, 72 insertions(+), 117 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 57614407d..274e2a316 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -270,13 
+270,8 @@ def build_pipeline( include_components: Optional[Dict[str, Any]] = None, exclude_components: Optional[Dict[str, Any]] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None - ) -> BasePipeline: """ Build pipeline according to current task - and for the passed dataset properties - - Args: - dataset_properties (Dict[str, Any]): Characteristics of the dataset to guide the pipeline choices of components include_components (Optional[Dict[str, Any]]): diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index ce040f805..5ea892f7d 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -281,7 +281,6 @@ def transform( X = self.numpy_to_pandas(X) if ispandas(X) and not issparse(X): - if self.all_nan_columns is None: raise ValueError('_fit must be called before calling transform') diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py index 31e2c324b..392eee418 100644 --- a/autoPyTorch/evaluation/train_evaluator.py +++ b/autoPyTorch/evaluation/train_evaluator.py @@ -1,6 +1,6 @@ import json -from multiprocessing.queues import Queue import os +from multiprocessing.queues import Queue from typing import Any, Dict, List, Optional, Tuple, Union from ConfigSpace.configuration_space import Configuration @@ -22,6 +22,7 @@ fit_and_suppress_warnings ) from autoPyTorch.evaluation.utils import DisableFileOutputParameters +from autoPyTorch.pipeline.base_pipeline import BasePipeline from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline from autoPyTorch.utils.common import dict_repr, subsampler @@ -196,24 +197,7 @@ def fit_predict_and_loss(self) -> None: additional_run_info = pipeline.get_additional_run_info() if hasattr( pipeline, 'get_additional_run_info') else {} - # # add learning curve of configurations to additional_run_info - # if isinstance(pipeline, TabularClassificationPipeline): - # if hasattr(pipeline.named_steps['trainer'], 'run_summary'): - # run_summary = pipeline.named_steps['trainer'].run_summary - # split_types = ['train', 'val', 'test'] - # run_summary_dict = dict( - # run_summary={}, - # budget=self.budget, - # seed=self.seed, - # config_id=self.configuration.config_id, - # num_run=self.num_run - # ) - # for split_type in split_types: - # run_summary_dict['run_summary'][f'{split_type}_loss'] = run_summary.performance_tracker.get(f'{split_type}_loss', None) - # run_summary_dict['run_summary'][f'{split_type}_metrics'] = run_summary.performance_tracker.get(f'{split_type}_metrics', None) - # self.logger.debug(f"run_summary_dict {json.dumps(run_summary_dict)}") - # with open(os.path.join(self.backend.temporary_directory, 'run_summary.txt'), 'a') as file: - # file.write(f"{json.dumps(run_summary_dict)}\n") + # self._write_run_summary(pipeline) status = StatusType.SUCCESS @@ -370,6 +354,27 @@ def fit_predict_and_loss(self) -> None: status=status, ) + def _write_run_summary(self, pipeline: BasePipeline) -> None: + # add learning curve of configurations to additional_run_info + if isinstance(pipeline, TabularClassificationPipeline): + assert isinstance(self.configuration, Configuration) + if hasattr(pipeline.named_steps['trainer'], 'run_summary'): + run_summary = pipeline.named_steps['trainer'].run_summary + split_types = ['train', 'val', 'test'] + run_summary_dict = dict( + run_summary={}, + 
budget=self.budget, + seed=self.seed, + config_id=self.configuration.config_id, + num_run=self.num_run) + for split_type in split_types: + run_summary_dict['run_summary'][f'{split_type}_loss'] = run_summary.performance_tracker.get( + f'{split_type}_loss', None) + run_summary_dict['run_summary'][f'{split_type}_metrics'] = run_summary.performance_tracker.get( + f'{split_type}_metrics', None) + with open(os.path.join(self.backend.temporary_directory, 'run_summary.txt'), 'a') as file: + file.write(f"{json.dumps(run_summary_dict)}\n") + def _fit_and_predict(self, pipeline: BaseEstimator, fold: int, train_indices: Union[np.ndarray, List], test_indices: Union[np.ndarray, List], add_pipeline_to_self: bool diff --git a/autoPyTorch/pipeline/base_pipeline.py b/autoPyTorch/pipeline/base_pipeline.py index b3c2e4158..3caed7246 100644 --- a/autoPyTorch/pipeline/base_pipeline.py +++ b/autoPyTorch/pipeline/base_pipeline.py @@ -324,7 +324,8 @@ def _add_forbidden_conditions(self, cs: ConfigurationSpace) -> ConfigurationSpac if cyclic_lr_name in available_schedulers: # disable snapshot ensembles and stochastic weight averaging snapshot_ensemble_hyperparameter = cs.get_hyperparameter(f'trainer:{trainer}:use_snapshot_ensemble') - if hasattr(snapshot_ensemble_hyperparameter, 'choices') and True in snapshot_ensemble_hyperparameter.choices: + if hasattr(snapshot_ensemble_hyperparameter, 'choices') and \ + True in snapshot_ensemble_hyperparameter.choices: cs.add_forbidden_clause(ForbiddenAndConjunction( ForbiddenEqualsClause(snapshot_ensemble_hyperparameter, True), ForbiddenEqualsClause(cs.get_hyperparameter('lr_scheduler:__choice__'), cyclic_lr_name) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py index 09363983c..eeca9fdc4 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py @@ -11,7 +11,7 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import \ autoPyTorchTabularPreprocessingComponent -from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, ispandas +from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter class ColumnSplitter(autoPyTorchTabularPreprocessingComponent): @@ -24,8 +24,9 @@ def __init__( random_state: Optional[np.random.RandomState] = None ): self.min_categories_for_embedding = min_categories_for_embedding + self.random_state = random_state - self.special_feature_types = dict(encode_columns=[], embed_columns=[]) + self.special_feature_types: Dict[str, List] = dict(encode_columns=[], embed_columns=[]) self.num_categories_per_col: Optional[List] = None super().__init__() @@ -35,15 +36,16 @@ def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> 'ColumnSplitter': if len(X['dataset_properties']['categorical_columns']) > 0: self.num_categories_per_col = [] - for categories_per_column, column in zip(X['dataset_properties']['num_categories_per_col'], X['dataset_properties']['categorical_columns']): - if ( - categories_per_column >= self.min_categories_for_embedding - ): - self.special_feature_types['embed_columns'].append(column) - # we only care about the categories for columns to be 
embedded - self.num_categories_per_col.append(categories_per_column) - else: - self.special_feature_types['encode_columns'].append(column) + for categories_per_column, column in zip(X['dataset_properties']['num_categories_per_col'], + X['dataset_properties']['categorical_columns']): + if ( + categories_per_column >= self.min_categories_for_embedding + ): + self.special_feature_types['embed_columns'].append(column) + # we only care about the categories for columns to be embedded + self.num_categories_per_col.append(categories_per_column) + else: + self.special_feature_types['encode_columns'].append(column) return self diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py index 6ff913ae9..0a2486420 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py @@ -13,8 +13,7 @@ class BaseEncoder(autoPyTorchTabularPreprocessingComponent): def __init__(self) -> None: super().__init__() self.add_fit_requirements([ - FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True), - ]) + FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True), ]) def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """ diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/utils.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/utils.py index c7485835a..1968e9f3e 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/utils.py @@ -1,6 +1,6 @@ import warnings from math import ceil, floor -from typing import Dict, List, Optional, Sequence +from typing import Dict, List, Optional, Sequence, Tuple from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.utils.common import HyperparameterSearchSpace, HyperparameterValueType @@ -82,11 +82,12 @@ def percentage_value_range_to_integer_range( else: log = hyperparameter_search_space.log - value_range = ( - floor(float(hyperparameter_search_space.value_range[0]) * n_features), - floor(float(hyperparameter_search_space.value_range[-1]) * n_features)) \ - if len(hyperparameter_search_space.value_range) == 2 else \ - (floor(float(hyperparameter_search_space.value_range[0]) * n_features),) + value_range: Tuple + if len(hyperparameter_search_space.value_range) == 2: + value_range = (floor(float(hyperparameter_search_space.value_range[0]) * n_features), + floor(float(hyperparameter_search_space.value_range[-1]) * n_features)) + else: + value_range = (floor(float(hyperparameter_search_space.value_range[0]) * n_features),) hyperparameter_search_space = HyperparameterSearchSpace( hyperparameter=hyperparameter_name, diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py index 5b60ff4ed..486ce2ef7 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py @@ -41,9 +41,9 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: # We 
need to also save the preprocess transforms for inference X.update({ - 'preprocess_transforms': transforms, - 'shape_after_preprocessing': X['X_train'].shape[1:] - }) + 'preprocess_transforms': transforms, + 'shape_after_preprocessing': X['X_train'].shape[1:] + }) return X @staticmethod diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py index 98cbc73e5..0fb67da71 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py @@ -168,8 +168,9 @@ def get_hyperparameter_search_space( # instead passing it as a parameter to the feature validator, which # allows us to pass embed_columns to the dataset properties. # TODO: test the trade off - # Another solution is to combine `OneHotEncoding`, `Embedding` and `NoEncoding` in one custom transformer. - # this will also allow users to use this transformer outside the pipeline + # Another solution is to combine `OneHotEncoding`, `Embedding` and `NoEncoding` + # in one custom transformer. this will also allow users to use this transformer + # outside the pipeline ee_dimensions_search_space = HyperparameterSearchSpace(hyperparameter="dimension_reduction_" + str(i), value_range=dimension_reduction.value_range, default_value=dimension_reduction.default_value, diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py index 36ffee9c1..0f3a95439 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/base_network_embedding.py @@ -15,7 +15,7 @@ def __init__(self, random_state: Optional[np.random.RandomState] = None): super().__init__(random_state=random_state) self.add_fit_requirements([ FitRequirement('num_categories_per_col', (List,), user_defined=True, dataset_property=True), - FitRequirement('shape_after_preprocessing', (Tuple), user_defined=False, dataset_property=False)]) + FitRequirement('shape_after_preprocessing', (Tuple[int],), user_defined=False, dataset_property=False)]) self.embedding: Optional[nn.Module] = None self.random_state = random_state @@ -73,7 +73,7 @@ def _get_required_info_from_data(self, X: Dict[str, Any]) -> Tuple[int, np.ndarr num_cols = X['shape_after_preprocessing'] # only works for 2D(rows, features) tabular data num_features_excl_embed = num_cols[0] - len(X['embed_columns']) - + num_categories_per_col = np.zeros(num_cols, dtype=np.int16) categories_per_embed_col = X['dataset_properties']['num_categories_per_col'] diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py index 91fc1bb5b..df9963834 100644 --- a/autoPyTorch/pipeline/tabular_classification.py +++ b/autoPyTorch/pipeline/tabular_classification.py @@ -20,9 +20,6 @@ from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.column_splitting.ColumnSplitter import ( ColumnSplitter ) -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import ( - CoalescerChoice -) from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import ( EncoderChoice ) @@ -31,8 +28,6 @@ ) from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer from 
autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. \ - VarianceThreshold import VarianceThreshold from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent @@ -150,7 +145,6 @@ def predict(self, X: np.ndarray, batch_size: Optional[int] = None) -> np.ndarray probas = super().predict(X=X, batch_size=batch_size) return np.argmax(probas, axis=1) - def predict_proba(self, X: np.ndarray, batch_size: Optional[int] = None) -> np.ndarray: """predict probabilities. diff --git a/autoPyTorch/pipeline/tabular_regression.py b/autoPyTorch/pipeline/tabular_regression.py index 2dc4af727..0fdad6671 100644 --- a/autoPyTorch/pipeline/tabular_regression.py +++ b/autoPyTorch/pipeline/tabular_regression.py @@ -20,9 +20,6 @@ from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.column_splitting.ColumnSplitter import ( ColumnSplitter ) -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.coalescer import ( - CoalescerChoice -) from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import ( EncoderChoice ) @@ -31,8 +28,6 @@ ) from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice -from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. \ - VarianceThreshold import VarianceThreshold from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index 12b12c3ad..3e8847110 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -4,6 +4,7 @@ import pickle import tempfile import unittest +import unittest.mock from test.test_api.utils import ( dummy_do_dummy_prediction, dummy_eval_train_function, @@ -681,6 +682,7 @@ def test_do_dummy_prediction(dask_client, fit_dictionary_tabular): del estimator +@pytest.skip("Fix with new portfolio PR") @unittest.mock.patch('autoPyTorch.evaluation.tae.eval_train_function', new=dummy_eval_train_function) @pytest.mark.parametrize('openml_id', (40981, )) @@ -723,6 +725,7 @@ def test_portfolio_selection(openml_id, backend, n_samples): assert any(successful_config in portfolio_configs for successful_config in successful_configs) +@pytest.skip("Fix with new portfolio PR") @unittest.mock.patch('autoPyTorch.evaluation.tae.eval_train_function', new=dummy_eval_train_function) @pytest.mark.parametrize('openml_id', (40981, )) @@ -871,7 +874,7 @@ def test_pipeline_fit(openml_id, configuration = estimator.get_search_space(dataset).get_default_configuration() pipeline, run_info, run_value, dataset = estimator.fit_pipeline(dataset=dataset, configuration=configuration, - run_time_limit_secs=50, + run_time_limit_secs=70, disable_file_output=disable_file_output, budget_type='epochs', budget=budget diff --git a/test/test_data/test_feature_validator.py b/test/test_data/test_feature_validator.py index 
099ee691f..1ccb91b2f 100644 --- a/test/test_data/test_feature_validator.py +++ b/test/test_data/test_feature_validator.py @@ -288,7 +288,7 @@ def test_features_unsupported_calls_are_raised(): expected """ validator = TabularFeatureValidator() - with pytest.raises(TypeError, match=r"Valid types are `numerical`, `categorical` or `boolean`, but input column"): + with pytest.raises(TypeError, match=r"Valid types are .*"): validator.fit( pd.DataFrame({'datetime': [pd.Timestamp('20180310')]}) ) @@ -298,7 +298,7 @@ def test_features_unsupported_calls_are_raised(): validator.fit({'input1': 1, 'input2': 2}) validator = TabularFeatureValidator() - with pytest.raises(TypeError, match=r"Valid types are `numerical`, `categorical` or `boolean`, but input column"): + with pytest.raises(TypeError, match=r"Valid types are .*"): validator.fit(pd.DataFrame([{'A': 1, 'B': 2}], dtype='string')) validator = TabularFeatureValidator() @@ -430,7 +430,7 @@ def test_unknown_encode_value(): assert expected_row == x_t[0].tolist() # Notice how there is only one column 'c' to encode - assert validator.categories == [list(range(2)) for i in range(1)] + assert validator.num_categories_per_col == [2] # Actual checks for the features @@ -485,13 +485,13 @@ def test_feature_validator_new_data_after_fit( if train_data_type == 'pandas': old_dtypes = copy.deepcopy(validator.dtypes) validator.dtypes = ['dummy' for dtype in X_train.dtypes] - with pytest.raises(ValueError, match=r"The dtype of the features must not be changed after fit()"): + with pytest.raises(ValueError, match=r"The dtype of the features must not be changed after fit.*"): transformed_X = validator.transform(X_test) validator.dtypes = old_dtypes if test_data_type == 'pandas': columns = X_test.columns.tolist() X_test = X_test[reversed(columns)] - with pytest.raises(ValueError, match=r"The column order of the features"): + with pytest.raises(ValueError, match=r"The column order of the features must not be changed after fit.*"): transformed_X = validator.transform(X_test) diff --git a/test/test_pipeline/components/setup/test_setup_networks.py b/test/test_pipeline/components/setup/test_setup_networks.py index f5e9b1bb7..1036f8304 100644 --- a/test/test_pipeline/components/setup/test_setup_networks.py +++ b/test/test_pipeline/components/setup/test_setup_networks.py @@ -20,7 +20,7 @@ def head(request): # TODO: add 'LearnedEntityEmbedding' after preprocessing dix -@pytest.fixture(params=['NoEmbedding']) +@pytest.fixture(params=['NoEmbedding', 'LearnedEntityEmbedding']) def embedding(request): return request.param diff --git a/test/test_pipeline/components/setup/test_setup_preprocessing_node.py b/test/test_pipeline/components/setup/test_setup_preprocessing_node.py index 1ec858864..5d3b49923 100644 --- a/test/test_pipeline/components/setup/test_setup_preprocessing_node.py +++ b/test/test_pipeline/components/setup/test_setup_preprocessing_node.py @@ -37,7 +37,7 @@ def test_tabular_preprocess(self): 'is_small_preprocess': True, 'input_shape': (15,), 'output_shape': 2, - 'categories': [], + 'num_categories_per_col': [], 'issparse': False } X = dict(X_train=np.random.random((10, 15)), @@ -64,43 +64,6 @@ def test_tabular_preprocess(self): # We expect the transformation always for inference self.assertIn('preprocess_transforms', X.keys()) - def test_tabular_no_preprocess(self): - dataset_properties = { - 'numerical_columns': list(range(15)), - 'categorical_columns': [], - 'task_type': TASK_TYPES_TO_STRING[TABULAR_CLASSIFICATION], - 'output_type': 
OUTPUT_TYPES_TO_STRING[MULTICLASS], - 'is_small_preprocess': False, - 'input_shape': (15,), - 'output_shape': 2, - 'categories': [], - 'issparse': False - } - X = dict(X_train=np.random.random((10, 15)), - y_train=np.random.random(10), - train_indices=[0, 1, 2, 3, 4, 5], - val_indices=[6, 7, 8, 9], - dataset_properties=dataset_properties, - # Training configuration - num_run=16, - device='cpu', - budget_type='epochs', - epochs=10, - torch_num_threads=1, - early_stopping=20, - split_id=0, - backend=self.backend, - ) - - pipeline = TabularClassificationPipeline(dataset_properties=dataset_properties) - # Remove the trainer - pipeline.steps.pop() - pipeline = pipeline.fit(X) - X = pipeline.transform(X) - self.assertIn('preprocess_transforms', X.keys()) - self.assertIsInstance(X['preprocess_transforms'], list) - self.assertIsInstance(X['preprocess_transforms'][-1].preprocessor, BaseEstimator) - class ImagePreprocessingTest(unittest.TestCase): def setUp(self): diff --git a/test/test_pipeline/components/training/test_training.py b/test/test_pipeline/components/training/test_training.py index ae85cad4d..dc8e842ce 100644 --- a/test/test_pipeline/components/training/test_training.py +++ b/test/test_pipeline/components/training/test_training.py @@ -513,7 +513,7 @@ def dummy_performance(*args, **kwargs): 'step_interval': StepIntervalUnit.batch } for item in ['backend', 'lr_scheduler', 'network', 'optimizer', 'train_data_loader', 'val_data_loader', - 'device', 'y_train', 'network_snapshots']: + 'device', 'y_train', 'network_snapshots', 'train_indices']: fit_dictionary[item] = unittest.mock.MagicMock() fit_dictionary['backend'].temporary_directory = tempfile.mkdtemp() diff --git a/test/test_pipeline/test_tabular_classification.py b/test/test_pipeline/test_tabular_classification.py index 3e4e3bde5..2b6d34df7 100644 --- a/test/test_pipeline/test_tabular_classification.py +++ b/test/test_pipeline/test_tabular_classification.py @@ -123,8 +123,8 @@ def test_pipeline_predict(self, fit_dictionary_tabular, exclude): pipeline.fit(fit_dictionary_tabular) # we expect the output to have the same batch size as the test input, - # and number of outputs per batch sample equal to the number of outputs - expected_output_shape = (X.shape[0], fit_dictionary_tabular["dataset_properties"]["output_shape"]) + # and number of outputs per batch sample equal to 1 + expected_output_shape = (X.shape[0], ) prediction = pipeline.predict(X) assert isinstance(prediction, np.ndarray) @@ -435,9 +435,9 @@ def test_trainer_cocktails(self, fit_dictionary_tabular, mocker, lr_scheduler, t len(X['network_snapshots']) == config.get(f'trainer:{trainer}:se_lastk') mocker.patch("autoPyTorch.pipeline.components.setup.network.base_network.NetworkComponent._predict", - return_value=torch.Tensor([1])) + return_value=torch.Tensor([[1, 0]])) # Assert that predict gives no error when swa and se are on - assert isinstance(pipeline.predict(fit_dictionary_tabular['X_train']), np.ndarray) + assert isinstance(pipeline.predict(X['X_train']), np.ndarray) # As SE is True, _predict should be called 3 times assert pipeline.named_steps['network']._predict.call_count == 3 diff --git a/test/test_pipeline/test_tabular_regression.py b/test/test_pipeline/test_tabular_regression.py index a2c3b695e..48b6daa5e 100644 --- a/test/test_pipeline/test_tabular_regression.py +++ b/test/test_pipeline/test_tabular_regression.py @@ -152,13 +152,10 @@ def test_pipeline_transform(self, fit_dictionary_tabular): # We expect the transformations to be in the pipeline at anytime for 
inference assert 'preprocess_transforms' in transformed_fit_dictionary_tabular.keys() - @pytest.mark.parametrize("is_small_preprocess", [True, False]) - def test_default_configuration(self, fit_dictionary_tabular, is_small_preprocess): + def test_default_configuration(self, fit_dictionary_tabular): """Makes sure that when no config is set, we can trust the default configuration from the space""" - fit_dictionary_tabular['is_small_preprocess'] = is_small_preprocess - pipeline = TabularRegressionPipeline( dataset_properties=fit_dictionary_tabular['dataset_properties'], exclude={'trainer': ['AdversarialTrainer']}) From 09fdc0dcd9c0d9227bfd2d43a638d61f08e3e3d3 Mon Sep 17 00:00:00 2001 From: Ravin Kohli <13005107+ravinkohli@users.noreply.github.com> Date: Sat, 16 Jul 2022 17:31:36 +0200 Subject: [PATCH 12/22] [ADD] Calculate memory of dataset after one hot encoding (pytorch embedding) (#437) * add updates for apt1.0+reg_cocktails * debug loggers for checking data and network memory usage * add support for pandas, test for data passing, remove debug loggers * remove unwanted changes * : * Adjust formula to account for embedding columns * Apply suggestions from code review Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com> * remove unwanted additions * Update autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py Co-authored-by: nabenabe0928 <47781922+nabenabe0928@users.noreply.github.com> --- autoPyTorch/constants.py | 2 + autoPyTorch/data/tabular_validator.py | 2 + autoPyTorch/data/utils.py | 53 ++++++++++++++++--- .../TabularColumnTransformer.py | 1 + .../column_splitting/ColumnSplitter.py | 4 +- .../encoding/OneHotEncoder.py | 3 +- test/test_api/test_api.py | 4 +- test/test_data/test_utils.py | 21 ++++++-- test/test_data/test_validation.py | 29 ++++++++-- 9 files changed, 98 insertions(+), 21 deletions(-) diff --git a/autoPyTorch/constants.py b/autoPyTorch/constants.py index bfd56d27f..154d562ac 100644 --- a/autoPyTorch/constants.py +++ b/autoPyTorch/constants.py @@ -78,3 +78,5 @@ # To avoid that we get a sequence that is too long to be fed to a network MAX_WINDOW_SIZE_BASE = 500 + +MIN_CATEGORIES_FOR_EMBEDDING_MAX = 7 diff --git a/autoPyTorch/data/tabular_validator.py b/autoPyTorch/data/tabular_validator.py index 0f6f89e1c..0735d49b4 100644 --- a/autoPyTorch/data/tabular_validator.py +++ b/autoPyTorch/data/tabular_validator.py @@ -111,6 +111,8 @@ def _compress_dataset( y=y, is_classification=self.is_classification, random_state=self.seed, + categorical_columns=self.feature_validator.categorical_columns, + n_categories_per_cat_column=self.feature_validator.num_categories_per_col, **self.dataset_compression # type: ignore [arg-type] ) self._reduced_dtype = dict(X.dtypes) if is_dataframe else X.dtype diff --git a/autoPyTorch/data/utils.py b/autoPyTorch/data/utils.py index 20ad5612e..2a44dd5c2 100644 --- a/autoPyTorch/data/utils.py +++ b/autoPyTorch/data/utils.py @@ -25,6 +25,7 @@ from sklearn.utils import _approximate_mode, check_random_state from sklearn.utils.validation import _num_samples, check_array +from autoPyTorch.constants import MIN_CATEGORIES_FOR_EMBEDDING_MAX from autoPyTorch.data.base_target_validator import SupportedTargetTypes from autoPyTorch.utils.common import ispandas @@ -459,8 +460,8 @@ def _subsample_by_indices( return X, y -def megabytes(arr: DatasetCompressionInputType) -> float: - +def get_raw_memory_usage(arr: DatasetCompressionInputType) -> float: + memory_in_bytes: float if isinstance(arr, 
np.ndarray): memory_in_bytes = arr.nbytes elif issparse(arr): @@ -470,8 +471,43 @@ def megabytes(arr: DatasetCompressionInputType) -> float: else: raise ValueError(f"Unrecognised data type of X, expected data type to " f"be in (np.ndarray, spmatrix, pd.DataFrame) but got :{type(arr)}") + return memory_in_bytes + + +def get_approximate_mem_usage_in_mb( + arr: DatasetCompressionInputType, + categorical_columns: List, + n_categories_per_cat_column: Optional[List[int]] = None +) -> float: + + err_msg = "Value number of categories per categorical is required when the data has categorical columns" + if ispandas(arr): + arr_dtypes = arr.dtypes.to_dict() + multipliers = [dtype.itemsize for col, dtype in arr_dtypes.items() if col not in categorical_columns] + if len(categorical_columns) > 0: + if n_categories_per_cat_column is None: + raise ValueError(err_msg) + for col, num_cat in zip(categorical_columns, n_categories_per_cat_column): + if num_cat < MIN_CATEGORIES_FOR_EMBEDDING_MAX: + multipliers.append(num_cat * arr_dtypes[col].itemsize) + else: + multipliers.append(arr_dtypes[col].itemsize) + size_one_row = sum(multipliers) + + elif isinstance(arr, (np.ndarray, spmatrix)): + n_cols = arr.shape[-1] - len(categorical_columns) + multiplier = arr.dtype.itemsize + if len(categorical_columns) > 0: + if n_categories_per_cat_column is None: + raise ValueError(err_msg) + # multiply num categories with the size of the column to capture memory after one hot encoding + n_cols += sum(num_cat if num_cat < MIN_CATEGORIES_FOR_EMBEDDING_MAX else 1 for num_cat in n_categories_per_cat_column) + size_one_row = n_cols * multiplier + else: + raise ValueError(f"Unrecognised data type of X, expected data type to " + f"be in (np.ndarray, spmatrix, pd.DataFrame), but got :{type(arr)}") - return float(memory_in_bytes / (2**20)) + return float(arr.shape[0] * size_one_row / (2**20)) def reduce_dataset_size_if_too_large( @@ -479,10 +515,13 @@ def reduce_dataset_size_if_too_large( memory_allocation: Union[int, float], is_classification: bool, random_state: Union[int, np.random.RandomState], + categorical_columns: List, + n_categories_per_cat_column: Optional[List[int]] = None, y: Optional[SupportedTargetTypes] = None, methods: List[str] = ['precision', 'subsample'], ) -> DatasetCompressionInputType: - f""" Reduces the size of the dataset if it's too close to the memory limit. + f""" + Reduces the size of the dataset if it's too close to the memory limit. Follows the order of the operations passed in and retains the type of its input. @@ -513,7 +552,6 @@ def reduce_dataset_size_if_too_large( Reduce the amount of samples of the dataset such that it fits into the allocated memory. Ensures stratification and that unique labels are present - memory_allocation (Union[int, float]): The amount of memory to allocate to the dataset. It should specify an absolute amount. 
@@ -524,7 +562,7 @@ def reduce_dataset_size_if_too_large( """ for method in methods: - if megabytes(X) <= memory_allocation: + if get_approximate_mem_usage_in_mb(X, categorical_columns, n_categories_per_cat_column) <= memory_allocation: break if method == 'precision': @@ -540,7 +578,8 @@ def reduce_dataset_size_if_too_large( # into the allocated memory, we subsample it so that it does n_samples_before = X.shape[0] - sample_percentage = memory_allocation / megabytes(X) + sample_percentage = memory_allocation / get_approximate_mem_usage_in_mb( + X, categorical_columns, n_categories_per_cat_column) # NOTE: type ignore # diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py index 6b38b4650..48f40e9fe 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py @@ -24,6 +24,7 @@ def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N self.add_fit_requirements([ FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True), FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True)]) + def get_column_transformer(self) -> ColumnTransformer: """ diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py index eeca9fdc4..437198d9e 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py @@ -7,7 +7,7 @@ import numpy as np - +from autoPyTorch.constants import MIN_CATEGORIES_FOR_EMBEDDING_MAX from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import \ autoPyTorchTabularPreprocessingComponent @@ -72,7 +72,7 @@ def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, min_categories_for_embedding: HyperparameterSearchSpace = HyperparameterSearchSpace( hyperparameter="min_categories_for_embedding", - value_range=(3, 7), + value_range=(3, MIN_CATEGORIES_FOR_EMBEDDING_MAX), default_value=3, log=True), ) -> ConfigurationSpace: diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py index 80cf3f748..4f8878615 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py @@ -24,7 +24,8 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEncoder: # It is safer to have the OHE produce a 0 array than to crash a good configuration categories='auto', sparse=False, - handle_unknown='ignore') + handle_unknown='ignore', + dtype=np.float32) return self @staticmethod diff --git a/test/test_api/test_api.py b/test/test_api/test_api.py index 3e8847110..f71ad3f5f 100644 --- a/test/test_api/test_api.py +++ b/test/test_api/test_api.py @@ -682,10 +682,10 @@ def 
test_do_dummy_prediction(dask_client, fit_dictionary_tabular): del estimator -@pytest.skip("Fix with new portfolio PR") @unittest.mock.patch('autoPyTorch.evaluation.tae.eval_train_function', new=dummy_eval_train_function) @pytest.mark.parametrize('openml_id', (40981, )) +@pytest.mark.skip(reason="Fix with new portfolio PR") def test_portfolio_selection(openml_id, backend, n_samples): # Get the data and check that contents of data-manager make sense @@ -725,7 +725,7 @@ def test_portfolio_selection(openml_id, backend, n_samples): assert any(successful_config in portfolio_configs for successful_config in successful_configs) -@pytest.skip("Fix with new portfolio PR") +@pytest.mark.skip(reason="Fix with new portfolio PR") @unittest.mock.patch('autoPyTorch.evaluation.tae.eval_train_function', new=dummy_eval_train_function) @pytest.mark.parametrize('openml_id', (40981, )) diff --git a/test/test_data/test_utils.py b/test/test_data/test_utils.py index 4269c4e5f..6228740b0 100644 --- a/test/test_data/test_utils.py +++ b/test/test_data/test_utils.py @@ -25,7 +25,7 @@ from autoPyTorch.data.utils import ( default_dataset_compression_arg, get_dataset_compression_mapping, - megabytes, + get_raw_memory_usage, reduce_dataset_size_if_too_large, reduce_precision, subsample, @@ -45,13 +45,14 @@ def test_reduce_dataset_if_too_large(openmlid, as_frame, n_samples): X.copy(), y=y.copy(), is_classification=True, + categorical_columns=[], random_state=1, - memory_allocation=0.001) + memory_allocation=0.01) assert X_converted.shape[0] < X.shape[0] assert y_converted.shape[0] < y.shape[0] - assert megabytes(X_converted) < megabytes(X) + assert get_raw_memory_usage(X_converted) < get_raw_memory_usage(X) @pytest.mark.parametrize("X", [np.asarray([[1, 1, 1]] * 30)]) @@ -211,8 +212,18 @@ def test_unsupported_errors(): ['a', 'b', 'c', 'a', 'b', 'c'], ['a', 'b', 'd', 'r', 'b', 'c']]) with pytest.raises(ValueError, match=r'X.dtype = .*'): - reduce_dataset_size_if_too_large(X, is_classification=True, random_state=1, memory_allocation=0) + reduce_dataset_size_if_too_large( + X, + is_classification=True, + categorical_columns=[], + random_state=1, + memory_allocation=0) X = [[1, 2], [2, 3]] with pytest.raises(ValueError, match=r'Unrecognised data type of X, expected data type to be in .*'): - reduce_dataset_size_if_too_large(X, is_classification=True, random_state=1, memory_allocation=0) + reduce_dataset_size_if_too_large( + X, + is_classification=True, + categorical_columns=[], + random_state=1, + memory_allocation=0) diff --git a/test/test_data/test_validation.py b/test/test_data/test_validation.py index af46be55f..b6f05f7ba 100644 --- a/test/test_data/test_validation.py +++ b/test/test_data/test_validation.py @@ -8,7 +8,8 @@ import sklearn.model_selection from autoPyTorch.data.tabular_validator import TabularInputValidator -from autoPyTorch.data.utils import megabytes +from autoPyTorch.data.utils import get_approximate_mem_usage_in_mb +from autoPyTorch.utils.common import ispandas @pytest.mark.parametrize('openmlid', [2, 40975, 40984]) @@ -148,16 +149,36 @@ def test_featurevalidator_dataset_compression(input_data_featuretest): X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( input_data_featuretest, input_data_targets, test_size=0.1, random_state=1) validator = TabularInputValidator( - dataset_compression={'memory_allocation': 0.8 * megabytes(X_train), 'methods': ['precision', 'subsample']} + dataset_compression={ + 'memory_allocation': 0.8 * get_approximate_mem_usage_in_mb(X_train, [], None), + 
'methods': ['precision', 'subsample']} ) validator.fit(X_train=X_train, y_train=y_train) transformed_X_train, _ = validator.transform(X_train.copy(), y_train.copy()) + if ispandas(X_train): + # input validator converts transformed_X_train to numpy and the cat columns are chosen as column indices + columns = X_train.columns + categorical_columns = [columns[col] for col in validator.feature_validator.categorical_columns] + else: + categorical_columns = validator.feature_validator.categorical_columns + assert validator._reduced_dtype is not None - assert megabytes(transformed_X_train) < megabytes(X_train) + assert get_approximate_mem_usage_in_mb( + transformed_X_train, + validator.feature_validator.categorical_columns, + validator.feature_validator.num_categories_per_col + ) < get_approximate_mem_usage_in_mb( + X_train, categorical_columns, validator.feature_validator.num_categories_per_col) transformed_X_test, _ = validator.transform(X_test.copy(), y_test.copy()) - assert megabytes(transformed_X_test) < megabytes(X_test) + assert get_approximate_mem_usage_in_mb( + transformed_X_test, + validator.feature_validator.categorical_columns, + validator.feature_validator.num_categories_per_col + ) < get_approximate_mem_usage_in_mb( + X_test, categorical_columns, validator.feature_validator.num_categories_per_col) + if hasattr(transformed_X_train, 'iloc'): assert all(transformed_X_train.dtypes == transformed_X_test.dtypes) assert all(transformed_X_train.dtypes == validator._precision) From 3aef02e1a6fb58b8cb41e445e288a8eaf50afd0d Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Mon, 18 Jul 2022 20:42:19 +0200 Subject: [PATCH 13/22] suggestions from review --- autoPyTorch/pipeline/base_pipeline.py | 3 +- .../LearnedEntityEmbedding.py | 3 +- .../pipeline/tabular_classification.py | 29 ++++++++++--------- autoPyTorch/pipeline/tabular_regression.py | 29 ++++++++++--------- 4 files changed, 32 insertions(+), 32 deletions(-) diff --git a/autoPyTorch/pipeline/base_pipeline.py b/autoPyTorch/pipeline/base_pipeline.py index 3caed7246..e6ae1bd59 100644 --- a/autoPyTorch/pipeline/base_pipeline.py +++ b/autoPyTorch/pipeline/base_pipeline.py @@ -300,8 +300,7 @@ def _get_hyperparameter_search_space(self, def _add_forbidden_conditions(self, cs: ConfigurationSpace) -> ConfigurationSpace: """ Add forbidden conditions to ensure valid configurations. - Currently, Learned Entity Embedding is only valid when encoder is one hot encoder - and CyclicLR is disabled when using stochastic weight averaging and snapshot + Currently, CyclicLR is disabled when using stochastic weight averaging and snapshot ensembling. 
Args: diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py index 0fb67da71..f75db44b3 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py @@ -23,8 +23,7 @@ def __init__(self, config: Dict[str, Any], num_categories_per_col: np.ndarray, n """ Args: config (Dict[str, Any]): The configuration sampled by the hyperparameter optimizer - num_input_features (np.ndarray): column wise information of number of output columns after transformation - for each categorical column and 0 for numerical columns + num_categories_per_col (np.ndarray): number of categories per categorical columns that will be embedded num_features_excl_embed (int): number of features in X excluding the features that need to be embedded """ super().__init__() diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py index df9963834..1b49f0d36 100644 --- a/autoPyTorch/pipeline/tabular_classification.py +++ b/autoPyTorch/pipeline/tabular_classification.py @@ -51,20 +51,21 @@ class TabularClassificationPipeline(ClassifierMixin, BasePipeline): It implements a pipeline, which includes the following as steps: 1. `imputer` - 2. `encoder` - 3. `scaler` - 4. `feature_preprocessor` - 5. `tabular_transformer` - 6. `preprocessing` - 7. `network_embedding` - 8. `network_backbone` - 9. `network_head` - 10. `network` - 11. `network_init` - 12. `optimizer` - 13. `lr_scheduler` - 14. `data_loader` - 15. `trainer` + 2. `column_splitter` + 3. `encoder` + 4. `scaler` + 5. `feature_preprocessor` + 6. `tabular_transformer` + 7. `preprocessing` + 8. `network_embedding` + 9. `network_backbone` + 10. `network_head` + 11. `network` + 12. `network_init` + 13. `optimizer` + 14. `lr_scheduler` + 15. `data_loader` + 16. `trainer` Contrary to the sklearn API it is not possible to enumerate the possible parameters in the __init__ function because we only know the diff --git a/autoPyTorch/pipeline/tabular_regression.py b/autoPyTorch/pipeline/tabular_regression.py index 0fdad6671..1cf60e561 100644 --- a/autoPyTorch/pipeline/tabular_regression.py +++ b/autoPyTorch/pipeline/tabular_regression.py @@ -53,20 +53,21 @@ class TabularRegressionPipeline(RegressorMixin, BasePipeline): It implements a pipeline, which includes the following as steps: 1. `imputer` - 2. `encoder` - 3. `scaler` - 4. `feature_preprocessor` - 5. `tabular_transformer` - 6. `preprocessing` - 7. `network_embedding` - 8. `network_backbone` - 9. `network_head` - 10. `network` - 11. `network_init` - 12. `optimizer` - 13. `lr_scheduler` - 14. `data_loader` - 15. `trainer` + 2. `column_splitter` + 3. `encoder` + 4. `scaler` + 5. `feature_preprocessor` + 6. `tabular_transformer` + 7. `preprocessing` + 8. `network_embedding` + 9. `network_backbone` + 10. `network_head` + 11. `network` + 12. `network_init` + 13. `optimizer` + 14. `lr_scheduler` + 15. `data_loader` + 16.
`trainer` Contrary to the sklearn API it is not possible to enumerate the possible parameters in the __init__ function because we only know the From 8e3dbefb78ba6573a3422174c8a157fd5d68c8a6 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 9 Aug 2022 15:54:04 +0200 Subject: [PATCH 14/22] add preprocessed_dtype to determine double or float --- autoPyTorch/api/base_task.py | 1 + .../setup/early_preprocessor/EarlyPreprocessing.py | 3 ++- .../TimeSeriesEarlyPreProcessing.py | 3 ++- .../pipeline/components/training/trainer/__init__.py | 11 +++++++---- .../components/training/test_image_data_loader.py | 1 - .../components/training/test_training.py | 2 +- test/test_pipeline/test_tabular_classification.py | 5 +---- 7 files changed, 14 insertions(+), 12 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 274e2a316..51414fb02 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -270,6 +270,7 @@ def build_pipeline( include_components: Optional[Dict[str, Any]] = None, exclude_components: Optional[Dict[str, Any]] = None, search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None + ) -> BasePipeline: """ Build pipeline according to current task Characteristics of the dataset to guide the pipeline diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py index 486ce2ef7..fda7d6424 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py @@ -42,7 +42,8 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: # We need to also save the preprocess transforms for inference X.update({ 'preprocess_transforms': transforms, - 'shape_after_preprocessing': X['X_train'].shape[1:] + 'shape_after_preprocessing': X['X_train'].shape[1:], + 'preprocessed_dtype': X['X_train'].dtype.name }) return X diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py index ce6b930d4..f6ed83a72 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py @@ -65,7 +65,8 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: # We need to also save the preprocess transforms for inference X.update({ 'preprocess_transforms': transforms, - 'shape_after_preprocessing': X['X_train'].shape[1:] + 'shape_after_preprocessing': X['X_train'].shape[1:], + 'preprocessed_dtype': X['X_train'].dtype.name }) return X diff --git a/autoPyTorch/pipeline/components/training/trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/__init__.py index c8f420ac8..4a281323f 100755 --- a/autoPyTorch/pipeline/components/training/trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/__init__.py @@ -447,15 +447,18 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic raise RuntimeError("Budget exhausted without finishing an epoch.") if self.choice.use_stochastic_weight_averaging and self.choice.swa_updated: + use_double = 'float64' in X['preprocessed_dtype'] # update batch norm statistics - swa_utils.update_bn(loader=X['train_data_loader'], model=self.choice.swa_model.double()) - + swa_model = 
self.choice.swa_model.double() if use_double else self.choice.swa_model + swa_utils.update_bn(loader=X['train_data_loader'], model=swa_model) # change model update_model_state_dict_from_swa(X['network'], self.choice.swa_model.state_dict()) - if self.choice.use_snapshot_ensemble and len(self.choice.model_snapshots) > 0: + if self.choice.use_snapshot_ensemble: # we update only the last network which pertains to the stochastic weight averaging model - swa_utils.update_bn(X['train_data_loader'], self.choice.model_snapshots[-1].double()) + snapshot_model = self.choice.model_snapshots[-1].double() if use_double else self.choice.model_snapshots[-1] + swa_utils.update_bn(X['train_data_loader'], snapshot_model) + update_model_state_dict_from_swa(X['network_snapshots'][-1], self.choice.swa_model.state_dict()) # wrap up -- add score if not evaluating every epoch if not self.eval_valid_each_epoch(X): diff --git a/test/test_pipeline/components/training/test_image_data_loader.py b/test/test_pipeline/components/training/test_image_data_loader.py index af70cf77b..98a10373b 100644 --- a/test/test_pipeline/components/training/test_image_data_loader.py +++ b/test/test_pipeline/components/training/test_image_data_loader.py @@ -16,7 +16,6 @@ def test_imageloader_build_transform(): fit_dictionary = dict() fit_dictionary['dataset_properties'] = dict() - fit_dictionary['dataset_properties']['is_small_preprocess'] = unittest.mock.Mock(()) fit_dictionary['image_augmenter'] = unittest.mock.Mock() fit_dictionary['preprocess_transforms'] = unittest.mock.Mock() diff --git a/test/test_pipeline/components/training/test_training.py b/test/test_pipeline/components/training/test_training.py index dc8e842ce..397488468 100644 --- a/test/test_pipeline/components/training/test_training.py +++ b/test/test_pipeline/components/training/test_training.py @@ -101,7 +101,7 @@ def test_fit_transform(self): 'y_train': np.array([0, 1, 0]), 'train_indices': [0, 1], 'val_indices': [2], - 'dataset_properties': {'is_small_preprocess': True}, + 'dataset_properties': {}, 'working_dir': '/tmp', 'split_id': 0, 'backend': backend, diff --git a/test/test_pipeline/test_tabular_classification.py b/test/test_pipeline/test_tabular_classification.py index 2b6d34df7..c3f7f49f8 100644 --- a/test/test_pipeline/test_tabular_classification.py +++ b/test/test_pipeline/test_tabular_classification.py @@ -205,15 +205,12 @@ def test_pipeline_transform(self, fit_dictionary_tabular, exclude): # We expect the transformations to be in the pipeline at anytime for inference assert 'preprocess_transforms' in transformed_fit_dictionary_tabular.keys() - @pytest.mark.parametrize("is_small_preprocess", [True, False]) - def test_default_configuration(self, fit_dictionary_tabular, is_small_preprocess, exclude): + def test_default_configuration(self, fit_dictionary_tabular, exclude): """Makes sure that when no config is set, we can trust the default configuration from the space""" fit_dictionary_tabular['epochs'] = 5 - fit_dictionary_tabular['is_small_preprocess'] = is_small_preprocess - pipeline = TabularClassificationPipeline( dataset_properties=fit_dictionary_tabular['dataset_properties'], exclude=exclude) From 52427bc65cf9fe4cb5b77f010d419e92f5cb0b87 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Tue, 16 Aug 2022 17:22:03 +0200 Subject: [PATCH 15/22] test fix in progress --- autoPyTorch/data/tabular_feature_validator.py | 7 +++---- autoPyTorch/datasets/time_series_dataset.py | 4 ++-- .../column_splitting/ColumnSplitter.py | 2 +- 
.../time_series_preprocessing/encoding/OneHotEncoder.py | 8 ++++---- .../encoding/time_series_base_encoder.py | 4 ++-- .../pipeline/components/training/trainer/__init__.py | 9 +++++++-- 6 files changed, 19 insertions(+), 15 deletions(-) diff --git a/autoPyTorch/data/tabular_feature_validator.py b/autoPyTorch/data/tabular_feature_validator.py index 5ea892f7d..a34e03131 100644 --- a/autoPyTorch/data/tabular_feature_validator.py +++ b/autoPyTorch/data/tabular_feature_validator.py @@ -77,10 +77,9 @@ class TabularFeatureValidator(BaseFeatureValidator): transformer. Attributes: - categories (List[List[str]]): - List for which an element at each index is a - list containing the categories for the respective - categorical column. + num_categories_per_col (List[int]): + List for which an element at each index is the number + of categories for the respective categorical column. transformed_columns (List[str]) List of columns that were transformed. column_transformer (Optional[BaseEstimator]) diff --git a/autoPyTorch/datasets/time_series_dataset.py b/autoPyTorch/datasets/time_series_dataset.py index 670eb44c9..9296f47df 100644 --- a/autoPyTorch/datasets/time_series_dataset.py +++ b/autoPyTorch/datasets/time_series_dataset.py @@ -559,7 +559,7 @@ def __init__(self, self.num_features: int = self.validator.feature_validator.num_features # type: ignore[assignment] self.num_targets: int = self.validator.target_validator.out_dimensionality # type: ignore[assignment] - self.categories = self.validator.feature_validator.categories + self.num_categories_per_col = self.validator.feature_validator.num_categories_per_col self.feature_shapes = self.validator.feature_shapes self.feature_names = tuple(self.validator.feature_names) @@ -1072,7 +1072,7 @@ def get_required_dataset_info(self) -> Dict[str, Any]: 'categorical_features': self.categorical_features, 'numerical_columns': self.numerical_columns, 'categorical_columns': self.categorical_columns, - 'categories': self.categories, + 'num_categories_per_col': self.num_categories_per_col, }) return info diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py index 437198d9e..6902fb1bb 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py @@ -16,7 +16,7 @@ class ColumnSplitter(autoPyTorchTabularPreprocessingComponent): """ - Removes features that have the same value in the training data. + Splits categorical columns into embed or encode columns based on a hyperparameter. 
""" def __init__( self, diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py index 5ac5e2550..2a3dfa0f4 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/OneHotEncoder.py @@ -19,14 +19,14 @@ def __init__(self, def fit(self, X: Dict[str, Any], y: Any = None) -> TimeSeriesBaseEncoder: OneHotEncoder.fit(self, X, y) categorical_columns = X['dataset_properties']['categorical_columns'] - n_features_cat = X['dataset_properties']['categories'] + num_categories_per_col = X['dataset_properties']['num_categories_per_col'] feature_names = X['dataset_properties']['feature_names'] feature_shapes = X['dataset_properties']['feature_shapes'] - if len(n_features_cat) == 0: - n_features_cat = self.preprocessor['categorical'].categories # type: ignore + if len(num_categories_per_col) == 0: + num_categories_per_col = [len(cat) for cat in self.preprocessor['categorical'].categories] # type: ignore for i, cat_column in enumerate(categorical_columns): - feature_shapes[feature_names[cat_column]] = len(n_features_cat[i]) + feature_shapes[feature_names[cat_column]] = num_categories_per_col[i] self.feature_shapes = feature_shapes return self diff --git a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py index a3d64ee92..779d673a2 100644 --- a/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/time_series_preprocessing/encoding/time_series_base_encoder.py @@ -15,11 +15,11 @@ def __init__(self) -> None: super(TimeSeriesBaseEncoder, self).__init__() self.add_fit_requirements([ FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True), - FitRequirement('categories', (List,), user_defined=True, dataset_property=True), + FitRequirement('num_categories_per_col', (List,), user_defined=True, dataset_property=True), FitRequirement('feature_names', (tuple,), user_defined=True, dataset_property=True), FitRequirement('feature_shapes', (Dict, ), user_defined=True, dataset_property=True), ]) - self.feature_shapes: Union[Dict[str, int]] = {} + self.feature_shapes: Dict[str, int] = {} def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """ diff --git a/autoPyTorch/pipeline/components/training/trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/__init__.py index 4a281323f..7bac27d7d 100755 --- a/autoPyTorch/pipeline/components/training/trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/__init__.py @@ -447,7 +447,13 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic raise RuntimeError("Budget exhausted without finishing an epoch.") if self.choice.use_stochastic_weight_averaging and self.choice.swa_updated: - use_double = 'float64' in X['preprocessed_dtype'] + # By default, we assume the data is double. 
Only if the data was preprocessed, + # we check the dtype and use it accordingly + preprocessed_dtype = X.get('preprocessed_dtype', None) + if preprocessed_dtype is None: + use_double = True + else: + use_double = 'float64' in preprocessed_dtype # update batch norm statistics swa_model = self.choice.swa_model.double() if use_double else self.choice.swa_model @@ -458,7 +464,6 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic # we update only the last network which pertains to the stochastic weight averaging model snapshot_model = self.choice.model_snapshots[-1].double() if use_double else self.choice.model_snapshots[-1] swa_utils.update_bn(X['train_data_loader'], snapshot_model) - update_model_state_dict_from_swa(X['network_snapshots'][-1], self.choice.swa_model.state_dict()) # wrap up -- add score if not evaluating every epoch if not self.eval_valid_each_epoch(X): From 90512eedb6d41bc4db14846d6eb946a71ebc1bd9 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Wed, 17 Aug 2022 15:02:09 +0200 Subject: [PATCH 16/22] TODO: fix errors after rebase --- .../setup/early_preprocessor/EarlyPreprocessing.py | 6 ++++-- .../early_preprocessor/TimeSeriesEarlyPreProcessing.py | 5 +++-- .../pipeline/components/setup/early_preprocessor/utils.py | 8 ++++++++ .../pipeline/components/training/trainer/__init__.py | 2 +- .../test_time_series_forecasting_pipeline.py | 2 +- 5 files changed, 17 insertions(+), 6 deletions(-) diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py index fda7d6424..f912b07c1 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py @@ -10,7 +10,7 @@ from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType from autoPyTorch.pipeline.components.setup.base_setup import autoPyTorchSetupComponent -from autoPyTorch.pipeline.components.setup.early_preprocessor.utils import get_preprocess_transforms, preprocess +from autoPyTorch.pipeline.components.setup.early_preprocessor.utils import get_preprocess_transforms, get_preprocessed_dtype, preprocess from autoPyTorch.utils.common import FitRequirement @@ -39,11 +39,13 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: X['X_train'] = preprocess(dataset=X_train, transforms=transforms) + preprocessed_dtype = get_preprocessed_dtype(X['X_train']) + # We need to also save the preprocess transforms for inference X.update({ 'preprocess_transforms': transforms, 'shape_after_preprocessing': X['X_train'].shape[1:], - 'preprocessed_dtype': X['X_train'].dtype.name + 'preprocessed_dtype': preprocessed_dtype }) return X diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py index f6ed83a72..7fb600d4c 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/TimeSeriesEarlyPreProcessing.py @@ -10,7 +10,7 @@ from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import \ EarlyPreprocessing from autoPyTorch.pipeline.components.setup.early_preprocessor.utils import ( - get_preprocess_transforms, time_series_preprocess) + get_preprocess_transforms, get_preprocessed_dtype, time_series_preprocess) from 
autoPyTorch.utils.common import FitRequirement @@ -62,11 +62,12 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: new_feature_names += list(set(feature_names) - set(new_feature_names)) X['dataset_properties']['feature_names'] = tuple(new_feature_names) + preprocessed_dtype = get_preprocessed_dtype(X['X_train']) # We need to also save the preprocess transforms for inference X.update({ 'preprocess_transforms': transforms, 'shape_after_preprocessing': X['X_train'].shape[1:], - 'preprocessed_dtype': X['X_train'].dtype.name + 'preprocessed_dtype': preprocessed_dtype }) return X diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py index 830beced9..667e9c008 100644 --- a/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py +++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/utils.py @@ -13,6 +13,7 @@ autoPyTorchPreprocessingComponent as aPTPre, autoPyTorchTargetPreprocessingComponent as aPTTPre ) +from .....utils.common import ispandas def get_preprocess_transforms(X: Dict[str, Any], @@ -71,3 +72,10 @@ def time_series_preprocess(dataset: pd.DataFrame, transforms: torchvision.transf sub_dataset = composite_transforms(sub_dataset) dataset.iloc[:, indices] = sub_dataset return dataset + + +def get_preprocessed_dtype(X_train: Union[np.ndarray, pd.DataFrame]): + if ispandas(X_train): + return X_train.dtypes[X_train.columns].name + else: + return X_train.dtype.name \ No newline at end of file diff --git a/autoPyTorch/pipeline/components/training/trainer/__init__.py b/autoPyTorch/pipeline/components/training/trainer/__init__.py index 7bac27d7d..13a106de6 100755 --- a/autoPyTorch/pipeline/components/training/trainer/__init__.py +++ b/autoPyTorch/pipeline/components/training/trainer/__init__.py @@ -453,7 +453,7 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic if preprocessed_dtype is None: use_double = True else: - use_double = 'float64' in preprocessed_dtype + use_double = 'float64' in preprocessed_dtype or 'int64' in preprocessed_dtype # update batch norm statistics swa_model = self.choice.swa_model.double() if use_double else self.choice.swa_model diff --git a/test/test_pipeline/test_time_series_forecasting_pipeline.py b/test/test_pipeline/test_time_series_forecasting_pipeline.py index 3e34b71b7..09cc6b5f0 100644 --- a/test/test_pipeline/test_time_series_forecasting_pipeline.py +++ b/test/test_pipeline/test_time_series_forecasting_pipeline.py @@ -46,7 +46,7 @@ class TestTimeSeriesForecastingPipeline: "multi_variant_only_num"], indirect=True) def test_fit_predict(self, fit_dictionary_forecasting, forecasting_budgets): dataset_properties = fit_dictionary_forecasting['dataset_properties'] - if not dataset_properties['uni_variant'] and len(dataset_properties['categories']) > 0: + if not dataset_properties['uni_variant'] and len(dataset_properties['num_categories_per_col']) > 0: include = {'network_embedding': ['LearnedEntityEmbedding']} else: include = None From 895b90428e1a2af1fd671778d5c23b67a6bb1520 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Wed, 17 Aug 2022 15:15:05 +0200 Subject: [PATCH 17/22] Reg cocktails apt1.0+reg cocktails pytorch embedding reduced (#454) * reduce number of hyperparameters for pytorch embedding * remove todos for the preprocessing PR, and apply suggestion from code review * remove unwanted exclude in test --- autoPyTorch/api/base_task.py | 4 ++ .../LearnedEntityEmbedding.py | 70 ++++++++++++------- 
.../test_tabular_column_transformer.py | 2 - .../components/setup/test_setup_networks.py | 1 - test/test_pipeline/test_tabular_regression.py | 15 ++-- 5 files changed, 55 insertions(+), 37 deletions(-) diff --git a/autoPyTorch/api/base_task.py b/autoPyTorch/api/base_task.py index 51414fb02..57614407d 100644 --- a/autoPyTorch/api/base_task.py +++ b/autoPyTorch/api/base_task.py @@ -273,6 +273,10 @@ def build_pipeline( ) -> BasePipeline: """ Build pipeline according to current task + and for the passed dataset properties + + Args: + dataset_properties (Dict[str, Any]): Characteristics of the dataset to guide the pipeline choices of components include_components (Optional[Dict[str, Any]]): diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py index f75db44b3..41873d58a 100644 --- a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py +++ b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py @@ -4,6 +4,8 @@ from ConfigSpace.configuration_space import ConfigurationSpace from ConfigSpace.hyperparameters import ( UniformFloatHyperparameter, + UniformIntegerHyperparameter, + CategoricalHyperparameter ) import numpy as np @@ -16,6 +18,34 @@ from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter +def get_num_output_dimensions(config: Dict[str, Any], num_categs_per_feature: List[int]) -> List[int]: + """ + Returns list of embedding sizes for each categorical variable. + Selects this adaptively based on training_datset. + Note: Assumes there is at least one embed feature. + Args: + config (Dict[str, Any]): + contains the hyperparameters required to calculate the `num_output_dimensions` + num_categs_per_feature (List[int]): + list containing number of categories for each feature that is to be embedded, + 0 if the column is not an embed column + Returns: + List[int]: + list containing the output embedding size for each column, + 1 if the column is not an embed column + """ + + max_embedding_dim = config['max_embedding_dim'] + embed_exponent = config['embed_exponent'] + size_factor = config['embedding_size_factor'] + num_output_dimensions = [int(size_factor*max( + 2, + min(max_embedding_dim, + 1.6 * num_categories**embed_exponent))) + if num_categories > 0 else 1 for num_categories in num_categs_per_feature] + return num_output_dimensions + + class _LearnedEntityEmbedding(nn.Module): """ Learned entity embedding module for categorical features""" @@ -35,9 +65,7 @@ def __init__(self, config: Dict[str, Any], num_categories_per_col: np.ndarray, n self.num_embed_features = self.num_categories_per_col[self.embed_features] - self.num_output_dimensions = [1] * num_features_excl_embed - self.num_output_dimensions.extend([ceil(config["dimension_reduction_" + str(i)] * num_in) for i, num_in in - enumerate(self.num_embed_features)]) + self.num_output_dimensions = get_num_output_dimensions(config, self.num_categories_per_col) self.num_out_feats = num_features_excl_embed + sum(self.num_output_dimensions) @@ -78,12 +106,10 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # before passing it through the model concat_seq = [] - x_pointer = 0 layer_pointer = 0 for x_pointer, embed in enumerate(self.embed_features): current_feature_slice = x[:, x_pointer] if not embed: - x_pointer += 1 concat_seq.append(current_feature_slice.view(-1, 1)) continue current_feature_slice = current_feature_slice.to(torch.int) @@ 
-153,28 +179,24 @@ def build_embedding(self, num_categories_per_col: np.ndarray, num_features_excl_ @staticmethod def get_hyperparameter_search_space( dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None, - dimension_reduction: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="dimension_reduction", - value_range=(0, 1), - default_value=0.5), + embed_exponent: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="embed_exponent", + value_range=(0.56,), + default_value=0.56), + max_embedding_dim: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="max_embedding_dim", + value_range=(100,), + default_value=100), + embedding_size_factor: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="embedding_size_factor", + value_range=(0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5), + default_value=1, + ), ) -> ConfigurationSpace: cs = ConfigurationSpace() if dataset_properties is not None: - for i in range(len(dataset_properties['categorical_columns']) - if isinstance(dataset_properties['categorical_columns'], List) else 0): - # currently as we dont have information about the embedding columns - # we search for more dimensions than necessary. This can be solved by - # not having `min_unique_values_for_embedding` as a hyperparameter and - # instead passing it as a parameter to the feature validator, which - # allows us to pass embed_columns to the dataset properties. - # TODO: test the trade off - # Another solution is to combine `OneHotEncoding`, `Embedding` and `NoEncoding` - # in one custom transformer. this will also allow users to use this transformer - # outside the pipeline - ee_dimensions_search_space = HyperparameterSearchSpace(hyperparameter="dimension_reduction_" + str(i), - value_range=dimension_reduction.value_range, - default_value=dimension_reduction.default_value, - log=dimension_reduction.log) - add_hyperparameter(cs, ee_dimensions_search_space, UniformFloatHyperparameter) + if len(dataset_properties['categorical_columns']) > 0: + add_hyperparameter(cs, embed_exponent, UniformFloatHyperparameter) + add_hyperparameter(cs, max_embedding_dim, UniformIntegerHyperparameter) + add_hyperparameter(cs, embedding_size_factor, CategoricalHyperparameter) + return cs @staticmethod diff --git a/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py b/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py index a81eb34a2..f5f928bd8 100644 --- a/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py +++ b/test/test_pipeline/components/preprocessing/test_tabular_column_transformer.py @@ -13,8 +13,6 @@ ) -# TODO: fix in preprocessing PR -# @pytest.mark.skip("Skipping tests as preprocessing is not finalised") @pytest.mark.parametrize("fit_dictionary_tabular", ['classification_numerical_only', 'classification_categorical_only', 'classification_numerical_and_categorical'], indirect=True) diff --git a/test/test_pipeline/components/setup/test_setup_networks.py b/test/test_pipeline/components/setup/test_setup_networks.py index 1036f8304..8fa77560f 100644 --- a/test/test_pipeline/components/setup/test_setup_networks.py +++ b/test/test_pipeline/components/setup/test_setup_networks.py @@ -19,7 +19,6 @@ def head(request): return request.param -# TODO: add 'LearnedEntityEmbedding' after preprocessing dix @pytest.fixture(params=['NoEmbedding', 'LearnedEntityEmbedding']) def embedding(request): return request.param diff --git 
a/test/test_pipeline/test_tabular_regression.py b/test/test_pipeline/test_tabular_regression.py index 48b6daa5e..e2e770a24 100644 --- a/test/test_pipeline/test_tabular_regression.py +++ b/test/test_pipeline/test_tabular_regression.py @@ -61,11 +61,9 @@ def test_pipeline_fit(self, fit_dictionary_tabular): """This test makes sure that the pipeline is able to fit given random combinations of hyperparameters across the pipeline""" # TODO: fix issue where adversarial also works for regression - # TODO: Fix issue with learned entity embedding after preprocessing PR pipeline = TabularRegressionPipeline( dataset_properties=fit_dictionary_tabular['dataset_properties'], - exclude={'trainer': ['AdversarialTrainer'], - 'network_embedding': ['LearnedEntityEmbedding']}) + exclude={'trainer': ['AdversarialTrainer']}) cs = pipeline.get_hyperparameter_search_space() config = cs.sample_configuration() @@ -91,8 +89,7 @@ def test_pipeline_predict(self, fit_dictionary_tabular): X = fit_dictionary_tabular['X_train'].copy() pipeline = TabularRegressionPipeline( dataset_properties=fit_dictionary_tabular['dataset_properties'], - exclude={'trainer': ['AdversarialTrainer'], - 'network_embedding': ['LearnedEntityEmbedding']}) + exclude={'trainer': ['AdversarialTrainer']}) cs = pipeline.get_hyperparameter_search_space() config = cs.sample_configuration() @@ -121,8 +118,7 @@ def test_pipeline_transform(self, fit_dictionary_tabular): pipeline = TabularRegressionPipeline( dataset_properties=fit_dictionary_tabular['dataset_properties'], - exclude={'trainer': ['AdversarialTrainer'], - 'network_embedding': ['LearnedEntityEmbedding']}) + exclude={'trainer': ['AdversarialTrainer']}) cs = pipeline.get_hyperparameter_search_space() config = cs.sample_configuration() pipeline.set_hyperparameters(config) @@ -139,11 +135,10 @@ def test_pipeline_transform(self, fit_dictionary_tabular): assert fit_dictionary_tabular.items() <= transformed_fit_dictionary_tabular.items() # Then the pipeline should have added the following keys - # Removing 'imputer', 'encoder', 'scaler', these will be - # TODO: added back after a PR fixing preprocessing expected_keys = {'tabular_transformer', 'preprocess_transforms', 'network', 'optimizer', 'lr_scheduler', 'train_data_loader', - 'val_data_loader', 'run_summary', 'feature_preprocessor'} + 'val_data_loader', 'run_summary', 'feature_preprocessor', + 'imputer', 'encoder', 'scaler'} assert expected_keys.issubset(set(transformed_fit_dictionary_tabular.keys())) # Then we need to have transformations being created. 
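Note on PATCH 17: it replaces the per-column `dimension_reduction_i` hyperparameters with a single adaptive sizing rule in `get_num_output_dimensions`. Below is a minimal standalone sketch of that rule; the hyperparameter values mirror the defaults of the new search space (`embed_exponent=0.56`, `max_embedding_dim=100`, `embedding_size_factor=1.0`), while the category counts are made up for illustration.

    # Illustrative sketch (not part of the patch): the sizing rule from
    # get_num_output_dimensions(), applied to made-up category counts.
    from typing import Any, Dict, List


    def get_num_output_dimensions(config: Dict[str, Any], num_categs_per_feature: List[int]) -> List[int]:
        """Embedding size per column; 1 for columns that are not embedded (0 categories)."""
        max_embedding_dim = config['max_embedding_dim']
        embed_exponent = config['embed_exponent']
        size_factor = config['embedding_size_factor']
        return [
            int(size_factor * max(2, min(max_embedding_dim, 1.6 * n ** embed_exponent)))
            if n > 0 else 1
            for n in num_categs_per_feature
        ]


    if __name__ == '__main__':
        # defaults taken from the new hyperparameter search space
        config = {'embed_exponent': 0.56, 'max_embedding_dim': 100, 'embedding_size_factor': 1.0}
        # two numerical columns (0 categories) and three categorical columns
        print(get_num_output_dimensions(config, [0, 0, 3, 12, 1000]))
        # -> [1, 1, 2, 6, 76]: higher cardinality gets a larger embedding, capped by max_embedding_dim

Compared to the old per-column `dimension_reduction_i` fractions, this collapses embedding sizing into three shared hyperparameters, which keeps the configuration space small even when many categorical columns are present.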
From 033bca7db62c82f2ba369dd103ad7209a4048cce Mon Sep 17 00:00:00 2001
From: Ravin Kohli
Date: Wed, 17 Aug 2022 15:26:01 +0200
Subject: [PATCH 18/22] fix embeddings after rebase

---
 .../LearnedEntityEmbedding.py                 | 22 +++++++++++++++++-----------------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py
index 41873d58a..e87217469 100644
--- a/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py
+++ b/autoPyTorch/pipeline/components/setup/network_embedding/LearnedEntityEmbedding.py
@@ -62,6 +62,7 @@ def __init__(self, config: Dict[str, Any], num_categories_per_col: np.ndarray, n
         # or 0 for numerical data
         self.num_categories_per_col = num_categories_per_col
         self.embed_features = self.num_categories_per_col > 0
+        self.num_features_excl_embed = num_features_excl_embed
 
         self.num_embed_features = self.num_categories_per_col[self.embed_features]
 
@@ -84,8 +85,8 @@ def get_partial_models(self, subset_features: List[int]) -> "_LearnedEntityEmbed
             partial_model (_LearnedEntityEmbedding)
                 a new partial model
         """
-        num_input_features = self.num_input_features[subset_features]
-        num_numerical_features = sum([sf < self.num_numerical for sf in subset_features])
+        num_input_features = self.num_categories_per_col[subset_features]
+        num_features_excl_embed = sum([sf < self.num_features_excl_embed for sf in subset_features])
 
         num_output_dimensions = [self.num_output_dimensions[sf] for sf in subset_features]
         embed_features = [self.embed_features[sf] for sf in subset_features]
@@ -98,7 +99,7 @@
                 ee_layer_tracker += 1
         ee_layers = nn.ModuleList(ee_layers)
 
-        return PartialLearnedEntityEmbedding(num_input_features, num_numerical_features, embed_features,
+        return PartialLearnedEntityEmbedding(num_input_features, num_features_excl_embed, embed_features,
                                              num_output_dimensions, ee_layers)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -136,28 +137,27 @@ class PartialLearnedEntityEmbedding(_LearnedEntityEmbedding):
     of the input features.
This is applied to forecasting tasks where not all the features might be known beforehand """ def __init__(self, - num_input_features: np.ndarray, - num_numerical_features: int, + num_categories_per_col: np.ndarray, + num_features_excl_embed: int, embed_features: List[bool], num_output_dimensions: List[int], ee_layers: nn.Module ): super(_LearnedEntityEmbedding, self).__init__() - self.num_numerical = num_numerical_features + self.num_features_excl_embed = num_features_excl_embed # list of number of categories of categorical data # or 0 for numerical data - self.num_input_features = num_input_features - categorical_features: np.ndarray = self.num_input_features > 0 - - self.num_categorical_features = self.num_input_features[categorical_features] + self.num_categories_per_col = num_categories_per_col self.embed_features = embed_features self.num_output_dimensions = num_output_dimensions - self.num_out_feats = self.num_numerical + sum(self.num_output_dimensions) + self.num_out_feats = self.num_features_excl_embed + sum(self.num_output_dimensions) self.ee_layers = ee_layers + self.num_embed_features = self.num_categories_per_col[self.embed_features] + class LearnedEntityEmbedding(NetworkEmbeddingComponent): """ From d4cd8b496115b632477a8e65f5ea408bd07e8d49 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 18 Aug 2022 18:34:30 +0200 Subject: [PATCH 19/22] fix error with pytorch embeddings --- .../TabularColumnTransformer.py | 31 ++++++++++++++----- .../base_tabular_preprocessing.py | 10 +++--- .../column_splitting/ColumnSplitter.py | 3 ++ .../encoding/OneHotEncoder.py | 12 +++---- .../encoding/base_encoder.py | 9 +++--- .../tabular_preprocessing/utils.py | 4 ++- 6 files changed, 46 insertions(+), 23 deletions(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py index 48f40e9fe..0f5711f31 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py @@ -23,7 +23,9 @@ def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N self.preprocessor: Optional[ColumnTransformer] = None self.add_fit_requirements([ FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True), - FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True)]) + FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True), + FitRequirement('encode_columns', (List,), user_defined=True, dataset_property=False), + FitRequirement('embed_columns', (List,), user_defined=True, dataset_property=False)]) def get_column_transformer(self) -> ColumnTransformer: @@ -53,17 +55,32 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer": self.check_requirements(X, y) preprocessors = get_tabular_preprocessers(X) + column_transformers: List[Tuple[str, BaseEstimator, List[int]]] = [] + + numerical_pipeline = 'passthrough' + categorical_pipeline = 'passthrough' + encode_pipeline = 'passthrough' + if len(preprocessors['numerical']) > 0: numerical_pipeline = make_pipeline(*preprocessors['numerical']) - column_transformers.append( - ('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns']) - ) + + column_transformers.append( + ('numerical_pipeline', numerical_pipeline, 
X['dataset_properties']['numerical_columns']) + ) if len(preprocessors['categorical']) > 0: categorical_pipeline = make_pipeline(*preprocessors['categorical']) - column_transformers.append( - ('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns']) - ) + + column_transformers.append( + ('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns']) + ) + + if len(preprocessors['encode']) > 0: + encode_pipeline = make_pipeline(*preprocessors['encode']) + + column_transformers.append( + ('encode_pipeline', encode_pipeline, X['encode_columns']) + ) # in case the preprocessing steps are disabled # i.e, NoEncoder for categorical, we want to diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/base_tabular_preprocessing.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/base_tabular_preprocessing.py index aefe9ddf8..74b1a4d58 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/base_tabular_preprocessing.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/base_tabular_preprocessing.py @@ -14,19 +14,19 @@ class autoPyTorchTabularPreprocessingComponent(autoPyTorchPreprocessingComponent def __init__(self) -> None: super().__init__() self.preprocessor: Union[Dict[str, Optional[BaseEstimator]], BaseEstimator] = dict( - numerical=None, categorical=None) + numerical=None, encode=None, categorical=None) def get_preprocessor_dict(self) -> Dict[str, BaseEstimator]: """ - Returns early_preprocessor dictionary containing the sklearn numerical - and categorical early_preprocessor with "numerical" and "categorical" - keys. May contain None for a key if early_preprocessor does not + Returns early_preprocessor dictionary containing the sklearn numerical, + categorical and encode early_preprocessor with "numerical", "categorical" + "encode" keys. 
May contain None for a key if early_preprocessor does not handle the datatype defined by key Returns: Dict[str, BaseEstimator]: early_preprocessor dictionary """ - if (self.preprocessor['numerical'] and self.preprocessor['categorical']) is None: + if (self.preprocessor['numerical'] and self.preprocessor['categorical'] and self.preprocessor['encode']) is None: raise AttributeError("{} can't return early_preprocessor dict without fitting first" .format(self.__class__.__name__)) return self.preprocessor diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py index 6902fb1bb..aa8e6ab19 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py @@ -1,3 +1,5 @@ +import logging +import time from typing import Any, Dict, List, Optional, Union from ConfigSpace.configuration_space import ConfigurationSpace @@ -12,6 +14,7 @@ from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import \ autoPyTorchTabularPreprocessingComponent from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter +from autoPyTorch.utils.logging_ import get_named_client_logger class ColumnSplitter(autoPyTorchTabularPreprocessingComponent): diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py index 4f8878615..2f382a574 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/OneHotEncoder.py @@ -20,12 +20,12 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEncoder: self.check_requirements(X, y) - self.preprocessor['categorical'] = OHE( - # It is safer to have the OHE produce a 0 array than to crash a good configuration - categories='auto', - sparse=False, - handle_unknown='ignore', - dtype=np.float32) + if self._has_encode_columns(X): + self.preprocessor['encode'] = OHE( + # It is safer to have the OHE produce a 0 array than to crash a good configuration + sparse=False, + handle_unknown='ignore', + dtype=np.float32) return self @staticmethod diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py index 0a2486420..b62822107 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder.py @@ -13,7 +13,11 @@ class BaseEncoder(autoPyTorchTabularPreprocessingComponent): def __init__(self) -> None: super().__init__() self.add_fit_requirements([ - FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True), ]) + FitRequirement('encode_columns', (List,), user_defined=True, dataset_property=False)]) + + @staticmethod + def _has_encode_columns(X: Dict[str, Any]): + return len(X.get('encode_columns', [])) > 0 def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: """ @@ -24,8 +28,5 @@ def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: Returns: 
(Dict[str, Any]): the updated 'X' dictionary """ - if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None: - raise ValueError("cant call transform on {} without fitting first." - .format(self.__class__.__name__)) X.update({'encoder': self.preprocessor}) return X diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/utils.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/utils.py index e71583e3e..20f0e0320 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/utils.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/utils.py @@ -21,7 +21,7 @@ def get_tabular_preprocessers(X: Dict[str, Any]) -> Dict[str, List[BaseEstimator Returns: (Dict[str, List[BaseEstimator]]): dictionary with list of numerical and categorical preprocessors """ - preprocessor: Dict[str, List[BaseEstimator]] = dict(numerical=list(), categorical=list()) + preprocessor: Dict[str, List[BaseEstimator]] = dict(numerical=list(), categorical=list(), encode=list()) for key, value in X.items(): if isinstance(value, dict): # as each preprocessor is child of BaseEstimator @@ -29,5 +29,7 @@ def get_tabular_preprocessers(X: Dict[str, Any]) -> Dict[str, List[BaseEstimator preprocessor['numerical'].append(value['numerical']) if 'categorical' in value and isinstance(value['categorical'], BaseEstimator): preprocessor['categorical'].append(value['categorical']) + if 'encode' in value and isinstance(value['encode'], BaseEstimator): + preprocessor['encode'].append(value['encode']) return preprocessor From a5807cb025a045083b38ccbadf9ad7edbe86bed1 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 18 Aug 2022 18:45:22 +0200 Subject: [PATCH 20/22] fix redundant code --- .../tabular_preprocessing/column_splitting/ColumnSplitter.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py index aa8e6ab19..6902fb1bb 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py +++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/column_splitting/ColumnSplitter.py @@ -1,5 +1,3 @@ -import logging -import time from typing import Any, Dict, List, Optional, Union from ConfigSpace.configuration_space import ConfigurationSpace @@ -14,7 +12,6 @@ from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import \ autoPyTorchTabularPreprocessingComponent from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter -from autoPyTorch.utils.logging_ import get_named_client_logger class ColumnSplitter(autoPyTorchTabularPreprocessingComponent): From 960e1ef477236d6b3f168256f52b493b3b2443b4 Mon Sep 17 00:00:00 2001 From: Ravin Kohli Date: Thu, 18 Aug 2022 19:46:56 +0200 Subject: [PATCH 21/22] change userdefined to False --- .../tabular_preprocessing/TabularColumnTransformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py index 0f5711f31..44cdeb117 100644 --- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py +++ 
b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
@@ -24,8 +24,8 @@ def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N
         self.add_fit_requirements([
             FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True),
             FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True),
-            FitRequirement('encode_columns', (List,), user_defined=True, dataset_property=False),
-            FitRequirement('embed_columns', (List,), user_defined=True, dataset_property=False)])
+            FitRequirement('encode_columns', (List,), user_defined=False, dataset_property=False),
+            FitRequirement('embed_columns', (List,), user_defined=False, dataset_property=False)])
 
     def get_column_transformer(self) -> ColumnTransformer:
 

From 1be80d56233e48882def39f39794e6a209f95fd1 Mon Sep 17 00:00:00 2001
From: Ravin Kohli
Date: Fri, 19 Aug 2022 15:36:03 +0200
Subject: [PATCH 22/22] remove using categorical columns

---
 .../TabularColumnTransformer.py               | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
index 44cdeb117..58a55a1df 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
@@ -59,7 +59,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer":
         column_transformers: List[Tuple[str, BaseEstimator, List[int]]] = []
 
         numerical_pipeline = 'passthrough'
-        categorical_pipeline = 'passthrough'
         encode_pipeline = 'passthrough'
 
@@ -68,12 +67,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer":
             column_transformers.append(
                 ('numerical_pipeline', numerical_pipeline, X['dataset_properties']['numerical_columns'])
             )
-        if len(preprocessors['categorical']) > 0:
-            categorical_pipeline = make_pipeline(*preprocessors['categorical'])
-
-            column_transformers.append(
-                ('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])
-            )
 
@@ -82,6 +75,12 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer":
                 ('encode_pipeline', encode_pipeline, X['encode_columns'])
             )
 
+        # if len(preprocessors['categorical']) > 0:
+        #     categorical_pipeline = make_pipeline(*preprocessors['categorical'])
+        #     column_transformers.append(
+        #         ('categorical_pipeline', categorical_pipeline, X['dataset_properties']['categorical_columns'])
+        #     )
+
         # in case the preprocessing steps are disabled
         # i.e, NoEncoder for categorical, we want to
         # let the data in categorical columns pass through
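Note on the last three patches: after them, `TabularColumnTransformer.fit` routes numerical columns through a numerical pipeline and the columns selected for encoding through a one-hot `encode_pipeline`, while the remaining (embed) columns are left untouched for the network's `LearnedEntityEmbedding`. Below is a minimal scikit-learn sketch of that routing; the column indices, the imputer/scaler choices, and the use of `remainder='passthrough'` are illustrative assumptions, not taken from the patches.

    # Illustrative sketch only: approximates the column routing assembled by the
    # patched TabularColumnTransformer.fit(); indices and preprocessors are made up.
    import numpy as np
    from sklearn.compose import ColumnTransformer
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import OneHotEncoder, StandardScaler

    numerical_columns = [0, 1]   # stands in for X['dataset_properties']['numerical_columns']
    encode_columns = [2]         # stands in for X['encode_columns'] (low-cardinality categoricals)
    # column 3 plays the role of an embed column: it passes through untouched here
    # and would later be handled by LearnedEntityEmbedding inside the network.

    numerical_pipeline = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())
    # sparse=False as in the patch; newer scikit-learn versions use sparse_output=False instead
    encode_pipeline = make_pipeline(OneHotEncoder(sparse=False, handle_unknown='ignore'))

    column_transformer = ColumnTransformer(
        transformers=[
            ('numerical_pipeline', numerical_pipeline, numerical_columns),
            ('encode_pipeline', encode_pipeline, encode_columns),
        ],
        remainder='passthrough',  # assumed stand-in for forwarding embed columns unchanged
    )

    X_train = np.array([[0.1, 3.0, 0, 7],
                        [0.4, 1.0, 1, 2],
                        [0.9, 2.0, 0, 5]])
    print(column_transformer.fit_transform(X_train).shape)
    # (3, 5): 2 scaled numerical + 2 one-hot encoded + 1 passthrough embed column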