nn.Embedding to avoid OneHotEncoding all categorical columns #425
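As the title suggests, this change moves toward learned embeddings (torch.nn.Embedding) for categorical columns instead of one-hot encoding all of them. The snippet below is an illustrative sketch only, not autoPyTorch code: it shows how an embedding maps ordinal-encoded categories to a small dense vector, so the per-column width no longer grows with the number of categories.

# Illustrative sketch only (not autoPyTorch code): embedding one categorical
# column instead of one-hot encoding it.
import torch
import torch.nn as nn

num_categories = 1000      # cardinality of the categorical column
embedding_dim = 16         # size of the dense representation

embed = nn.Embedding(num_categories, embedding_dim)

# a batch of ordinal-encoded category indices, e.g. from an OrdinalEncoder
codes = torch.randint(0, num_categories, (32,))
dense = embed(codes)       # shape (32, 16) instead of a (32, 1000) one-hot matrix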

20 changes: 1 addition & 19 deletions autoPyTorch/api/base_task.py
@@ -111,23 +111,6 @@ def send_warnings_to_log(
return prediction


def get_search_updates(categorical_indicator: List[bool]):
"""
These updates mimic the autopytorch tabular paper.
Returns:
________
search_space_updates - HyperparameterSearchSpaceUpdates
The search space updates like setting different hps to different values or ranges.
"""

has_cat_features = any(categorical_indicator)
has_numerical_features = not all(categorical_indicator)

search_space_updates = HyperparameterSearchSpaceUpdates()

return search_space_updates


class BaseTask(ABC):
"""
Base class for the tasks that serve as API to the pipelines.
@@ -200,7 +183,6 @@ def __init__(
resampling_strategy_args: Optional[Dict[str, Any]] = None,
search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None,
task_type: Optional[str] = None,
categorical_indicator: Optional[List[bool]] = None
) -> None:

if isinstance(resampling_strategy, NoResamplingStrategyTypes) and ensemble_size != 0:
@@ -267,7 +249,7 @@ def __init__(

self.input_validator: Optional[BaseInputValidator] = None

self.search_space_updates = search_space_updates if search_space_updates is not None else get_search_updates(categorical_indicator)
self.search_space_updates = search_space_updates
if search_space_updates is not None:
if not isinstance(self.search_space_updates,
HyperparameterSearchSpaceUpdates):
2 changes: 0 additions & 2 deletions autoPyTorch/api/tabular_classification.py
@@ -98,7 +98,6 @@ def __init__(
resampling_strategy_args: Optional[Dict[str, Any]] = None,
backend: Optional[Backend] = None,
search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None,
categorical_indicator: Optional[List[bool]] = None
):
super().__init__(
seed=seed,
@@ -119,7 +118,6 @@ def __init__(
resampling_strategy_args=resampling_strategy_args,
search_space_updates=search_space_updates,
task_type=TASK_TYPES_TO_STRING[TABULAR_CLASSIFICATION],
categorical_indicator=categorical_indicator
)

def build_pipeline(
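With categorical_indicator and the get_search_updates default gone, search-space customisation is now passed in explicitly. Below is a hedged sketch of that call path based on autoPyTorch's documented HyperparameterSearchSpaceUpdates API; the particular node and value range are illustrative.

# Sketch (assumes autoPyTorch is installed); the chosen node/values are illustrative.
from autoPyTorch.api.tabular_classification import TabularClassificationTask
from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates

updates = HyperparameterSearchSpaceUpdates()
updates.append(node_name='data_loader',
               hyperparameter='batch_size',
               value_range=[32, 128],
               default_value=64)

api = TabularClassificationTask(search_space_updates=updates)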
2 changes: 2 additions & 0 deletions autoPyTorch/constants.py
@@ -54,3 +54,5 @@

CLASSIFICATION_OUTPUTS = [BINARY, MULTICLASS, MULTICLASSMULTIOUTPUT]
REGRESSION_OUTPUTS = [CONTINUOUS, CONTINUOUSMULTIOUTPUT]

MIN_CATEGORIES_FOR_EMBEDDING_MAX = 7
2 changes: 1 addition & 1 deletion autoPyTorch/data/base_feature_validator.py
@@ -46,10 +46,10 @@ def __init__(

# Required for dataset properties
self.num_features: Optional[int] = None
self.categories: List[List[int]] = []
self.categorical_columns: List[int] = []
self.numerical_columns: List[int] = []

self.num_categories_per_col: Optional[List[int]] = []
self.all_nan_columns: Optional[Set[Union[int, str]]] = None

self._is_fitted = False
8 changes: 2 additions & 6 deletions autoPyTorch/data/tabular_feature_validator.py
@@ -193,10 +193,8 @@ def _fit(
encoded_categories = self.column_transformer.\
named_transformers_['categorical_pipeline'].\
named_steps['ordinalencoder'].categories_
self.categories = [
list(range(len(cat)))
for cat in encoded_categories
]

self.num_categories_per_col = [len(cat) for cat in encoded_categories]

# differently to categorical_columns and numerical_columns,
# this saves the index of the column.
@@ -274,8 +272,6 @@ def transform(
X = self.numpy_to_pandas(X)

if ispandas(X) and not issparse(X):
X = cast(pd.DataFrame, X)

if self.all_nan_columns is None:
raise ValueError('_fit must be called before calling transform')

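The validator now keeps only the per-column cardinalities (num_categories_per_col) instead of materialising category index lists (categories); the cardinalities are what the new memory estimate in autoPyTorch/data/utils.py (below) consumes. A minimal sketch of the old versus new bookkeeping, with illustrative values:

# Illustrative values: two categorical columns with 3 and 5 categories.
encoded_categories = [['a', 'b', 'c'], ['u', 'v', 'w', 'x', 'y']]    # OrdinalEncoder .categories_
categories = [list(range(len(cat))) for cat in encoded_categories]   # old: [[0, 1, 2], [0, 1, 2, 3, 4]]
num_categories_per_col = [len(cat) for cat in encoded_categories]    # new: [3, 5]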
2 changes: 2 additions & 0 deletions autoPyTorch/data/tabular_validator.py
@@ -104,6 +104,8 @@ def _compress_dataset(
y=y,
is_classification=self.is_classification,
random_state=self.seed,
categorical_columns=self.feature_validator.categorical_columns,
n_categories_per_cat_column=self.feature_validator.num_categories_per_col,
**self.dataset_compression # type: ignore [arg-type]
)
self._reduced_dtype = dict(X.dtypes) if is_dataframe else X.dtype
53 changes: 46 additions & 7 deletions autoPyTorch/data/utils.py
@@ -25,6 +25,7 @@
from sklearn.utils import _approximate_mode, check_random_state
from sklearn.utils.validation import _num_samples, check_array

from autoPyTorch.constants import MIN_CATEGORIES_FOR_EMBEDDING_MAX
from autoPyTorch.data.base_target_validator import SupportedTargetTypes
from autoPyTorch.utils.common import ispandas

@@ -459,8 +460,8 @@ def _subsample_by_indices(
return X, y


def megabytes(arr: DatasetCompressionInputType) -> float:

def get_raw_memory_usage(arr: DatasetCompressionInputType) -> float:
memory_in_bytes: float
if isinstance(arr, np.ndarray):
memory_in_bytes = arr.nbytes
elif issparse(arr):
@@ -470,19 +471,57 @@ def megabytes(arr: DatasetCompressionInputType) -> float:
else:
raise ValueError(f"Unrecognised data type of X, expected data type to "
f"be in (np.ndarray, spmatrix, pd.DataFrame) but got :{type(arr)}")
return memory_in_bytes


def get_approximate_mem_usage_in_mb(
arr: DatasetCompressionInputType,
categorical_columns: List,
n_categories_per_cat_column: Optional[List[int]] = None
) -> float:

err_msg = "Number of categories per categorical column is required when the data has categorical columns"
if ispandas(arr):
arr_dtypes = arr.dtypes.to_dict()
multipliers = [dtype.itemsize for col, dtype in arr_dtypes.items() if col not in categorical_columns]
if len(categorical_columns) > 0:
if n_categories_per_cat_column is None:
raise ValueError(err_msg)
for col, num_cat in zip(categorical_columns, n_categories_per_cat_column):
if num_cat < MIN_CATEGORIES_FOR_EMBEDDING_MAX:
multipliers.append(num_cat * arr_dtypes[col].itemsize)
else:
multipliers.append(arr_dtypes[col].itemsize)
size_one_row = sum(multipliers)

elif isinstance(arr, (np.ndarray, spmatrix)):
n_cols = arr.shape[-1] - len(categorical_columns)
multiplier = arr.dtype.itemsize
if len(categorical_columns) > 0:
if n_categories_per_cat_column is None:
raise ValueError(err_msg)
# multiply num categories with the size of the column to capture memory after one hot encoding
n_cols += sum(num_cat if num_cat < MIN_CATEGORIES_FOR_EMBEDDING_MAX else 1 for num_cat in n_categories_per_cat_column)
size_one_row = n_cols * multiplier
else:
raise ValueError(f"Unrecognised data type of X, expected data type to "
f"be in (np.ndarray, spmatrix, pd.DataFrame), but got :{type(arr)}")

return float(memory_in_bytes / (2**20))
return float(arr.shape[0] * size_one_row / (2**20))


def reduce_dataset_size_if_too_large(
X: DatasetCompressionInputType,
memory_allocation: Union[int, float],
is_classification: bool,
random_state: Union[int, np.random.RandomState],
categorical_columns: List,
n_categories_per_cat_column: Optional[List[int]] = None,
y: Optional[SupportedTargetTypes] = None,
methods: List[str] = ['precision', 'subsample'],
) -> DatasetCompressionInputType:
f""" Reduces the size of the dataset if it's too close to the memory limit.
f"""
Reduces the size of the dataset if it's too close to the memory limit.

Follows the order of the operations passed in and retains the type of its
input.
@@ -513,7 +552,6 @@ def reduce_dataset_size_if_too_large(
Reduce the amount of samples of the dataset such that it fits into the allocated
memory. Ensures stratification and that unique labels are present


memory_allocation (Union[int, float]):
The amount of memory to allocate to the dataset. It should specify an
absolute amount.
@@ -524,7 +562,7 @@
"""

for method in methods:
if megabytes(X) <= memory_allocation:
if get_approximate_mem_usage_in_mb(X, categorical_columns, n_categories_per_cat_column) <= memory_allocation:
break

if method == 'precision':
@@ -540,7 +578,8 @@
# into the allocated memory, we subsample it so that it does

n_samples_before = X.shape[0]
sample_percentage = memory_allocation / megabytes(X)
sample_percentage = memory_allocation / get_approximate_mem_usage_in_mb(
X, categorical_columns, n_categories_per_cat_column)

# NOTE: type ignore
#
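A hedged usage sketch of the new estimator, assuming this branch of autoPyTorch is installed. A DataFrame with one float64 column and one int64 categorical column of 3 categories is budgeted at 8 + 3 * 8 bytes per row, because 3 is below MIN_CATEGORIES_FOR_EMBEDDING_MAX (7) and the column is therefore counted at its one-hot width; a column with 7 or more categories would be counted once.

# Usage sketch (assumes this branch is installed); data and column labels are illustrative.
import numpy as np
import pandas as pd

from autoPyTorch.data.utils import get_approximate_mem_usage_in_mb

X = pd.DataFrame({
    0: np.random.rand(1000),                                  # numerical: 8 bytes/row
    1: np.random.randint(0, 3, size=1000).astype('int64'),    # categorical: 3 categories
})

mem_mb = get_approximate_mem_usage_in_mb(
    X, categorical_columns=[1], n_categories_per_cat_column=[3]
)
print(mem_mb)  # ~ 1000 * (8 + 3 * 8) / 2**20, roughly 0.03 MB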
2 changes: 1 addition & 1 deletion autoPyTorch/datasets/tabular_dataset.py
@@ -81,7 +81,7 @@ def __init__(self,
self.categorical_columns = validator.feature_validator.categorical_columns
self.numerical_columns = validator.feature_validator.numerical_columns
self.num_features = validator.feature_validator.num_features
self.categories = validator.feature_validator.categories
self.num_categories_per_col = validator.feature_validator.num_categories_per_col

super().__init__(train_tensors=(X, Y), test_tensors=(X_test, Y_test), shuffle=shuffle,
resampling_strategy=resampling_strategy,
43 changes: 24 additions & 19 deletions autoPyTorch/evaluation/train_evaluator.py
@@ -1,6 +1,6 @@
import json
from multiprocessing.queues import Queue
import os
from multiprocessing.queues import Queue
from typing import Any, Dict, List, Optional, Tuple, Union

from ConfigSpace.configuration_space import Configuration
@@ -22,6 +22,7 @@
fit_and_suppress_warnings
)
from autoPyTorch.evaluation.utils import DisableFileOutputParameters
from autoPyTorch.pipeline.base_pipeline import BasePipeline
from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric
from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline
from autoPyTorch.utils.common import dict_repr, subsampler
@@ -195,24 +196,7 @@ def fit_predict_and_loss(self) -> None:
additional_run_info = pipeline.get_additional_run_info() if hasattr(
pipeline, 'get_additional_run_info') else {}

# add learning curve of configurations to additional_run_info
if isinstance(pipeline, TabularClassificationPipeline):
if hasattr(pipeline.named_steps['trainer'], 'run_summary'):
run_summary = pipeline.named_steps['trainer'].run_summary
split_types = ['train', 'val', 'test']
run_summary_dict = dict(
run_summary={},
budget=self.budget,
seed=self.seed,
config_id=self.configuration.config_id,
num_run=self.num_run
)
for split_type in split_types:
run_summary_dict['run_summary'][f'{split_type}_loss'] = run_summary.performance_tracker.get(f'{split_type}_loss', None)
run_summary_dict['run_summary'][f'{split_type}_metrics'] = run_summary.performance_tracker.get(f'{split_type}_metrics', None)
self.logger.debug(f"run_summary_dict {json.dumps(run_summary_dict)}")
with open(os.path.join(self.backend.temporary_directory, 'run_summary.txt'), 'a') as file:
file.write(f"{json.dumps(run_summary_dict)}\n")
# self._write_run_summary(pipeline)
Reviewer comment (Collaborator): Based on the functionality that was encapsulated in the function, I think this should be called here, right?


status = StatusType.SUCCESS

@@ -370,6 +354,27 @@ def fit_predict_and_loss(self) -> None:
status=status,
)

def _write_run_summary(self, pipeline: BasePipeline) -> None:
# add learning curve of configurations to additional_run_info
if isinstance(pipeline, TabularClassificationPipeline):
assert isinstance(self.configuration, Configuration)
if hasattr(pipeline.named_steps['trainer'], 'run_summary'):
run_summary = pipeline.named_steps['trainer'].run_summary
split_types = ['train', 'val', 'test']
run_summary_dict = dict(
run_summary={},
budget=self.budget,
seed=self.seed,
config_id=self.configuration.config_id,
num_run=self.num_run)
for split_type in split_types:
run_summary_dict['run_summary'][f'{split_type}_loss'] = run_summary.performance_tracker.get(
f'{split_type}_loss', None)
run_summary_dict['run_summary'][f'{split_type}_metrics'] = run_summary.performance_tracker.get(
f'{split_type}_metrics', None)
with open(os.path.join(self.backend.temporary_directory, 'run_summary.txt'), 'a') as file:
file.write(f"{json.dumps(run_summary_dict)}\n")

def _fit_and_predict(self, pipeline: BaseEstimator, fold: int, train_indices: Union[np.ndarray, List],
test_indices: Union[np.ndarray, List],
add_pipeline_to_self: bool
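For reference, _write_run_summary appends one JSON object per line to run_summary.txt in the backend's temporary directory, so the recorded learning curves can be read back line by line. A small hedged sketch; the file path here is an assumption.

# Sketch: reading the per-configuration summaries written by _write_run_summary.
# Each line of run_summary.txt is one JSON object; the path below is an assumption.
import json

with open('tmp/run_summary.txt') as fh:
    for line in fh:
        entry = json.loads(line)
        # keys written above: run_summary, budget, seed, config_id, num_run
        print(entry['num_run'], entry['budget'], entry['run_summary']['val_loss'])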
37 changes: 4 additions & 33 deletions autoPyTorch/pipeline/base_pipeline.py
@@ -1,4 +1,3 @@
from copy import copy
import warnings
from abc import ABCMeta
from collections import Counter
@@ -297,11 +296,10 @@ def _get_hyperparameter_search_space(self,
"""
raise NotImplementedError()

def _add_forbidden_conditions(self, cs):
def _add_forbidden_conditions(self, cs: ConfigurationSpace) -> ConfigurationSpace:
"""
Add forbidden conditions to ensure valid configurations.
Currently, Learned Entity Embedding is only valid when encoder is one hot encoder
and CyclicLR is disabled when using stochastic weight averaging and snapshot
Currently, CyclicLR is disabled when using stochastic weight averaging and snapshot
ensembling.

Args:
@@ -310,33 +308,6 @@ def _add_forbidden_conditions(self, cs):

"""

# Learned Entity Embedding is only valid when encoder is one hot encoder
if 'network_embedding' in self.named_steps.keys() and 'encoder' in self.named_steps.keys():
embeddings = cs.get_hyperparameter('network_embedding:__choice__').choices
if 'LearnedEntityEmbedding' in embeddings:
encoders = cs.get_hyperparameter('encoder:__choice__').choices
possible_default_embeddings = copy(list(embeddings))
del possible_default_embeddings[possible_default_embeddings.index('LearnedEntityEmbedding')]

for encoder in encoders:
if encoder == 'OneHotEncoder':
continue
while True:
try:
cs.add_forbidden_clause(ForbiddenAndConjunction(
ForbiddenEqualsClause(cs.get_hyperparameter(
'network_embedding:__choice__'), 'LearnedEntityEmbedding'),
ForbiddenEqualsClause(cs.get_hyperparameter('encoder:__choice__'), encoder)
))
break
except ValueError:
# change the default and try again
try:
default = possible_default_embeddings.pop()
except IndexError:
raise ValueError("Cannot find a legal default configuration")
cs.get_hyperparameter('network_embedding:__choice__').default_value = default

# Disable CyclicLR until todo is completed.
if 'lr_scheduler' in self.named_steps.keys() and 'trainer' in self.named_steps.keys():
trainers = cs.get_hyperparameter('trainer:__choice__').choices
@@ -347,7 +318,8 @@ def _add_forbidden_conditions(self, cs):
if cyclic_lr_name in available_schedulers:
# disable snapshot ensembles and stochastic weight averaging
snapshot_ensemble_hyperparameter = cs.get_hyperparameter(f'trainer:{trainer}:use_snapshot_ensemble')
if hasattr(snapshot_ensemble_hyperparameter, 'choices') and True in snapshot_ensemble_hyperparameter.choices:
if hasattr(snapshot_ensemble_hyperparameter, 'choices') and \
True in snapshot_ensemble_hyperparameter.choices:
cs.add_forbidden_clause(ForbiddenAndConjunction(
ForbiddenEqualsClause(snapshot_ensemble_hyperparameter, True),
ForbiddenEqualsClause(cs.get_hyperparameter('lr_scheduler:__choice__'), cyclic_lr_name)
@@ -549,7 +521,6 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]],
node_hyperparameters,
update.hyperparameter))


def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]]
) -> List[Tuple[str, PipelineStepType]]:
"""
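With the LearnedEntityEmbedding/OneHotEncoder restriction removed, the remaining constraint (no CyclicLR together with stochastic weight averaging or snapshot ensembling) is still expressed through ConfigSpace forbidden clauses. A minimal standalone sketch of that pattern, with simplified hyperparameter names and defaults chosen so the default configuration stays legal:

# Standalone sketch of the forbidden-clause pattern used above (names simplified).
from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.forbidden import ForbiddenAndConjunction, ForbiddenEqualsClause
from ConfigSpace.hyperparameters import CategoricalHyperparameter

cs = ConfigurationSpace()
scheduler = CategoricalHyperparameter('lr_scheduler:__choice__',
                                      ['CyclicLR', 'StepLR'], default_value='StepLR')
swa = CategoricalHyperparameter('trainer:StandardTrainer:use_stochastic_weight_averaging',
                                [True, False], default_value=False)
cs.add_hyperparameters([scheduler, swa])

# forbid CyclicLR in combination with stochastic weight averaging
cs.add_forbidden_clause(ForbiddenAndConjunction(
    ForbiddenEqualsClause(swa, True),
    ForbiddenEqualsClause(scheduler, 'CyclicLR'),
))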
@@ -24,6 +24,7 @@ def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = N
self.add_fit_requirements([
FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True),
FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True)])


def get_column_transformer(self) -> ColumnTransformer:
"""
@@ -12,7 +12,6 @@ def __init__(self) -> None:
self._processing = True
self.add_fit_requirements([
FitRequirement('categorical_columns', (List,), user_defined=True, dataset_property=True),
FitRequirement('categories', (List,), user_defined=True, dataset_property=True)
])

def transform(self, X: Dict[str, Any]) -> Dict[str, Any]: