
Commit

fix tests after rebase
ravinkohli committed Jul 26, 2022
1 parent 637a68b commit e69ff3b
Showing 22 changed files with 145 additions and 98 deletions.
2 changes: 2 additions & 0 deletions autoPyTorch/api/base_task.py
@@ -1908,6 +1908,7 @@ def _init_ensemble_builder(
# builder in the provided dask client
required_dataset_properties = {'task_type': self.task_type,
'output_type': self.dataset.output_type}

proc_ensemble = EnsembleBuilderManager(
start_time=time.time(),
time_left_for_ensembles=time_left_for_ensembles,
@@ -1928,6 +1929,7 @@ def _init_ensemble_builder(
random_state=self.seed,
precision=precision,
logger_port=self._logger_port,
metrics_kwargs=self._metrics_kwargs
)
self._stopwatch.stop_task(ensemble_task_name)

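The new `metrics_kwargs` argument threads metric-specific options from the API down to the ensemble builder, so metrics that need extra context (for example, forecasting losses) can receive it at scoring time. A minimal sketch of the forwarding pattern, with an illustrative `mase` metric that is a simplified stand-in rather than autoPyTorch's implementation:

    from typing import Any, Callable, List

    def mase(y_true: List[float], y_pred: List[float], sp: int = 1) -> float:
        # Simplified mean absolute scaled error: the seasonal period 'sp'
        # is exactly the kind of extra context metrics_kwargs carries.
        mae = sum(abs(t - p) for t, p in zip(y_true, y_pred)) / len(y_true)
        naive = sum(abs(y_true[i] - y_true[i - sp]) for i in range(sp, len(y_true)))
        naive /= max(len(y_true) - sp, 1)
        return mae / naive if naive else float('inf')

    def score(metric: Callable[..., float], y_true: List[float],
              y_pred: List[float], **metrics_kwargs: Any) -> float:
        # The builder does not interpret the kwargs; it only forwards them.
        return metric(y_true, y_pred, **metrics_kwargs)

    print(score(mase, [1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 2.5, 4.5], sp=1))  # 0.25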
1 change: 1 addition & 0 deletions autoPyTorch/data/base_feature_validator.py
@@ -49,6 +49,7 @@ def __init__(
self.categories: List[List[int]] = []
self.categorical_columns: List[int] = []
self.numerical_columns: List[int] = []
self.encode_columns: List[int] = []

self.all_nan_columns: Optional[Set[Union[int, str]]] = None

28 changes: 10 additions & 18 deletions autoPyTorch/data/tabular_feature_validator.py
@@ -168,27 +168,23 @@ def _fit(

# Handle objects if possible
exist_object_columns = has_object_columns(X.dtypes.values)

if exist_object_columns:
X = self.infer_objects(X)
self.dtypes = [dt.name for dt in X.dtypes] # Also note this change in self.dtypes

self.all_nan_columns = set(all_nan_columns)

self.transformed_columns, self.feat_types = self.get_columns_to_encode(X)
self.encode_columns, self.feat_types = self.get_columns_to_encode(X)

assert self.feat_types is not None

preprocessors = get_tabular_preprocessors()
self.column_transformer = _create_column_transformer(
preprocessors=preprocessors,
categorical_columns=self.transformed_columns,
)

if len(self.enc_columns) > 0:
if len(self.encode_columns) > 0:

preprocessors = get_tabular_preprocessors()
self.column_transformer = _create_column_transformer(
preprocessors=preprocessors,
categorical_columns=self.enc_columns,
categorical_columns=self.encode_columns,
)

# Mypy redefinition
@@ -302,8 +298,8 @@ def transform(
# we change those columns to `object` dtype
# to ensure that these columns are changed to appropriate dtype
# in self.infer_objects
all_nan_cat_cols = set(X[self.enc_columns].columns[X[self.enc_columns].isna().all()])
dtype_dict = {col: 'object' for col in self.enc_columns if col in all_nan_cat_cols}
all_nan_cat_cols = set(X[self.encode_columns].columns[X[self.encode_columns].isna().all()])
dtype_dict = {col: 'object' for col in self.encode_columns if col in all_nan_cat_cols}
X = X.astype(dtype_dict)

# Check the data here so we catch problems on new test data
@@ -388,10 +384,6 @@ def _check_data(
if exist_object_columns:
X = self.infer_objects(X)

# Define the column to be encoded here as the feature validator is fitted once
# per estimator
self.transformed_columns, self.feat_types = self.get_columns_to_encode(X)

column_order = [column for column in X.columns]
if len(self.column_order) > 0:
if self.column_order != column_order:
@@ -491,8 +483,8 @@ def _get_columns_to_encode(
Type of each column numerical/categorical
"""

if len(self.transformed_columns) > 0 and self.feat_types is not None:
return self.transformed_columns, self.feat_types
if len(self.encode_columns) > 0 and self.feat_types is not None:
return self.encode_columns, self.feat_types

# Register if a column needs encoding
categorical_columns = []
@@ -503,7 +495,7 @@
for i, column in enumerate(X.columns):
if self.all_nan_columns is not None and column in self.all_nan_columns:
continue
column_dtype = self.dtypes[i]
column_dtype = self.dtypes[i] if len(self.dtypes) > 0 else X[column].dtype.name
err_msg = "Valid types are `numerical`, `categorical` or `boolean`, " \
"but input column {} has an invalid type `{}`.".format(column, column_dtype)
if column_dtype in ['category', 'bool']:
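Two details above are worth spelling out: all-NaN columns that were previously marked for encoding are recast to `object` so that dtype inference can assign them a proper type later, and `_get_columns_to_encode` now falls back to the live column dtype when the validator has not recorded dtypes yet. A minimal, self-contained sketch of the recasting step (column names are illustrative):

    import numpy as np
    import pandas as pd

    X = pd.DataFrame({
        'color': pd.Series([np.nan, np.nan], dtype='category'),  # all-NaN categorical
        'size': [1.0, 2.0],
    })
    encode_columns = ['color']

    # An all-NaN column carries no category information, so cast it to
    # 'object' and let later dtype inference decide once real values appear.
    all_nan_cat_cols = set(X[encode_columns].columns[X[encode_columns].isna().all()])
    dtype_dict = {col: 'object' for col in encode_columns if col in all_nan_cat_cols}
    X = X.astype(dtype_dict)
    print(X.dtypes['color'])  # object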
4 changes: 2 additions & 2 deletions autoPyTorch/data/time_series_feature_validator.py
@@ -37,8 +37,8 @@ def __init__(
self.series_idx: Optional[List[Union[str, int]]] = None

def get_reordered_columns(self) -> List[str]:
return self.transformed_columns + [
col for col in self.column_order if col not in set(self.transformed_columns)
return self.encode_columns + [
col for col in self.column_order if col not in set(self.encode_columns)
]

def fit(
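For clarity, the reordering puts the columns to encode first and preserves the original order of the rest; a quick illustration with made-up column names:

    encode_columns = ['shop', 'product']
    column_order = ['price', 'shop', 'demand', 'product']

    reordered = encode_columns + [
        col for col in column_order if col not in set(encode_columns)
    ]
    print(reordered)  # ['shop', 'product', 'price', 'demand']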
3 changes: 0 additions & 3 deletions autoPyTorch/datasets/resampling_strategy.py
@@ -109,10 +109,7 @@ def is_stratified(self) -> bool:

# TODO: replace it with another way
ResamplingStrategies = Union[CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes]
<<<<<<< HEAD

=======
>>>>>>> Additional metrics during train (#194)

DEFAULT_RESAMPLING_PARAMETERS: Dict[
ResamplingStrategies,
2 changes: 1 addition & 1 deletion autoPyTorch/evaluation/abstract_evaluator.py
@@ -729,7 +729,7 @@ def _loss(self, y_true: np.ndarray, y_hat: np.ndarray, **metric_kwargs: Any) ->
def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float],
valid_pred: Optional[np.ndarray], test_pred: Optional[np.ndarray],
additional_run_info: Optional[Dict], file_output: bool, status: StatusType,
opt_pred: Optional[np.ndarray],
opt_pred: Optional[np.ndarray], **metric_kwargs: Any
) -> Optional[Tuple[float, float, int, Dict]]:
"""This function does everything necessary after the fitting is done:
3 changes: 2 additions & 1 deletion autoPyTorch/optimizer/smbo.py
@@ -276,7 +276,8 @@ def __init__(self,
initial_configurations = []

if STRING_TO_TASK_TYPES.get(self.task_type, -1) == TIMESERIES_FORECASTING:
initial_configurations = self.get_init_configs_for_forecasting(config_space, kwargs)
# TODO: update search space (to remove reg cocktails) for forecasting tasks so that we can use the portfolio (or build the portfolio again)
# initial_configurations = self.get_init_configs_for_forecasting(config_space, kwargs)
# proxy-validation sets
self.min_num_test_instances: Optional[int] = kwargs.get('min_num_test_instances', # type:ignore[assignment]
None)
7 changes: 0 additions & 7 deletions autoPyTorch/pipeline/base_pipeline.py
@@ -520,7 +520,6 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]],
# needs to be updated is in components of the
# choice module
elif split_hyperparameter[0] not in components.keys():
<<<<<<< HEAD
hp_in_component = False
if hasattr(node, 'additional_components') and node.additional_components:
# This is designed for forecasting network encoder:
@@ -538,12 +537,6 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]],
"Expected update hyperparameter "
"to be in {} got {}".format(node.__class__.__name__,
components.keys(), split_hyperparameter[0]))
=======
raise ValueError("Unknown component choice for node {}. "
"Expected update component "
"to be in {}, but got {}".format(node_name,
components.keys(), split_hyperparameter[0]))
>>>>>>> Bug fixes (#249)
else:
# check if hyperparameter is in the search space of the component
component = components[split_hyperparameter[0]]
@@ -81,11 +81,18 @@ def percentage_value_range_to_integer_range(
log = False
else:
log = hyperparameter_search_space.log

min_hyperparameter_value = hyperparameter_search_space.value_range[0]
if len(hyperparameter_search_space.value_range) > 1:
max_hyperparameter_value = hyperparameter_search_space.value_range[1]
else:
max_hyperparameter_value = hyperparameter_search_space.value_range[0]

hyperparameter_search_space = HyperparameterSearchSpace(
hyperparameter=hyperparameter_name,
value_range=(
floor(float(hyperparameter_search_space.value_range[0]) * n_features),
floor(float(hyperparameter_search_space.value_range[1]) * n_features)),
floor(float(min_hyperparameter_value) * n_features),
floor(float(max_hyperparameter_value) * n_features)),
default_value=ceil(float(hyperparameter_search_space.default_value) * n_features),
log=log)
else:
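The added branch lets a percentage-based hyperparameter range consist of a single value (a fixed percentage) without raising an IndexError on `value_range[1]`. A standalone sketch of the resulting arithmetic, with the search-space object reduced to a plain tuple for illustration:

    from math import floor
    from typing import Tuple

    def percentage_range_to_integer_range(value_range: Tuple[float, ...],
                                          n_features: int) -> Tuple[int, int]:
        # With a single-element range, minimum and maximum coincide.
        min_value = value_range[0]
        max_value = value_range[1] if len(value_range) > 1 else value_range[0]
        return (floor(float(min_value) * n_features),
                floor(float(max_value) * n_features))

    print(percentage_range_to_integer_range((0.1, 0.9), n_features=20))  # (2, 18)
    print(percentage_range_to_integer_range((0.5,), n_features=20))      # (10, 10)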
@@ -574,6 +574,10 @@ def forward(self,
past_observed_targets: Optional[torch.BoolTensor] = None,
decoder_observed_values: Optional[torch.Tensor] = None,
) -> ALL_NET_OUTPUT:

if isinstance(past_targets, dict):
past_targets, past_features, future_features, past_observed_targets = self._unwrap_past_targets(past_targets)

x_past, x_future, x_static, loc, scale, static_context_initial_hidden, _ = self.pre_processing(
past_targets=past_targets,
past_observed_targets=past_observed_targets,
@@ -603,6 +607,38 @@

return self.rescale_output(output, loc, scale, self.device)

def _unwrap_past_targets(
        self,
        past_targets: dict
) -> Tuple[torch.Tensor,
           Optional[torch.Tensor],
           Optional[torch.Tensor],
           Optional[torch.BoolTensor]]:
    """
    Time series forecasting networks require multiple inputs for the forward pass,
    which differs from how PyTorch networks usually work. SWA's update_bn in the
    trainer choice does not unwrap the input dictionary when running the forward
    pass, so we need to check for that here.
    Args:
        past_targets (dict):
            Batch dictionary mistakenly passed as the past_targets argument
    Returns:
        Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.BoolTensor]]:
            the past_targets, past_features, future_features and
            past_observed_targets entries of the batch dictionary
    """

    past_targets_copy = past_targets.copy()
    past_targets = past_targets_copy.pop('past_targets')
    past_features = past_targets_copy.pop('past_features', None)
    future_features = past_targets_copy.pop('future_features', None)
    past_observed_targets = past_targets_copy.pop('past_observed_targets', None)
    return past_targets, past_features, future_features, past_observed_targets

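For context on the docstring above: `torch.optim.swa_utils.update_bn` iterates over the data loader and only unwraps list or tuple batches before calling `model(batch)`, so a dict batch arrives intact as the first positional argument, here `past_targets`. A self-contained sketch of the situation (the module and shapes are illustrative, not the autoPyTorch network):

    import torch
    from torch import nn

    class TinyForecaster(nn.Module):
        def forward(self, past_targets, past_features=None,
                    future_features=None, past_observed_targets=None):
            # update_bn-style callers may hand us the whole batch dict.
            if isinstance(past_targets, dict):
                batch = past_targets.copy()
                past_targets = batch.pop('past_targets')
                past_features = batch.pop('past_features', None)
                future_features = batch.pop('future_features', None)
                past_observed_targets = batch.pop('past_observed_targets', None)
            return past_targets.mean(dim=1)

    model = TinyForecaster()
    batch = {'past_targets': torch.randn(4, 7, 1)}
    # Mimics update_bn, which does not unwrap dictionary batches:
    print(model(batch).shape)  # torch.Size([4, 1])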
def pred_from_net_output(self, net_output: ALL_NET_OUTPUT) -> torch.Tensor:
if self.output_type == 'regression':
return net_output
@@ -694,6 +730,10 @@ def forward(self,
future_features: Optional[torch.Tensor] = None,
past_observed_targets: Optional[torch.BoolTensor] = None,
decoder_observed_values: Optional[torch.Tensor] = None, ) -> ALL_NET_OUTPUT:

if isinstance(past_targets, dict):
past_targets, past_features, future_features, past_observed_targets = self._unwrap_past_targets(past_targets)

x_past, _, x_static, loc, scale, static_context_initial_hidden, past_targets = self.pre_processing(
past_targets=past_targets,
past_observed_targets=past_observed_targets,
@@ -983,6 +1023,10 @@ def forward(self,
future_features: Optional[torch.Tensor] = None,
past_observed_targets: Optional[torch.BoolTensor] = None,
decoder_observed_values: Optional[torch.Tensor] = None, ) -> ALL_NET_OUTPUT:

if isinstance(past_targets, dict):
past_targets, past_features, future_features, past_observed_targets = self._unwrap_past_targets(past_targets)

encode_length = min(self.window_size, past_targets.shape[1])

if past_observed_targets is None:
@@ -1250,6 +1294,9 @@ def forward(self, # type: ignore[override]
decoder_observed_values: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor,
Tuple[torch.Tensor, torch.Tensor]]:

if isinstance(past_targets, dict):
past_targets, past_features, future_features, past_observed_targets = self._unwrap_past_targets(past_targets)

# Unlike other networks, NBEATS network is required to predict both past and future targets.
# Thereby, we return two tensors for backcast and forecast
if past_observed_targets is None:
@@ -29,15 +29,10 @@ def get_output_shape(network: torch.nn.Module, input_shape: Tuple[int, ...], has
"""
placeholder = torch.randn((2, *input_shape), dtype=torch.float)
with torch.no_grad():
<<<<<<< HEAD
if has_hidden_states:
output = network(placeholder)[0]
else:
output = network(placeholder)
=======
output = network(placeholder)

>>>>>>> Bug fixes (#249)
return tuple(output.shape[1:])


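The conflict here is resolved by keeping the `has_hidden_states` branch, since recurrent encoders return an `(output, hidden_state)` tuple rather than a bare tensor. A short usage sketch of the shape-probing idea, mirroring the kept code:

    import torch
    from torch import nn

    def get_output_shape(network, input_shape, has_hidden_states=False):
        # Probe with a dummy batch of size 2; recurrent modules return
        # (output, hidden_state), so keep only the output tensor.
        placeholder = torch.randn((2, *input_shape), dtype=torch.float)
        with torch.no_grad():
            out = network(placeholder)[0] if has_hidden_states else network(placeholder)
        return tuple(out.shape[1:])

    print(get_output_shape(nn.Linear(8, 3), (8,)))  # (3,)
    print(get_output_shape(nn.LSTM(4, 16, batch_first=True), (5, 4),
                           has_hidden_states=True))  # (5, 16)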
@@ -11,14 +11,15 @@


class NetworkEmbeddingComponent(autoPyTorchSetupComponent):
def __init__(self, random_state: Optional[np.random.RandomState] = None):
super().__init__(random_state=random_state)
def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None):
super().__init__()
self.embedding: Optional[nn.Module] = None
self.random_state = random_state
self.feature_shapes: Dict[str, int] = {}

def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:

num_numerical_columns, num_input_features = self._get_required_info_from_data(X)
num_numerical_columns, num_input_features = self._get_args(X)

self.embedding, num_output_features = self.build_embedding(
num_input_features=num_input_features,
@@ -35,7 +36,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
self.feature_shapes = feature_shapes
else:
self.feature_shapes = X['dataset_properties']['feature_shapes']

return self

def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
@@ -49,39 +49,31 @@ def build_embedding(self,
num_numerical_features: int) -> Tuple[nn.Module, Optional[List[int]]]:
raise NotImplementedError

def _get_required_info_from_data(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]:
"""
Returns the number of numerical columns after preprocessing and
an array of size equal to the number of input features
containing zeros for numerical data and number of categories
for categorical data. This is required to build the embedding.
Args:
X (Dict[str, Any]):
Fit dictionary
Returns:
Tuple[int, np.ndarray]:
number of numerical columns and array indicating
number of categories for categorical columns and
0 for numerical columns
"""
def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]:
# Feature preprocessors can alter numerical columns
if len(X['dataset_properties']['numerical_columns']) == 0:
num_numerical_columns = 0
else:
X_train = copy.deepcopy(X['backend'].load_datamanager().train_tensors[0][:2])

numerical_column_transformer = X['tabular_transformer'].preprocessor. \
named_transformers_['numerical_pipeline']
num_numerical_columns = numerical_column_transformer.transform(
X_train[:, X['dataset_properties']['numerical_columns']]).shape[1]

num_cols = num_numerical_columns + len(X['dataset_properties']['categorical_columns'])
num_input_feats = np.zeros(num_cols, dtype=np.int32)

if 'tabular_transformer' in X:
numerical_column_transformer = X['tabular_transformer'].preprocessor. \
named_transformers_['numerical_pipeline']
elif 'time_series_feature_transformer' in X:
numerical_column_transformer = X['time_series_feature_transformer'].preprocessor. \
named_transformers_['numerical_pipeline']
else:
raise ValueError("Either a tabular or time_series transformer must be contained!")
if hasattr(X_train, 'iloc'):
num_numerical_columns = numerical_column_transformer.transform(
X_train.iloc[:, X['dataset_properties']['numerical_columns']]).shape[1]
else:
num_numerical_columns = numerical_column_transformer.transform(
X_train[:, X['dataset_properties']['numerical_columns']]).shape[1]
num_input_features = np.zeros((num_numerical_columns + len(X['dataset_properties']['categorical_columns'])),
dtype=np.int32)
categories = X['dataset_properties']['categories']
for idx, cats in enumerate(categories, start=num_numerical_columns):
num_input_feats[idx] = len(cats)

return num_numerical_columns, num_input_feats
for i, category in enumerate(categories):
    num_input_features[num_numerical_columns + i] = len(category)
return num_numerical_columns, num_input_features
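`_get_args` ultimately builds an array with one entry per post-preprocessing input column: zero for numerical columns and the category count for categorical ones, which the embedding module uses to size its lookup tables. A standalone sketch of that construction:

    import numpy as np

    num_numerical_columns = 3
    categories = [['red', 'green'], ['s', 'm', 'l', 'xl']]  # two categorical columns

    num_input_features = np.zeros(
        num_numerical_columns + len(categories), dtype=np.int32)
    for i, category in enumerate(categories):
        # Numerical columns stay 0; categorical ones record their cardinality.
        num_input_features[num_numerical_columns + i] = len(category)

    print(num_input_features)  # [0 0 0 2 4]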
@@ -254,8 +254,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader:
self.val_transform,
train=False,
)

if X['dataset_properties']["is_small_preprocess"]:
if X['dataset_properties'].get("is_small_preprocess", True):
# This parameter indicates that the data has been pre-processed for speed
# Overwrite the datamanager with the pre-processes data
datamanager.replace_data(X['X_train'],
@@ -616,3 +615,16 @@ def __str__(self) -> str:
""" Allow a nice understanding of what components where used """
string = self.train_data_loader.__class__.__name__
return string

def _check_transform_requirements(self, X: Dict[str, Any], y: Any = None) -> None:
"""
Makes sure that the fit dictionary contains the required transformations
that the dataset should go through
Args:
X (Dict[str, Any]): Dictionary with fitted parameters. It is a message passing
mechanism, in which during a transform, a component adds relevant information
so that further stages can be properly fitted
"""
pass
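The fit dictionary described in this docstring is the pipeline's message-passing mechanism: each component reads what it needs from `X` and writes back whatever later stages require. A toy sketch of the pattern (the component and keys are illustrative, not autoPyTorch's):

    from typing import Any, Dict

    class ScalerComponent:
        def fit(self, X: Dict[str, Any]) -> 'ScalerComponent':
            self.mean = sum(X['X_train']) / len(X['X_train'])
            return self

        def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
            # Publish fitted state so downstream components can consume it.
            X.update({'scaler_mean': self.mean})
            return X

    fit_dictionary = {'X_train': [1.0, 2.0, 3.0]}
    fit_dictionary = ScalerComponent().fit(fit_dictionary).transform(fit_dictionary)
    print(fit_dictionary['scaler_mean'])  # 2.0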
