
Commit

fix tests after rebase
ravinkohli committed Jul 26, 2022
1 parent 637a68b commit e69ff3b
Showing 22 changed files with 145 additions and 98 deletions.
2 changes: 2 additions & 0 deletions autoPyTorch/api/base_task.py
@@ -1908,6 +1908,7 @@ def _init_ensemble_builder(
# builder in the provided dask client
required_dataset_properties = {'task_type': self.task_type,
'output_type': self.dataset.output_type}

proc_ensemble = EnsembleBuilderManager(
start_time=time.time(),
time_left_for_ensembles=time_left_for_ensembles,
@@ -1928,6 +1929,7 @@ def _init_ensemble_builder(
random_state=self.seed,
precision=precision,
logger_port=self._logger_port,
metrics_kwargs=self._metrics_kwargs
)
self._stopwatch.stop_task(ensemble_task_name)

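The new `metrics_kwargs` argument threads metric-specific options from the API down to the ensemble builder, so metrics that need extra context (for example, forecasting losses) can receive it at scoring time. A minimal sketch of the forwarding pattern, with an illustrative `mase` metric that is a simplified stand-in rather than autoPyTorch's implementation:

    from typing import Any, Callable, List

    def mase(y_true: List[float], y_pred: List[float], sp: int = 1) -> float:
        # Simplified mean absolute scaled error: the seasonal period 'sp'
        # is exactly the kind of extra context metrics_kwargs carries.
        mae = sum(abs(t - p) for t, p in zip(y_true, y_pred)) / len(y_true)
        naive = sum(abs(y_true[i] - y_true[i - sp]) for i in range(sp, len(y_true)))
        naive /= max(len(y_true) - sp, 1)
        return mae / naive if naive else float('inf')

    def score(metric: Callable[..., float], y_true: List[float],
              y_pred: List[float], **metrics_kwargs: Any) -> float:
        # The builder does not interpret the kwargs; it only forwards them.
        return metric(y_true, y_pred, **metrics_kwargs)

    print(score(mase, [1.0, 2.0, 3.0, 4.0], [1.0, 2.0, 2.5, 4.5], sp=1))  # 0.25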
1 change: 1 addition & 0 deletions autoPyTorch/data/base_feature_validator.py
@@ -49,6 +49,7 @@ def __init__(
self.categories: List[List[int]] = []
self.categorical_columns: List[int] = []
self.numerical_columns: List[int] = []
self.encode_columns: List[int] = []

self.all_nan_columns: Optional[Set[Union[int, str]]] = None

28 changes: 10 additions & 18 deletions autoPyTorch/data/tabular_feature_validator.py
@@ -168,27 +168,23 @@ def _fit(

# Handle objects if possible
exist_object_columns = has_object_columns(X.dtypes.values)

if exist_object_columns:
X = self.infer_objects(X)
self.dtypes = [dt.name for dt in X.dtypes] # Also note this change in self.dtypes

self.all_nan_columns = set(all_nan_columns)

self.transformed_columns, self.feat_types = self.get_columns_to_encode(X)
self.encode_columns, self.feat_types = self.get_columns_to_encode(X)

assert self.feat_types is not None

preprocessors = get_tabular_preprocessors()
self.column_transformer = _create_column_transformer(
preprocessors=preprocessors,
categorical_columns=self.transformed_columns,
)

if len(self.enc_columns) > 0:
if len(self.encode_columns) > 0:

preprocessors = get_tabular_preprocessors()
self.column_transformer = _create_column_transformer(
preprocessors=preprocessors,
categorical_columns=self.enc_columns,
categorical_columns=self.encode_columns,
)

# Mypy redefinition
@@ -302,8 +298,8 @@ def transform(
# we change those columns to `object` dtype
# to ensure that these columns are changed to appropriate dtype
# in self.infer_objects
all_nan_cat_cols = set(X[self.enc_columns].columns[X[self.enc_columns].isna().all()])
dtype_dict = {col: 'object' for col in self.enc_columns if col in all_nan_cat_cols}
all_nan_cat_cols = set(X[self.encode_columns].columns[X[self.encode_columns].isna().all()])
dtype_dict = {col: 'object' for col in self.encode_columns if col in all_nan_cat_cols}
X = X.astype(dtype_dict)

# Check the data here so we catch problems on new test data
@@ -388,10 +384,6 @@ def _check_data(
if exist_object_columns:
X = self.infer_objects(X)

# Define the column to be encoded here as the feature validator is fitted once
# per estimator
self.transformed_columns, self.feat_types = self.get_columns_to_encode(X)

column_order = [column for column in X.columns]
if len(self.column_order) > 0:
if self.column_order != column_order:
@@ -491,8 +483,8 @@ def _get_columns_to_encode(
Type of each column numerical/categorical
"""

if len(self.transformed_columns) > 0 and self.feat_types is not None:
return self.transformed_columns, self.feat_types
if len(self.encode_columns) > 0 and self.feat_types is not None:
return self.encode_columns, self.feat_types

# Register if a column needs encoding
categorical_columns = []
@@ -503,7 +495,7 @@
for i, column in enumerate(X.columns):
if self.all_nan_columns is not None and column in self.all_nan_columns:
continue
column_dtype = self.dtypes[i]
column_dtype = self.dtypes[i] if len(self.dtypes) > 0 else X[column].dtype.name
err_msg = "Valid types are `numerical`, `categorical` or `boolean`, " \
"but input column {} has an invalid type `{}`.".format(column, column_dtype)
if column_dtype in ['category', 'bool']:
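Two details above are worth spelling out: all-NaN columns that were previously marked for encoding are recast to `object` so that dtype inference can assign them a proper type later, and `_get_columns_to_encode` now falls back to the live column dtype when the validator has not recorded dtypes yet. A minimal, self-contained sketch of the recasting step (column names are illustrative):

    import numpy as np
    import pandas as pd

    X = pd.DataFrame({
        'color': pd.Series([np.nan, np.nan], dtype='category'),  # all-NaN categorical
        'size': [1.0, 2.0],
    })
    encode_columns = ['color']

    # An all-NaN column carries no category information, so cast it to
    # 'object' and let later dtype inference decide once real values appear.
    all_nan_cat_cols = set(X[encode_columns].columns[X[encode_columns].isna().all()])
    dtype_dict = {col: 'object' for col in encode_columns if col in all_nan_cat_cols}
    X = X.astype(dtype_dict)
    print(X.dtypes['color'])  # object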
4 changes: 2 additions & 2 deletions autoPyTorch/data/time_series_feature_validator.py
@@ -37,8 +37,8 @@ def __init__(
self.series_idx: Optional[List[Union[str, int]]] = None

def get_reordered_columns(self) -> List[str]:
return self.transformed_columns + [
col for col in self.column_order if col not in set(self.transformed_columns)
return self.encode_columns + [
col for col in self.column_order if col not in set(self.encode_columns)
]

def fit(
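For clarity, the reordering puts the columns to encode first and preserves the original order of the rest; a quick illustration with made-up column names:

    encode_columns = ['shop', 'product']
    column_order = ['price', 'shop', 'demand', 'product']

    reordered = encode_columns + [
        col for col in column_order if col not in set(encode_columns)
    ]
    print(reordered)  # ['shop', 'product', 'price', 'demand']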
3 changes: 0 additions & 3 deletions autoPyTorch/datasets/resampling_strategy.py
@@ -109,10 +109,7 @@ def is_stratified(self) -> bool:

# TODO: replace it with another way
ResamplingStrategies = Union[CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes]
<<<<<<< HEAD

=======
>>>>>>> Additional metrics during train (#194)

DEFAULT_RESAMPLING_PARAMETERS: Dict[
ResamplingStrategies,
2 changes: 1 addition & 1 deletion autoPyTorch/evaluation/abstract_evaluator.py
@@ -729,7 +729,7 @@ def _loss(self, y_true: np.ndarray, y_hat: np.ndarray, **metric_kwargs: Any) ->
def finish_up(self, loss: Dict[str, float], train_loss: Dict[str, float],
valid_pred: Optional[np.ndarray], test_pred: Optional[np.ndarray],
additional_run_info: Optional[Dict], file_output: bool, status: StatusType,
opt_pred: Optional[np.ndarray],
opt_pred: Optional[np.ndarray], **metric_kwargs: Any
) -> Optional[Tuple[float, float, int, Dict]]:
"""This function does everything necessary after the fitting is done:
3 changes: 2 additions & 1 deletion autoPyTorch/optimizer/smbo.py
@@ -276,7 +276,8 @@ def __init__(self,
initial_configurations = []

if STRING_TO_TASK_TYPES.get(self.task_type, -1) == TIMESERIES_FORECASTING:
initial_configurations = self.get_init_configs_for_forecasting(config_space, kwargs)
# TODO: update search space (to remove reg cocktails) for forecasting tasks so that we can use the portfolio (or build the portfolio again)
# initial_configurations = self.get_init_configs_for_forecasting(config_space, kwargs)
# proxy-validation sets
self.min_num_test_instances: Optional[int] = kwargs.get('min_num_test_instances', # type:ignore[assignment]
None)
7 changes: 0 additions & 7 deletions autoPyTorch/pipeline/base_pipeline.py
@@ -520,7 +520,6 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]],
# needs to be updated is in components of the
# choice module
elif split_hyperparameter[0] not in components.keys():
<<<<<<< HEAD
hp_in_component = False
if hasattr(node, 'additional_components') and node.additional_components:
# This is designed for forecasting network encoder:
@@ -538,12 +537,6 @@ def _check_search_space_updates(self, include: Optional[Dict[str, Any]],
"Expected update hyperparameter "
"to be in {} got {}".format(node.__class__.__name__,
components.keys(), split_hyperparameter[0]))
=======
raise ValueError("Unknown component choice for node {}. "
"Expected update component "
"to be in {}, but got {}".format(node_name,
components.keys(), split_hyperparameter[0]))
>>>>>>> Bug fixes (#249)
else:
# check if hyperparameter is in the search space of the component
component = components[split_hyperparameter[0]]
@@ -81,11 +81,18 @@ def percentage_value_range_to_integer_range(
log = False
else:
log = hyperparameter_search_space.log

min_hyperparameter_value = hyperparameter_search_space.value_range[0]
if len(hyperparameter_search_space.value_range) > 1:
max_hyperparameter_value = hyperparameter_search_space.value_range[1]
else:
max_hyperparameter_value = hyperparameter_search_space.value_range[0]

hyperparameter_search_space = HyperparameterSearchSpace(
hyperparameter=hyperparameter_name,
value_range=(
floor(float(hyperparameter_search_space.value_range[0]) * n_features),
floor(float(hyperparameter_search_space.value_range[1]) * n_features)),
floor(float(min_hyperparameter_value) * n_features),
floor(float(max_hyperparameter_value) * n_features)),
default_value=ceil(float(hyperparameter_search_space.default_value) * n_features),
log=log)
else:
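The added branch lets a percentage-based hyperparameter range consist of a single value (a fixed percentage) without raising an IndexError on `value_range[1]`. A standalone sketch of the resulting arithmetic, with the search-space object reduced to a plain tuple for illustration:

    from math import floor
    from typing import Tuple

    def percentage_range_to_integer_range(value_range: Tuple[float, ...],
                                          n_features: int) -> Tuple[int, int]:
        # With a single-element range, minimum and maximum coincide.
        min_value = value_range[0]
        max_value = value_range[1] if len(value_range) > 1 else value_range[0]
        return (floor(float(min_value) * n_features),
                floor(float(max_value) * n_features))

    print(percentage_range_to_integer_range((0.1, 0.9), n_features=20))  # (2, 18)
    print(percentage_range_to_integer_range((0.5,), n_features=20))      # (10, 10)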
@@ -574,6 +574,10 @@ def forward(self,
past_observed_targets: Optional[torch.BoolTensor] = None,
decoder_observed_values: Optional[torch.Tensor] = None,
) -> ALL_NET_OUTPUT:

if isinstance(past_targets, dict):
past_targets, past_features, future_features, past_observed_targets = self._unwrap_past_targets(past_targets)

x_past, x_future, x_static, loc, scale, static_context_initial_hidden, _ = self.pre_processing(
past_targets=past_targets,
past_observed_targets=past_observed_targets,
@@ -603,6 +607,38 @@

return self.rescale_output(output, loc, scale, self.device)

def _unwrap_past_targets(
        self,
        past_targets: dict
) -> Tuple[torch.Tensor,
           Optional[torch.Tensor],
           Optional[torch.Tensor],
           Optional[torch.BoolTensor]]:
    """
    Time series forecasting networks require multiple inputs for the forward pass,
    which differs from how PyTorch networks usually work. SWA's update_bn in the
    trainer choice does not unwrap the input dictionary when running the forward
    pass, so we need to check for that here.
    Args:
        past_targets (dict):
            Batch dictionary mistakenly passed as the past_targets argument
    Returns:
        Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.BoolTensor]]:
            the past_targets, past_features, future_features and
            past_observed_targets entries of the batch dictionary
    """

    past_targets_copy = past_targets.copy()
    past_targets = past_targets_copy.pop('past_targets')
    past_features = past_targets_copy.pop('past_features', None)
    future_features = past_targets_copy.pop('future_features', None)
    past_observed_targets = past_targets_copy.pop('past_observed_targets', None)
    return past_targets, past_features, future_features, past_observed_targets

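For context on the docstring above: `torch.optim.swa_utils.update_bn` iterates over the data loader and only unwraps list or tuple batches before calling `model(batch)`, so a dict batch arrives intact as the first positional argument, here `past_targets`. A self-contained sketch of the situation (the module and shapes are illustrative, not the autoPyTorch network):

    import torch
    from torch import nn

    class TinyForecaster(nn.Module):
        def forward(self, past_targets, past_features=None,
                    future_features=None, past_observed_targets=None):
            # update_bn-style callers may hand us the whole batch dict.
            if isinstance(past_targets, dict):
                batch = past_targets.copy()
                past_targets = batch.pop('past_targets')
                past_features = batch.pop('past_features', None)
                future_features = batch.pop('future_features', None)
                past_observed_targets = batch.pop('past_observed_targets', None)
            return past_targets.mean(dim=1)

    model = TinyForecaster()
    batch = {'past_targets': torch.randn(4, 7, 1)}
    # Mimics update_bn, which does not unwrap dictionary batches:
    print(model(batch).shape)  # torch.Size([4, 1])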
def pred_from_net_output(self, net_output: ALL_NET_OUTPUT) -> torch.Tensor:
if self.output_type == 'regression':
return net_output
@@ -694,6 +730,10 @@ def forward(self,
future_features: Optional[torch.Tensor] = None,
past_observed_targets: Optional[torch.BoolTensor] = None,
decoder_observed_values: Optional[torch.Tensor] = None, ) -> ALL_NET_OUTPUT:

if isinstance(past_targets, dict):
past_targets, past_features, future_features, past_observed_targets = self._unwrap_past_targets(past_targets)

x_past, _, x_static, loc, scale, static_context_initial_hidden, past_targets = self.pre_processing(
past_targets=past_targets,
past_observed_targets=past_observed_targets,
@@ -983,6 +1023,10 @@ def forward(self,
future_features: Optional[torch.Tensor] = None,
past_observed_targets: Optional[torch.BoolTensor] = None,
decoder_observed_values: Optional[torch.Tensor] = None, ) -> ALL_NET_OUTPUT:

if isinstance(past_targets, dict):
past_targets, past_features, future_features, past_observed_targets = self._unwrap_past_targets(past_targets)

encode_length = min(self.window_size, past_targets.shape[1])

if past_observed_targets is None:
@@ -1250,6 +1294,9 @@ def forward(self, # type: ignore[override]
decoder_observed_values: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor,
Tuple[torch.Tensor, torch.Tensor]]:

if isinstance(past_targets, dict):
past_targets, past_features, future_features, past_observed_targets = self._unwrap_past_targets(past_targets)

# Unlike other networks, NBEATS network is required to predict both past and future targets.
# Thereby, we return two tensors for backcast and forecast
if past_observed_targets is None:
@@ -29,15 +29,10 @@ def get_output_shape(network: torch.nn.Module, input_shape: Tuple[int, ...], has
"""
placeholder = torch.randn((2, *input_shape), dtype=torch.float)
with torch.no_grad():
<<<<<<< HEAD
if has_hidden_states:
output = network(placeholder)[0]
else:
output = network(placeholder)
=======
output = network(placeholder)

>>>>>>> Bug fixes (#249)
return tuple(output.shape[1:])


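The conflict here is resolved by keeping the `has_hidden_states` branch, since recurrent encoders return an `(output, hidden_state)` tuple rather than a bare tensor. A short usage sketch of the shape-probing idea, mirroring the kept code:

    import torch
    from torch import nn

    def get_output_shape(network, input_shape, has_hidden_states=False):
        # Probe with a dummy batch of size 2; recurrent modules return
        # (output, hidden_state), so keep only the output tensor.
        placeholder = torch.randn((2, *input_shape), dtype=torch.float)
        with torch.no_grad():
            out = network(placeholder)[0] if has_hidden_states else network(placeholder)
        return tuple(out.shape[1:])

    print(get_output_shape(nn.Linear(8, 3), (8,)))  # (3,)
    print(get_output_shape(nn.LSTM(4, 16, batch_first=True), (5, 4),
                           has_hidden_states=True))  # (5, 16)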
@@ -11,14 +11,15 @@


class NetworkEmbeddingComponent(autoPyTorchSetupComponent):
def __init__(self, random_state: Optional[np.random.RandomState] = None):
super().__init__(random_state=random_state)
def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None):
super().__init__()
self.embedding: Optional[nn.Module] = None
self.random_state = random_state
self.feature_shapes: Dict[str, int] = {}

def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:

num_numerical_columns, num_input_features = self._get_required_info_from_data(X)
num_numerical_columns, num_input_features = self._get_args(X)

self.embedding, num_output_features = self.build_embedding(
num_input_features=num_input_features,
@@ -35,7 +36,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
self.feature_shapes = feature_shapes
else:
self.feature_shapes = X['dataset_properties']['feature_shapes']

return self

def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
@@ -49,39 +49,31 @@ def build_embedding(self,
num_numerical_features: int) -> Tuple[nn.Module, Optional[List[int]]]:
raise NotImplementedError

def _get_required_info_from_data(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]:
"""
Returns the number of numerical columns after preprocessing and
an array of size equal to the number of input features
containing zeros for numerical data and number of categories
for categorical data. This is required to build the embedding.
Args:
X (Dict[str, Any]):
Fit dictionary
Returns:
Tuple[int, np.ndarray]:
number of numerical columns and array indicating
number of categories for categorical columns and
0 for numerical columns
"""
def _get_args(self, X: Dict[str, Any]) -> Tuple[int, np.ndarray]:
# Feature preprocessors can alter numerical columns
if len(X['dataset_properties']['numerical_columns']) == 0:
num_numerical_columns = 0
else:
X_train = copy.deepcopy(X['backend'].load_datamanager().train_tensors[0][:2])

numerical_column_transformer = X['tabular_transformer'].preprocessor. \
named_transformers_['numerical_pipeline']
num_numerical_columns = numerical_column_transformer.transform(
X_train[:, X['dataset_properties']['numerical_columns']]).shape[1]

num_cols = num_numerical_columns + len(X['dataset_properties']['categorical_columns'])
num_input_feats = np.zeros(num_cols, dtype=np.int32)

if 'tabular_transformer' in X:
numerical_column_transformer = X['tabular_transformer'].preprocessor. \
named_transformers_['numerical_pipeline']
elif 'time_series_feature_transformer' in X:
numerical_column_transformer = X['time_series_feature_transformer'].preprocessor. \
named_transformers_['numerical_pipeline']
else:
raise ValueError("Either a tabular or time_series transformer must be contained!")
if hasattr(X_train, 'iloc'):
num_numerical_columns = numerical_column_transformer.transform(
X_train.iloc[:, X['dataset_properties']['numerical_columns']]).shape[1]
else:
num_numerical_columns = numerical_column_transformer.transform(
X_train[:, X['dataset_properties']['numerical_columns']]).shape[1]
num_input_features = np.zeros((num_numerical_columns + len(X['dataset_properties']['categorical_columns'])),
dtype=np.int32)
categories = X['dataset_properties']['categories']
for idx, cats in enumerate(categories, start=num_numerical_columns):
num_input_feats[idx] = len(cats)

return num_numerical_columns, num_input_feats
for i, category in enumerate(categories):
    num_input_features[num_numerical_columns + i] = len(category)
return num_numerical_columns, num_input_features
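`_get_args` ultimately builds an array with one entry per post-preprocessing input column: zero for numerical columns and the category count for categorical ones, which the embedding module uses to size its lookup tables. A standalone sketch of that construction:

    import numpy as np

    num_numerical_columns = 3
    categories = [['red', 'green'], ['s', 'm', 'l', 'xl']]  # two categorical columns

    num_input_features = np.zeros(
        num_numerical_columns + len(categories), dtype=np.int32)
    for i, category in enumerate(categories):
        # Numerical columns stay 0; categorical ones record their cardinality.
        num_input_features[num_numerical_columns + i] = len(category)

    print(num_input_features)  # [0 0 0 2 4]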
@@ -254,8 +254,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> torch.utils.data.DataLoader:
self.val_transform,
train=False,
)

if X['dataset_properties']["is_small_preprocess"]:
if X['dataset_properties'].get("is_small_preprocess", True):
# This parameter indicates that the data has been pre-processed for speed
# Overwrite the datamanager with the pre-processes data
datamanager.replace_data(X['X_train'],
@@ -616,3 +615,16 @@ def __str__(self) -> str:
""" Allow a nice understanding of what components where used """
string = self.train_data_loader.__class__.__name__
return string

def _check_transform_requirements(self, X: Dict[str, Any], y: Any = None) -> None:
"""
Makes sure that the fit dictionary contains the required transformations
that the dataset should go through
Args:
X (Dict[str, Any]): Dictionary with fitted parameters. It is a message passing
mechanism, in which during a transform, a component adds relevant information
so that further stages can be properly fitted
"""
pass
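The fit dictionary described in this docstring is the pipeline's message-passing mechanism: each component reads what it needs from `X` and writes back whatever later stages require. A toy sketch of the pattern (the component and keys are illustrative, not autoPyTorch's):

    from typing import Any, Dict

    class ScalerComponent:
        def fit(self, X: Dict[str, Any]) -> 'ScalerComponent':
            self.mean = sum(X['X_train']) / len(X['X_train'])
            return self

        def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
            # Publish fitted state so downstream components can consume it.
            X.update({'scaler_mean': self.mean})
            return X

    fit_dictionary = {'X_train': [1.0, 2.0, 3.0]}
    fit_dictionary = ScalerComponent().fit(fit_dictionary).transform(fit_dictionary)
    print(fit_dictionary['scaler_mean'])  # 2.0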
