diff --git a/supervised/automl.py b/supervised/automl.py index 14da3cd6..b2f1906b 100644 --- a/supervised/automl.py +++ b/supervised/automl.py @@ -367,13 +367,15 @@ def fit( """ return self._fit(X, y, sample_weight, cv) - def predict(self, X: Union[List, numpy.ndarray, pandas.DataFrame]) -> numpy.ndarray: + def predict(self, X: Union[List, numpy.ndarray, pandas.DataFrame], model_name=None) -> numpy.ndarray: """ Computes predictions from AutoML best model. Arguments: X (list or numpy.ndarray or pandas.DataFrame): Input values to make predictions on. + model_name (str): + Name of the model that must be loaded Returns: numpy.ndarray: @@ -384,10 +386,11 @@ def predict(self, X: Union[List, numpy.ndarray, pandas.DataFrame]) -> numpy.ndar Raises: AutoMLException: Model has not yet been fitted. """ - return self._predict(X) + + return self._predict(X,model_name) def predict_proba( - self, X: Union[List, numpy.ndarray, pandas.DataFrame] + self, X: Union[List, numpy.ndarray, pandas.DataFrame], model_name=None ) -> numpy.ndarray: """ Computes class probabilities from AutoML best model. @@ -396,6 +399,8 @@ def predict_proba( Arguments: X (list or numpy.ndarray or pandas.DataFrame): Input values to make predictions on. + model_name (str): + Name of the model that must be loaded Returns: numpy.ndarray of shape (n_samples, n_classes): @@ -405,10 +410,10 @@ def predict_proba( AutoMLException: Model has not yet been fitted. """ - return self._predict_proba(X) + return self._predict_proba(X,model_name) def predict_all( - self, X: Union[List, numpy.ndarray, pandas.DataFrame] + self, X: Union[List, numpy.ndarray, pandas.DataFrame], model_name=None ) -> pandas.DataFrame: """ Computes both class probabilities and class labels for classification tasks. @@ -417,6 +422,8 @@ def predict_all( Arguments: X (list or numpy.ndarray or pandas.DataFrame): Input values to make predictions on. + model_name (str): + Name of the model that must be loaded Returns: pandas.Dataframe: @@ -428,7 +435,7 @@ def predict_all( AutoMLException: Model has not yet been fitted. """ - return self._predict_all(X) + return self._predict_all(X,model_name) def score( self, diff --git a/supervised/base_automl.py b/supervised/base_automl.py index 65ea17e7..74785fd0 100644 --- a/supervised/base_automl.py +++ b/supervised/base_automl.py @@ -83,7 +83,7 @@ def __init__(self): self._top_models_to_improve = None self._random_state = 1234 self._models = [] # instances of iterative learner framework or ensemble - self._best_model = None + self._model = None self._verbose = True self._threshold = None # used only in classification self._metrics_details = None @@ -128,7 +128,7 @@ def _check_can_load(self): self.load(self.results_path) self._results_path = self.results_path - def load(self, path): + def load(self, path, model_name=None): logger.info("Loading AutoML models ...") try: params = json.load(open(os.path.join(path, "params.json"))) @@ -174,9 +174,10 @@ def load(self, path): self._n_jobs = params.get("n_jobs", self._n_jobs) self._random_state = params.get("random_state", self._random_state) stacked_models = params.get("stacked") - - best_model_name = params.get("best_model") - load_on_predict = params.get("load_on_predict") + load_on_predict = None + if model_name is None: + model_name = params.get("best_model") + load_on_predict = params.get("load_on_predict") self._fit_level = params.get("fit_level") lazy_load = not ( self._fit_level is not None and self._fit_level == "finished" @@ -186,7 +187,7 @@ def load(self, path): load_models = load_on_predict # just in case there is check for which models should be loaded # fix https://github.com/mljar/mljar-supervised/issues/395 - models_needed = self.models_needed_on_predict(best_model_name) + models_needed = self.models_needed_on_predict(model_name) # join them and return unique list load_models = list(np.unique(load_models + models_needed)) @@ -204,12 +205,14 @@ def load(self, path): self._models += [m] models_map[m.get_name()] = m - self._best_model = None - if best_model_name is not None: - self._best_model = models_map.get(best_model_name) + self._model = None + if model_name is not None: + if model_name not in models_map: + raise ValueError(f"model name {model_name} does not exist in file") + self._model = models_map[model_name] if stacked_models is not None and ( - self._best_model._is_stacked or self._fit_level != "finished" + self._model._is_stacked or self._fit_level != "finished" ): self._stacked_models = [] for stacked_model_name in stacked_models: @@ -1120,7 +1123,7 @@ def _fit(self, X, y, sample_weight=None, cv=None): self.verbose_print( f"AutoML fit time: {np.round(time.time() - self._start_time,2)} seconds" ) - self.verbose_print(f"AutoML best model: {self._best_model.get_name()}") + self.verbose_print(f"AutoML best model: {self._model.get_name()}") except Exception as e: raise e @@ -1143,7 +1146,7 @@ def _update_errors_report(self, model_name, error_msg): def select_and_save_best(self, show_warnings=False): # Select best model based on the lowest loss - self._best_model = None + self._model = None if self._models: model_list = [ m @@ -1151,14 +1154,14 @@ def select_and_save_best(self, show_warnings=False): if m.is_valid() and m.is_fast_enough(self._max_single_prediction_time) ] if model_list: - self._best_model = min( + self._model = min( model_list, key=lambda x: x.get_final_loss(), ) # if none selected please select again and warn the user if ( len(self._models) - and self._best_model is None + and self._model is None and self._max_single_prediction_time is not None ): if show_warnings: @@ -1171,7 +1174,7 @@ def select_and_save_best(self, show_warnings=False): ) self.verbose_print(msg) - self._best_model = min( + self._model = min( [m for m in self._models if m.is_valid()], key=lambda x: x.get_final_loss(), ) @@ -1204,11 +1207,11 @@ def select_and_save_best(self, show_warnings=False): "saved": self._model_subpaths, "fit_level": self._fit_level, } - if self._best_model is not None: - params["best_model"] = self._best_model.get_name() + if self._model is not None: + params["best_model"] = self._model.get_name() load_on_predict = [] - load_on_predict += self._best_model.involved_model_names() - if self._best_model._is_stacked and self._stacked_models is not None: + load_on_predict += self._model.involved_model_names() + if self._model._is_stacked and self._stacked_models is not None: for m in self._stacked_models: load_on_predict += m.involved_model_names() params["load_on_predict"] = list(np.unique(load_on_predict)) @@ -1224,7 +1227,7 @@ def select_and_save_best(self, show_warnings=False): # save report ldb.insert(loc=0, column="Best model", value="") ldb.loc[ - ldb.name == self._best_model.get_name(), "Best model" + ldb.name == self._model.get_name(), "Best model" ] = "**the best**" ldb["name"] = [f"[{m}]({m}/README.md)" for m in ldb["name"].values] @@ -1287,11 +1290,10 @@ def models_needed_on_predict(self, required_model_name): ) def _base_predict(self, X, model=None): - if model is None: - if self._best_model is None: + if self._model is None: self.load(self.results_path) - model = self._best_model + model = self._model if model is None: raise AutoMLException( @@ -1356,9 +1358,10 @@ def _base_predict(self, X, model=None): else: return predictions - def _predict(self, X): - - predictions = self._base_predict(X) + def _predict(self, X,model_name=None): + if model_name is not None: + self.load(self._results_path,model_name) + predictions = self._base_predict(X,self._model) # Return predictions # If classification task the result is in column 'label' # If regression task the result is in column 'prediction' @@ -1368,7 +1371,7 @@ def _predict(self, X): else predictions["prediction"].to_numpy() ) - def _predict_proba(self, X): + def _predict_proba(self, X,model_name): # Check is task type is correct if self._ml_task == REGRESSION: raise AutoMLException( @@ -1378,11 +1381,17 @@ def _predict_proba(self, X): # Make and return predictions # If classification task the result is in column 'label' # Need to drop `label` column. - return self._base_predict(X).drop(["label"], axis=1).to_numpy() - - def _predict_all(self, X): + model=None + if model_name is not None: + model = self.load(self._results_path,model_name) + return self._base_predict(X,model).drop(["label"], axis=1).to_numpy() + + def _predict_all(self, X,model_name): + model = None + if model_name is not None: + model = self.load(self.results_path, model_name) # Make and return predictions - return self._base_predict(X) + return self._base_predict(X,model) def _score(self, X, y=None, sample_weight=None): # y default must be None for scikit-learn compatibility @@ -2025,11 +2034,11 @@ def _validate_random_state(self): check_positive_integer(self.random_state, "random_state") def to_json(self): - if self._best_model is None: + if self._model is None: return None return { - "best_model": self._best_model.to_json(), + "best_model": self._model.to_json(), "threshold": self._threshold, "ml_task": self._ml_task, } @@ -2037,11 +2046,11 @@ def to_json(self): def from_json(self, json_data): if json_data["best_model"]["algorithm_short_name"] == "Ensemble": - self._best_model = Ensemble() - self._best_model.from_json(json_data["best_model"]) + self._model = Ensemble() + self._model.from_json(json_data["best_model"]) else: - self._best_model = ModelFramework(json_data["best_model"].get("params")) - self._best_model.from_json(json_data["best_model"]) + self._model = ModelFramework(json_data["best_model"].get("params")) + self._model.from_json(json_data["best_model"]) self._threshold = json_data.get("threshold") self._ml_task = json_data.get("ml_task") @@ -2254,7 +2263,7 @@ def _report(self, width=900, height=1200): def _need_retrain(self, X, y, sample_weight, decrease): - metric = self._best_model.get_metric() + metric = self._model.get_metric() X, y, sample_weight = ExcludeRowsMissingTarget.transform( X, y, sample_weight, warn=True @@ -2270,7 +2279,7 @@ def _need_retrain(self, X, y, sample_weight, decrease): sign = -1.0 if Metric.optimize_negative(metric.name) else 1.0 new_score = metric(y, prediction, sample_weight) - old_score = self._best_model.get_final_loss() + old_score = self._model.get_final_loss() change = np.abs((old_score - new_score) / old_score) diff --git a/tests/tests_automl/test_specific_model.py b/tests/tests_automl/test_specific_model.py new file mode 100644 index 00000000..0dcec0f1 --- /dev/null +++ b/tests/tests_automl/test_specific_model.py @@ -0,0 +1,40 @@ +import os +import unittest +import pytest +import json +import shutil + +import supervised.exceptions +from supervised import AutoML +from sklearn import datasets + +iris = datasets.load_iris() + +class ModelSelectionTest(unittest.TestCase): + + automl_dir = "model_selection_tests" + + def tearDown(self): + shutil.rmtree(self.automl_dir, ignore_errors=True) + + def test_choose_model(self): + model = AutoML( + explain_level=0, verbose=1, random_state=1, results_path=self.automl_dir + ) + model.fit(iris.data, iris.target) + params = json.load(open(os.path.join(self.automl_dir, "params.json"))) + for model_name in params['saved']: + model.predict(iris.data,model_name) + model.predict_all(iris.data, model_name) + model.predict_proba(iris.data, model_name) + + def test_raise_with_wrong_model(self): + model = AutoML( + explain_level=0, verbose=1, random_state=1, results_path=self.automl_dir + ) + model.fit(iris.data, iris.target) + msg = "Cannot load AutoML directory. model name random_name does not exist in file" + with pytest.raises(supervised.exceptions.AutoMLException, match=msg): + model.predict(iris.data, "random_name") + +