From a9bc8dc3c5a118c48d32436ae8e762890a7dc66f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?AntonioGonz=C3=A1lez?= Date: Thu, 11 Jan 2024 11:50:03 +0100 Subject: [PATCH 01/14] Initial version of experiment_tracking.py --- mango/models/experiment_tracking.py | 370 ++++++++++++++++++ .../models_module/test_experiment_tracking.py | 310 +++++++++++++++ requirements-dev.txt | 1 + 3 files changed, 681 insertions(+) create mode 100644 mango/models/experiment_tracking.py create mode 100644 mango/tests/models_module/test_experiment_tracking.py diff --git a/mango/models/experiment_tracking.py b/mango/models/experiment_tracking.py new file mode 100644 index 00000000..ddb2e7f6 --- /dev/null +++ b/mango/models/experiment_tracking.py @@ -0,0 +1,370 @@ +import json +import os +import pickle +import shutil +from datetime import datetime +from enum import Enum +from typing import Dict, Any + +import pandas as pd +import sklearn.base +from catboost import CatBoost +from lightgbm import LGBMModel +from sklearn.metrics import ( + mean_absolute_error, + mean_squared_error, + median_absolute_error, + r2_score, + confusion_matrix, + precision_score, + f1_score, + recall_score, +) + + +class ProblemType(Enum): + """ + Basic enum to represent the problem type. + """ + + REGRESSION = "regression" + CLASSIFICATION = "classification" + + +class ModelLibrary(Enum): + """ + Basic enum to represent the model library. + """ + + SCIKIT_LEARN = "scikit-learn" + CATBOOST = "catboost" + LIGHTGBM = "lightgbm" + + +def generate_metrics_regression( + y_true: pd.Series, y_pred: pd.Series +) -> Dict[str, float]: + """ + Generate common metrics for regression and return them in a dictionary. The metrics are: + - R2 score + - Mean absolute error + - Mean squared error + - Root mean squared error + - Median absolute error + + :param y_true: The true values. + :param y_pred: The predicted values. + :return: A dictionary of metrics. + """ + return { + "r2_score": round(r2_score(y_true, y_pred), 4), + "mean_absolute_error": round(mean_absolute_error(y_true, y_pred), 4), + "mean_squared_error": round(mean_squared_error(y_true, y_pred), 4), + "root_mean_squared_error": round( + mean_squared_error(y_true, y_pred, squared=False), 4 + ), + "median_absolute_error": round(median_absolute_error(y_true, y_pred), 4), + } + + +def generate_metrics_classification( + y_true: pd.Series, y_pred: pd.Series +) -> Dict[str, float]: + """ + Generate common metrics for classification and return them in a dictionary. The metrics for binary classification + are: + - Confusion matrix + - Accuracy + - Precision + - Recall + - F1 score + + In case It is a multiclass classification, the metrics are: + - Confusion matrix + - Accuracy + - Precision macro + - Recall macro + - F1 score macro + + + :param y_true: The true values. + :param y_pred: The predicted values. + :return: A dictionary of metrics. 
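    A quick illustrative example for the binary case (a small hand-made pair of series,
    chosen so that every metric works out to 0.5; rounding to 4 decimals follows the
    implementation below):

    >>> y_true = pd.Series([0, 1, 1, 0])
    >>> y_pred = pd.Series([0, 0, 1, 1])
    >>> generate_metrics_classification(y_true, y_pred)
    {'confusion_matrix': [[1, 1], [1, 1]], 'accuracy': 0.5, 'precision': 0.5, 'recall': 0.5, 'f1_score': 0.5}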
+ """ + if len(y_true.unique()) == 2: + return { + "confusion_matrix": confusion_matrix(y_true, y_pred).tolist(), + "accuracy": round((y_true == y_pred).sum() / len(y_true), 4), + "precision": round(precision_score(y_true, y_pred), 4), + "recall": round(recall_score(y_true, y_pred), 4), + "f1_score": round(f1_score(y_true, y_pred), 4), + } + else: + return { + "confusion_matrix": confusion_matrix(y_true, y_pred).tolist(), + "accuracy": round((y_true == y_pred).sum() / len(y_true), 4), + "precision_macro": round( + precision_score(y_true, y_pred, average="macro"), 4 + ), + "recall_macro": round(recall_score(y_true, y_pred, average="macro"), 4), + "f1_score_macro": round(f1_score(y_true, y_pred, average="macro"), 4), + } + + +def export_model( + model: Any, + X_train: pd.DataFrame, + y_train: pd.Series, + X_test: pd.DataFrame, + y_test: pd.Series, + base_path: str, + save_model: bool = True, + save_datasets: bool = False, + zip_files: bool = True, +): + """ + Register model and metrics in a json file and save the model and datasets in a folder. + :param model: A model from one of the supported libraries. Currently supported libraries are: + - scikit-learn + - catboost + - lightgbm + :param X_train: Training data as a pandas dataframe. + :param y_train: Training target as a pandas series. + :param X_test: Test data as a pandas dataframe. + :param y_test: Test target as a pandas series. + :param base_path: Path to the base folder where the model and datasets will be saved in a subfolder structure. + :param zip_files: Whether to zip the files or not. + :param save_datasets: Whether to save the datasets or not. + :param save_model: Whether to save the model or not. + :return: The path to the subfolder inside base_path where the model and datasets have been saved. 
+ + Usage + ----- + >>> from sklearn.datasets import fetch_california_housing + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import train_test_split + >>> from mango.models.experiment_tracking import export_model + >>> + >>> + >>> X, y = fetch_california_housing(return_X_y=True, as_frame=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y) + >>> model = LogisticRegression() + >>> model.fit(X_train, y_train) + >>> output_folder = export_model(model, X_train, y_train, X_test, y_test, "/my_experiments_folder") + >>> print(output_folder) # /my_experiments_folder/experiment_LogisticRegression_YYYYMMDD-HHMMSS + + Subfolder structure + ------------------- + The subfolder structure will be the following: + |- base_path + |- experiment_{model_name}_{datetime} + |- model + |- model.pkl + + |- hyperparameters.json + |- data + |- X_train.csv + + |- y_train.csv + + |- X_test.csv + + |- y_test.csv + |- summary.json + """ + _SUPPORTED_LIBRARIES_CLASSES = { + ModelLibrary.SCIKIT_LEARN: sklearn.base.BaseEstimator, + ModelLibrary.CATBOOST: CatBoost, + ModelLibrary.LIGHTGBM: LGBMModel, + } + if not os.path.exists(base_path): + raise FileNotFoundError(f"Folder {base_path} does not exist.") + model_name = model.__class__.__name__ + model_library = None + for library, class_name in _SUPPORTED_LIBRARIES_CLASSES.items(): + if isinstance(model, class_name): + model_library = library + if model_library is None: + raise ValueError(f"Model {model_name} is not supported.") + + # Detect if it is a classification or regression model + if hasattr(model, "predict_proba"): + problem_type = ProblemType.CLASSIFICATION + else: + problem_type = ProblemType.REGRESSION + + # Intended structure + # summary = { + # "model": { + # "name": "", + # "problem_type": "", + # # Optional "num_classes": 0, if classification + # "input": "", + # "target": "", + # "hyperparameters": {}, + # "library": "", + # }, + # "results": {}, + # # Optional + # # "files": { + # # "model": { + # # "zip": "", + # # "model.pkl": "", + # # "hyperparameters.json": "", + # # }, + # # "data": { + # # "zip": "", + # # "X_train.csv": "", + # # "y_train.csv": "", + # # "X_test.csv": "", + # # "y_test.csv": "", + # # }, + # # }, + # } + summary = {} + extra_params = [] + # Fill structure + summary["model"] = {} + summary["model"]["name"] = model_name + summary["model"]["problem_type"] = problem_type.value + summary["model"]["target"] = y_train.name + summary["model"]["library"] = model_library.value + if model_library == ModelLibrary.CATBOOST: + summary["model"]["input"] = list(model.feature_names_) + summary["model"]["hyperparameters"] = model.get_all_params() + elif model_library == ModelLibrary.SCIKIT_LEARN: + summary["model"]["input"] = list(model.feature_names_in_) + summary["model"]["hyperparameters"] = model.get_params(deep=True) + elif model_library == ModelLibrary.LIGHTGBM: + summary["model"]["input"] = list(model.feature_name_) + summary["model"]["hyperparameters"] = model.get_params(deep=True) + + # Sort keys in summary["model"] + if problem_type == ProblemType.CLASSIFICATION: + summary["model"]["num_classes"] = len(y_train.unique()) + # Sort keys in summary["model"] to be: name, problem_type, num_classes, input, target, hyperparameters, library + summary["model"] = { + k: summary["model"][k] + for k in [ + "name", + "problem_type", + "num_classes", + "input", + "target", + "hyperparameters", + "library", + ] + } + else: + # Sort keys in summary["model"] to be: name, problem_type, input, 
target, hyperparameters, library + summary["model"] = { + k: summary["model"][k] + for k in [ + "name", + "problem_type", + "input", + "target", + "hyperparameters", + "library", + ] + } + + # Generate metrics + if model_library == ModelLibrary.CATBOOST: + y_train_pred = pd.Series(model.predict(X_train).reshape(-1)).reset_index( + drop=True + ) + y_test_pred = pd.Series(model.predict(X_test).reshape(-1)).reset_index( + drop=True + ) + elif model_library in [ModelLibrary.SCIKIT_LEARN, ModelLibrary.LIGHTGBM]: + y_train_pred = pd.Series(model.predict(X_train)).reset_index(drop=True) + y_test_pred = pd.Series(model.predict(X_test)).reset_index(drop=True) + + if problem_type == ProblemType.CLASSIFICATION: + summary["results"] = { + "train_score": generate_metrics_classification( + y_train.reset_index(drop=True), y_train_pred + ), + "test_score": generate_metrics_classification( + y_test.reset_index(drop=True), y_test_pred + ), + } + elif problem_type == ProblemType.REGRESSION: + summary["results"] = { + "train_score": generate_metrics_regression( + y_train.reset_index(drop=True), y_train_pred + ), + "test_score": generate_metrics_regression( + y_test.reset_index(drop=True), y_test_pred + ), + } + + # Prepare environment to save files + folder_name = os.path.join( + base_path, + f"experiment_{model_name}_{datetime.now().strftime('%Y%m%d-%H%M%S')}", + ) + + # Compress model and save + if save_model: + os.makedirs(os.path.join(folder_name, "model")) + if not "files" in summary: + summary["files"] = {} + if not "model" in summary["files"]: + summary["files"]["model"] = {} + # Save hyperparameters + hyperparameters_path = os.path.join( + folder_name, "model", "hyperparameters.json" + ) + summary["files"]["model"]["hyperparameters.json"] = os.path.abspath( + hyperparameters_path + ) + with open(hyperparameters_path, "w") as f: + json.dump(summary["model"]["hyperparameters"], f, indent=4) + # Save the model + model_path = os.path.join(folder_name, "model", "model.pkl") + summary["files"]["model"]["model.pkl"] = os.path.abspath(model_path) + with open(model_path, "wb") as f: + pickle.dump(model, f) + if zip_files: + zip_path = os.path.join(folder_name, "model.zip") + summary["files"]["model"]["zip"] = os.path.abspath(zip_path) + shutil.make_archive( + zip_path.rstrip(".zip"), "zip", os.path.join(folder_name, "model") + ) + shutil.rmtree(os.path.join(folder_name, "model")) + + if save_datasets: + os.makedirs(os.path.join(folder_name, "data")) + if not "files" in summary: + summary["files"] = {} + if not "data" in summary["files"]: + summary["files"]["data"] = {} + X_train_path = os.path.join(folder_name, "data", "X_train.csv") + summary["files"]["data"]["X_train.csv"] = os.path.abspath(X_train_path) + X_train.to_csv(X_train_path, index=False) + y_train_path = os.path.join(folder_name, "data", "y_train.csv") + summary["files"]["data"]["y_train.csv"] = os.path.abspath(y_train_path) + y_train.to_csv(y_train_path, index=False) + X_test_path = os.path.join(folder_name, "data", "X_test.csv") + summary["files"]["data"]["X_test.csv"] = os.path.abspath(X_test_path) + X_test.to_csv(X_test_path, index=False) + y_test_path = os.path.join(folder_name, "data", "y_test.csv") + summary["files"]["data"]["y_test.csv"] = os.path.abspath(y_test_path) + y_test.to_csv(y_test_path, index=False) + if zip_files: + # Compress data and save + zip_path = os.path.join(folder_name, "data.zip") + summary["files"]["data"]["zip"] = os.path.abspath(zip_path) + shutil.make_archive( + zip_path.rstrip(".zip"), "zip", 
os.path.join(folder_name, "data") + ) + shutil.rmtree(os.path.join(folder_name, "data")) + + # Save json + json_path = os.path.join(folder_name, "summary.json") + with open(json_path, "w", encoding="utf-8") as f: + json.dump(summary, f, indent=4, ensure_ascii=False) + + return folder_name diff --git a/mango/tests/models_module/test_experiment_tracking.py b/mango/tests/models_module/test_experiment_tracking.py new file mode 100644 index 00000000..ef011364 --- /dev/null +++ b/mango/tests/models_module/test_experiment_tracking.py @@ -0,0 +1,310 @@ +import os +import pickle +import shutil +from unittest import TestCase + +import numpy as np +import pandas as pd +from catboost import CatBoostClassifier, CatBoostRegressor +from pandas.testing import assert_frame_equal +from sklearn.datasets import make_classification, make_regression +from sklearn.linear_model import LinearRegression, LogisticRegression +from lightgbm import LGBMClassifier, LGBMRegressor + +from mango.models.experiment_tracking import ( + export_model, ProblemType, +) + + +class InvalidModel: + """ + Dummy class to test errors + """ + pass + + +class TestExperimentTracking(TestCase): + """ + Tes suite for the experiment tracking module inside models. + """ + folder_name = "test_experiment_tracking" + + @classmethod + def setUpClass(cls): + """ + Create data for the tests and needed folders. + """ + os.makedirs(cls.folder_name, exist_ok=True) + + # Classification + X_clf, y_clf = make_classification( + n_samples=1000, n_features=10, random_state=42, n_classes=3, n_informative=5 + ) + X_clf = pd.DataFrame(X_clf, columns=[f"feature_{i}" for i in range(10)]) + y_clf = pd.Series(y_clf, name="target") + + # Shuffle + X_clf = X_clf.sample(frac=1, random_state=42) + y_clf = y_clf[X_clf.index] + + # Split + cls.X_train_clf = X_clf[: int(len(X_clf) * 0.8)].reset_index(drop=True) + cls.y_train_clf = y_clf[: int(len(y_clf) * 0.8)].reset_index(drop=True) + cls.X_test_clf = X_clf[int(len(X_clf) * 0.8):].reset_index(drop=True) + cls.y_test_clf = y_clf[int(len(y_clf) * 0.8):].reset_index(drop=True) + + # Regression + X_reg, y_reg = make_regression(n_samples=1000, n_features=10, random_state=42) + X_reg = pd.DataFrame(X_reg, columns=[f"feature_{i}" for i in range(10)]) + y_reg = pd.Series(y_reg, name="target") + + # Shuffle + X_reg = X_reg.sample(frac=1, random_state=42) + y_reg = y_reg[X_reg.index] + + # Split + cls.X_train_reg = X_reg[: int(len(X_reg) * 0.8)].reset_index(drop=True) + cls.y_train_reg = y_reg[: int(len(y_reg) * 0.8)].reset_index(drop=True) + cls.X_test_reg = X_reg[int(len(X_reg) * 0.8):].reset_index(drop=True) + cls.y_test_reg = y_reg[int(len(y_reg) * 0.8):].reset_index(drop=True) + + @classmethod + def tearDownClass(cls): + """ + Delete the folders created for the tests. + """ + if os.path.exists(cls.folder_name): + shutil.rmtree(cls.folder_name) + + def _check_model_with_zip(self, output_folder): + """ + Helper function to check the model is saved correctly when zip_files is True. 
+ """ + # Assert zip files are saved + self.assertTrue(os.path.exists(os.path.join(output_folder, "model.zip"))) + self.assertTrue(os.path.exists(os.path.join(output_folder, "data.zip"))) + + # Assert files are saved correctly + self.assertTrue(os.path.exists(os.path.join(output_folder, "summary.json"))) + + # Assert files are not saved + self.assertFalse( + os.path.exists(os.path.join(output_folder, "model", "model.pkl")) + ) + self.assertFalse( + os.path.exists(os.path.join(output_folder, "model", "hyperparameters.json")) + ) + self.assertFalse( + os.path.exists(os.path.join(output_folder, "data", "X_train.csv")) + ) + self.assertFalse( + os.path.exists(os.path.join(output_folder, "data", "y_train.csv")) + ) + self.assertFalse( + os.path.exists(os.path.join(output_folder, "data", "X_test.csv")) + ) + self.assertFalse( + os.path.exists(os.path.join(output_folder, "data", "y_test.csv")) + ) + + # Assert subfolder not saved + self.assertFalse(os.path.exists(os.path.join(output_folder, "model"))) + self.assertFalse(os.path.exists(os.path.join(output_folder, "data"))) + + def _check_model_without_zip(self, model, output_folder, problem_type): + """ + Helper function to check the model is saved correctly when zip_files is False. + """ + # Assert folders are saved correctly + self.assertTrue(os.path.exists(os.path.join(output_folder, "model"))) + self.assertTrue(os.path.exists(os.path.join(output_folder, "data"))) + # Assert files are saved correctly + self.assertTrue(os.path.exists(os.path.join(output_folder, "summary.json"))) + self.assertTrue( + os.path.exists(os.path.join(output_folder, "model", "hyperparameters.json")) + ) + self.assertTrue( + os.path.exists(os.path.join(output_folder, "model", "model.pkl")) + ) + self.assertTrue( + os.path.exists(os.path.join(output_folder, "data", "X_train.csv")) + ) + self.assertTrue( + os.path.exists(os.path.join(output_folder, "data", "y_train.csv")) + ) + self.assertTrue( + os.path.exists(os.path.join(output_folder, "data", "X_test.csv")) + ) + self.assertTrue( + os.path.exists(os.path.join(output_folder, "data", "y_test.csv")) + ) + # Assert zip files are not saved + self.assertFalse(os.path.exists(os.path.join(output_folder, "model.zip"))) + self.assertFalse(os.path.exists(os.path.join(output_folder, "data.zip"))) + # Assert files are valid for data folder + X_train = pd.read_csv(os.path.join(output_folder, "data", "X_train.csv")) + y_train = pd.read_csv(os.path.join(output_folder, "data", "y_train.csv")).values + X_test = pd.read_csv(os.path.join(output_folder, "data", "X_test.csv")) + y_test = pd.read_csv(os.path.join(output_folder, "data", "y_test.csv")).values + if problem_type == ProblemType.CLASSIFICATION: + assert_frame_equal(X_train, self.X_train_clf) + self.assertListEqual(list([y for y in y_train.reshape(-1)]), list([y for y in self.y_train_clf.values])) + assert_frame_equal(X_test, self.X_test_clf) + self.assertListEqual(list([y for y in y_test.reshape(-1)]), list([y for y in self.y_test_clf.values])) + elif problem_type == ProblemType.REGRESSION: + assert_frame_equal(X_train, self.X_train_reg) + self.assertListEqual(list([round(y, 4) for y in y_train.reshape(-1)]), + list([round(y, 4) for y in self.y_train_reg.values])) + assert_frame_equal(X_test, self.X_test_reg) + self.assertListEqual(list([round(y, 4) for y in y_test.reshape(-1)]), + list([round(y, 4) for y in self.y_test_reg.values])) + else: + raise ValueError("Problem type not supported") + # Assert model is the same + # Assert model is the same + with 
open(os.path.join(output_folder, "model", "model.pkl"), "rb") as f: + model_load = pickle.load(f) + + # Generate predictions from both models + original_predictions = model.predict(self.X_test_reg) + loaded_predictions = model_load.predict(self.X_test_reg) + + # Check if the predictions are almost the same + self.assertTrue(np.allclose(original_predictions, loaded_predictions)) + + def test_serialize_sklearn(self): + """ + Test serialization of a sklearn model. + """ + model = LinearRegression() + model.fit(self.X_train_reg, self.y_train_reg) + output_folder = export_model( + model, + self.X_train_reg, + self.y_train_reg, + self.X_test_reg, + self.y_test_reg, + self.folder_name, + save_model=True, + save_datasets=True, + zip_files=False, + ) + self._check_model_without_zip(output_folder=output_folder, model=model, problem_type=ProblemType.REGRESSION) + # Assert works for classification with Zip + model = LogisticRegression() + model.fit(self.X_train_clf, self.y_train_clf) + output_folder = export_model( + model, + self.X_train_clf, + self.y_train_clf, + self.X_test_clf, + self.y_test_clf, + self.folder_name, + save_model=True, + save_datasets=True, + zip_files=True, + ) + self._check_model_with_zip(output_folder=output_folder) + + def test_serialize_catboost(self): + """ + Test serialization of a CatBoost model. + """ + model = CatBoostClassifier(allow_writing_files=False, verbose=5, iterations=10) + model.fit(self.X_train_clf, self.y_train_clf) + output_folder = export_model( + model, + self.X_train_clf, + self.y_train_clf, + self.X_test_clf, + self.y_test_clf, + self.folder_name, + save_model=True, + save_datasets=True, + zip_files=False, + ) + self._check_model_without_zip(output_folder=output_folder, model=model, problem_type=ProblemType.CLASSIFICATION) + + # Assert works for regression with Zip + model = CatBoostRegressor(allow_writing_files=False, verbose=5, iterations=10) + model.fit(self.X_train_reg, self.y_train_reg) + output_folder = export_model( + model, + self.X_train_reg, + self.y_train_reg, + self.X_test_reg, + self.y_test_reg, + self.folder_name, + save_model=True, + save_datasets=True, + zip_files=True, + ) + self._check_model_with_zip(output_folder=output_folder) + + def test_serialize_lightgbm(self): + """ + Test serialization of a LightGBM model. + """ + model = LGBMClassifier() + model.fit(self.X_train_clf, self.y_train_clf) + output_folder = export_model( + model, + self.X_train_clf, + self.y_train_clf, + self.X_test_clf, + self.y_test_clf, + self.folder_name, + save_model=True, + save_datasets=True, + zip_files=False, + ) + self._check_model_without_zip(output_folder=output_folder, model=model, problem_type=ProblemType.CLASSIFICATION) + + # Assert works for regression with Zip + model = LGBMRegressor() + model.fit(self.X_train_reg, self.y_train_reg) + output_folder = export_model( + model, + self.X_train_reg, + self.y_train_reg, + self.X_test_reg, + self.y_test_reg, + self.folder_name, + save_model=True, + save_datasets=True, + zip_files=True, + ) + self._check_model_with_zip(output_folder=output_folder) + + def test_errors(self): + """ + Test errors raised by the function. 
+ """ + # Not supported model + model = InvalidModel() + with self.assertRaises(ValueError): + export_model( + model, + self.X_train_reg, + self.y_train_reg, + self.X_test_reg, + self.y_test_reg, + self.folder_name, + save_model=True, + save_datasets=True, + zip_files=False, + ) + + # Invalid folder + with self.assertRaises(FileNotFoundError): + export_model( + model, + self.X_train_reg, + self.y_train_reg, + self.X_test_reg, + self.y_test_reg, + "invalid_folder", + save_model=True, + save_datasets=True, + zip_files=False, + ) diff --git a/requirements-dev.txt b/requirements-dev.txt index 9727b12c..2b067f88 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -3,6 +3,7 @@ coverage scikit-learn==1.3.2 lightgbm==4.1.0 xgboost==2.0.2 +catboost sphinx shibuya sphinxcontrib-bibtex \ No newline at end of file From 2900bc86864c04a357f3a7bd8e78af21980a94af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?AntonioGonz=C3=A1lez?= Date: Thu, 11 Jan 2024 12:29:41 +0100 Subject: [PATCH 02/14] Lazy imports --- mango/models/experiment_tracking.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/mango/models/experiment_tracking.py b/mango/models/experiment_tracking.py index ddb2e7f6..9f41c20f 100644 --- a/mango/models/experiment_tracking.py +++ b/mango/models/experiment_tracking.py @@ -7,9 +7,7 @@ from typing import Dict, Any import pandas as pd -import sklearn.base -from catboost import CatBoost -from lightgbm import LGBMModel +from sklearn.base import BaseEstimator from sklearn.metrics import ( mean_absolute_error, mean_squared_error, @@ -172,11 +170,21 @@ def export_model( |- y_test.csv |- summary.json """ - _SUPPORTED_LIBRARIES_CLASSES = { - ModelLibrary.SCIKIT_LEARN: sklearn.base.BaseEstimator, - ModelLibrary.CATBOOST: CatBoost, - ModelLibrary.LIGHTGBM: LGBMModel, - } + _SUPPORTED_LIBRARIES_CLASSES = {} + _SUPPORTED_LIBRARIES_CLASSES[ModelLibrary.SCIKIT_LEARN] = BaseEstimator + try: + from catboost import CatBoost + + _SUPPORTED_LIBRARIES_CLASSES[ModelLibrary.CATBOOST] = CatBoost + except ImportError: + pass + try: + from lightgbm import LGBMModel + + _SUPPORTED_LIBRARIES_CLASSES[ModelLibrary.LIGHTGBM] = LGBMModel + except ImportError: + pass + if not os.path.exists(base_path): raise FileNotFoundError(f"Folder {base_path} does not exist.") model_name = model.__class__.__name__ From db6e65fbbadc2b27ad9d6c13d929dd03b5623b4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?AntonioGonz=C3=A1lez?= Date: Thu, 11 Jan 2024 15:13:44 +0100 Subject: [PATCH 03/14] Added documentation --- docs/source/dev/models.rst | 115 ++++++++++++++++++++++++++++ mango/models/experiment_tracking.py | 109 ++++++++++++-------------- 2 files changed, 163 insertions(+), 61 deletions(-) diff --git a/docs/source/dev/models.rst b/docs/source/dev/models.rst index bdd25b9a..da11ac48 100644 --- a/docs/source/dev/models.rst +++ b/docs/source/dev/models.rst @@ -59,3 +59,118 @@ Pyomo ~~~~~~ .. automodule:: mango.models.pyomo + +Machine Learning +================ + +Metrics +~~~~~~~~ + +.. autofunction:: mango.models.experiment_tracking.generate_metrics_classification +.. autofunction:: mango.models.experiment_tracking.generate_metrics_regression + +Model tracking +~~~~~~~~~~~~~~~ + +The following fucntion attempts to export the model and the data used to train it. The model is saved as a pickle file and the data is saved as csv files. The function also saves a summary of the model in a json file. This way many models (experiments) can be saved in the same folder and the user can easily compare them. 
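A minimal end-to-end sketch of the intended workflow (the dataset, model and folder names here are
illustrative; only the ``export_model`` call itself comes from this module, and the base folder must
already exist before the call):

.. code-block:: python

    import os

    import pandas as pd
    from sklearn.datasets import make_regression
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split

    from mango.models.experiment_tracking import export_model

    # Build a small regression problem as pandas objects, since export_model
    # expects DataFrames/Series (it reads the column names and the target name).
    X, y = make_regression(n_samples=200, n_features=5, random_state=42)
    X = pd.DataFrame(X, columns=[f"feature_{i}" for i in range(5)])
    y = pd.Series(y, name="target")
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    model = LinearRegression()
    model.fit(X_train, y_train)

    # export_model raises FileNotFoundError if the base folder does not exist.
    os.makedirs("my_experiments_folder", exist_ok=True)
    output_folder = export_model(
        model,
        X_train,
        y_train,
        X_test,
        y_test,
        base_path="my_experiments_folder",
        save_model=True,
        save_datasets=True,
        zip_files=False,
    )
    print(output_folder)  # my_experiments_folder/experiment_LinearRegression_<timestamp>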
+ +.. autofunction:: mango.models.experiment_tracking.export_model + +The subfolder structure after running export_model is the following: + +If not zipped: + +.. code-block:: bash + + base_path + |-- experiment_LinearRegression_20240111-133955 + | `-- summary.json + | |-- data + | | |-- X_test.csv + | | |-- X_train.csv + | | |-- y_test.csv + | | `-- y_train.csv + | `-- model + | |-- hyperparameters.json + | `-- model.pkl + +In case of zipped: + +.. code-block:: bash + + base_path + |-- experiment_LinearRegression_20240111-133955 + | |-- summary.json + | |-- data.zip + | `-- model.zip + + +The following is an example of the summary.json file: + +.. code-block:: json + + { + "model": { + "name": "LinearRegression", + "problem_type": "regression", + "input": "X_train.csv", + "target": "y_train.csv", + "hyperparameters": { + "fit_intercept": true, + "normalize": false, + "copy_X": true, + "n_jobs": null + }, + "library": "sklearn" + }, + "results": { + "train": { + "r2": 0.9999999999999999, + "rmse": 0.0, + "mae": 0.0 + }, + "test": { + "r2": 0.9999999999999999, + "rmse": 0.0, + "mae": 0.0 + } + } + } + +If save_dataset is set to True, the JSON file will also contain the following: + +.. code-block:: json + + { + "data": { + "X_train": { + "path": "X_train.csv", + "shape": [ + 100, + 2 + ] + }, + "y_train": { + "path": "y_train.csv", + "shape": [ + 100, + 1 + ] + }, + "X_test": { + "path": "X_test.csv", + "shape": [ + 100, + 2 + ] + }, + "y_test": { + "path": "y_test.csv", + "shape": [ + 100, + 1 + ] + } + } + } + diff --git a/mango/models/experiment_tracking.py b/mango/models/experiment_tracking.py index 9f41c20f..70060f40 100644 --- a/mango/models/experiment_tracking.py +++ b/mango/models/experiment_tracking.py @@ -51,8 +51,19 @@ def generate_metrics_regression( - Median absolute error :param y_true: The true values. + :type y_true: :class:`pandas.Series` :param y_pred: The predicted values. + :type y_pred: :class:`pandas.Series` :return: A dictionary of metrics. + :rtype: :class:`dict` + + Usage + ----- + >>> y_true = pd.Series([3, -0.5, 2, 7]) + >>> y_pred = pd.Series([2.5, 0.0, 2, 8]) + >>> metrics = generate_metrics_regression(y_true, y_pred) + >>> print(metrics) + {'r2_score': 0.9486, 'mean_absolute_error': 0.5, 'mean_squared_error': 0.375, 'root_mean_squared_error': 0.6124, 'median_absolute_error': 0.5} """ return { "r2_score": round(r2_score(y_true, y_pred), 4), @@ -84,10 +95,20 @@ def generate_metrics_classification( - Recall macro - F1 score macro - :param y_true: The true values. + :type y_true: :class:`pandas.Series` :param y_pred: The predicted values. + :type y_pred: :class:`pandas.Series` :return: A dictionary of metrics. + :rtype: :class:`dict` + + Usage + ----- + >>> y_true = pd.Series([0, 1, 1, 0]) + >>> y_pred = pd.Series([0, 0, 1, 1]) + >>> metrics = generate_metrics_classification(y_true, y_pred) + >>> print(metrics) + {'confusion_matrix': [[1, 1], [1, 1]], 'accuracy': 0.5, 'precision': 0.5, 'recall': 0.5, 'f1_score': 0.5} """ if len(y_true.unique()) == 2: return { @@ -122,53 +143,39 @@ def export_model( ): """ Register model and metrics in a json file and save the model and datasets in a folder. - :param model: A model from one of the supported libraries. Currently supported libraries are: - - scikit-learn - - catboost - - lightgbm + + :param model: A model from one of the supported libraries. + :type model: :class:`Any` :param X_train: Training data as a pandas dataframe. + :type X_train: :class:`pandas.DataFrame` :param y_train: Training target as a pandas series. 
+ :type y_train: :class:`pandas.Series` :param X_test: Test data as a pandas dataframe. + :type X_test: :class:`pandas.DataFrame` :param y_test: Test target as a pandas series. + :type y_test: :class:`pandas.Series` :param base_path: Path to the base folder where the model and datasets will be saved in a subfolder structure. + :type base_path: :class:`str` :param zip_files: Whether to zip the files or not. + :type zip_files: :class:`bool` :param save_datasets: Whether to save the datasets or not. + :type save_datasets: :class:`bool` :param save_model: Whether to save the model or not. + :type save_model: :class:`bool` :return: The path to the subfolder inside base_path where the model and datasets have been saved. + :rtype: :class:`str` Usage ----- >>> from sklearn.datasets import fetch_california_housing >>> from sklearn.linear_model import LogisticRegression >>> from sklearn.model_selection import train_test_split - >>> from mango.models.experiment_tracking import export_model - >>> - >>> >>> X, y = fetch_california_housing(return_X_y=True, as_frame=True) >>> X_train, X_test, y_train, y_test = train_test_split(X, y) >>> model = LogisticRegression() >>> model.fit(X_train, y_train) >>> output_folder = export_model(model, X_train, y_train, X_test, y_test, "/my_experiments_folder") >>> print(output_folder) # /my_experiments_folder/experiment_LogisticRegression_YYYYMMDD-HHMMSS - - Subfolder structure - ------------------- - The subfolder structure will be the following: - |- base_path - |- experiment_{model_name}_{datetime} - |- model - |- model.pkl - - |- hyperparameters.json - |- data - |- X_train.csv - - |- y_train.csv - - |- X_test.csv - - |- y_test.csv - |- summary.json """ _SUPPORTED_LIBRARIES_CLASSES = {} _SUPPORTED_LIBRARIES_CLASSES[ModelLibrary.SCIKIT_LEARN] = BaseEstimator @@ -200,35 +207,6 @@ def export_model( problem_type = ProblemType.CLASSIFICATION else: problem_type = ProblemType.REGRESSION - - # Intended structure - # summary = { - # "model": { - # "name": "", - # "problem_type": "", - # # Optional "num_classes": 0, if classification - # "input": "", - # "target": "", - # "hyperparameters": {}, - # "library": "", - # }, - # "results": {}, - # # Optional - # # "files": { - # # "model": { - # # "zip": "", - # # "model.pkl": "", - # # "hyperparameters.json": "", - # # }, - # # "data": { - # # "zip": "", - # # "X_train.csv": "", - # # "y_train.csv": "", - # # "X_test.csv": "", - # # "y_test.csv": "", - # # }, - # # }, - # } summary = {} extra_params = [] # Fill structure @@ -350,21 +328,30 @@ def export_model( if not "data" in summary["files"]: summary["files"]["data"] = {} X_train_path = os.path.join(folder_name, "data", "X_train.csv") - summary["files"]["data"]["X_train.csv"] = os.path.abspath(X_train_path) + summary["files"]["data"]["X_train"] = {} + summary["files"]["data"]["X_train"]["path"] = os.path.abspath(X_train_path) + summary["files"]["data"]["X_train"]["shape"] = X_train.shape X_train.to_csv(X_train_path, index=False) y_train_path = os.path.join(folder_name, "data", "y_train.csv") - summary["files"]["data"]["y_train.csv"] = os.path.abspath(y_train_path) + summary["files"]["data"]["y_train"] = {} + summary["files"]["data"]["y_train"]["path"] = os.path.abspath(y_train_path) + summary["files"]["data"]["y_train"]["shape"] = y_train.shape y_train.to_csv(y_train_path, index=False) X_test_path = os.path.join(folder_name, "data", "X_test.csv") - summary["files"]["data"]["X_test.csv"] = os.path.abspath(X_test_path) + summary["files"]["data"]["X_test"] = {} + 
summary["files"]["data"]["X_test"]["path"] = os.path.abspath(X_test_path) + summary["files"]["data"]["X_test"]["shape"] = X_test.shape X_test.to_csv(X_test_path, index=False) y_test_path = os.path.join(folder_name, "data", "y_test.csv") - summary["files"]["data"]["y_test.csv"] = os.path.abspath(y_test_path) + summary["files"]["data"]["y_test"] = {} + summary["files"]["data"]["y_test"]["path"] = os.path.abspath(y_test_path) + summary["files"]["data"]["y_test"]["shape"] = y_test.shape y_test.to_csv(y_test_path, index=False) if zip_files: # Compress data and save zip_path = os.path.join(folder_name, "data.zip") - summary["files"]["data"]["zip"] = os.path.abspath(zip_path) + summary["files"]["data"]["zip"] = {} + summary["files"]["data"]["zip"]["path"] = os.path.abspath(zip_path) shutil.make_archive( zip_path.rstrip(".zip"), "zip", os.path.join(folder_name, "data") ) From f2eae145373c3b9492035d235c21c4bb3b552b7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?AntonioGonz=C3=A1lez?= Date: Thu, 11 Jan 2024 15:47:17 +0100 Subject: [PATCH 04/14] Added metrics.py to avoid sklearn dependency --- docs/source/dev/models.rst | 5 +- mango/models/experiment_tracking.py | 105 +--------- mango/models/metrics.py | 302 ++++++++++++++++++++++++++++ 3 files changed, 308 insertions(+), 104 deletions(-) create mode 100644 mango/models/metrics.py diff --git a/docs/source/dev/models.rst b/docs/source/dev/models.rst index da11ac48..2b29543d 100644 --- a/docs/source/dev/models.rst +++ b/docs/source/dev/models.rst @@ -66,8 +66,9 @@ Machine Learning Metrics ~~~~~~~~ -.. autofunction:: mango.models.experiment_tracking.generate_metrics_classification -.. autofunction:: mango.models.experiment_tracking.generate_metrics_regression +As a part of mango we have implemented some metrics that are used to evaluate the performance of the models. The metrics are implemented in the following module. + +.. automodule:: mango.models.metrics Model tracking ~~~~~~~~~~~~~~~ diff --git a/mango/models/experiment_tracking.py b/mango/models/experiment_tracking.py index 70060f40..b4fb8324 100644 --- a/mango/models/experiment_tracking.py +++ b/mango/models/experiment_tracking.py @@ -4,20 +4,12 @@ import shutil from datetime import datetime from enum import Enum -from typing import Dict, Any +from typing import Any import pandas as pd from sklearn.base import BaseEstimator -from sklearn.metrics import ( - mean_absolute_error, - mean_squared_error, - median_absolute_error, - r2_score, - confusion_matrix, - precision_score, - f1_score, - recall_score, -) + +from mango.models.metrics import generate_metrics_regression, generate_metrics_classification class ProblemType(Enum): @@ -39,97 +31,6 @@ class ModelLibrary(Enum): LIGHTGBM = "lightgbm" -def generate_metrics_regression( - y_true: pd.Series, y_pred: pd.Series -) -> Dict[str, float]: - """ - Generate common metrics for regression and return them in a dictionary. The metrics are: - - R2 score - - Mean absolute error - - Mean squared error - - Root mean squared error - - Median absolute error - - :param y_true: The true values. - :type y_true: :class:`pandas.Series` - :param y_pred: The predicted values. - :type y_pred: :class:`pandas.Series` - :return: A dictionary of metrics. 
- :rtype: :class:`dict` - - Usage - ----- - >>> y_true = pd.Series([3, -0.5, 2, 7]) - >>> y_pred = pd.Series([2.5, 0.0, 2, 8]) - >>> metrics = generate_metrics_regression(y_true, y_pred) - >>> print(metrics) - {'r2_score': 0.9486, 'mean_absolute_error': 0.5, 'mean_squared_error': 0.375, 'root_mean_squared_error': 0.6124, 'median_absolute_error': 0.5} - """ - return { - "r2_score": round(r2_score(y_true, y_pred), 4), - "mean_absolute_error": round(mean_absolute_error(y_true, y_pred), 4), - "mean_squared_error": round(mean_squared_error(y_true, y_pred), 4), - "root_mean_squared_error": round( - mean_squared_error(y_true, y_pred, squared=False), 4 - ), - "median_absolute_error": round(median_absolute_error(y_true, y_pred), 4), - } - - -def generate_metrics_classification( - y_true: pd.Series, y_pred: pd.Series -) -> Dict[str, float]: - """ - Generate common metrics for classification and return them in a dictionary. The metrics for binary classification - are: - - Confusion matrix - - Accuracy - - Precision - - Recall - - F1 score - - In case It is a multiclass classification, the metrics are: - - Confusion matrix - - Accuracy - - Precision macro - - Recall macro - - F1 score macro - - :param y_true: The true values. - :type y_true: :class:`pandas.Series` - :param y_pred: The predicted values. - :type y_pred: :class:`pandas.Series` - :return: A dictionary of metrics. - :rtype: :class:`dict` - - Usage - ----- - >>> y_true = pd.Series([0, 1, 1, 0]) - >>> y_pred = pd.Series([0, 0, 1, 1]) - >>> metrics = generate_metrics_classification(y_true, y_pred) - >>> print(metrics) - {'confusion_matrix': [[1, 1], [1, 1]], 'accuracy': 0.5, 'precision': 0.5, 'recall': 0.5, 'f1_score': 0.5} - """ - if len(y_true.unique()) == 2: - return { - "confusion_matrix": confusion_matrix(y_true, y_pred).tolist(), - "accuracy": round((y_true == y_pred).sum() / len(y_true), 4), - "precision": round(precision_score(y_true, y_pred), 4), - "recall": round(recall_score(y_true, y_pred), 4), - "f1_score": round(f1_score(y_true, y_pred), 4), - } - else: - return { - "confusion_matrix": confusion_matrix(y_true, y_pred).tolist(), - "accuracy": round((y_true == y_pred).sum() / len(y_true), 4), - "precision_macro": round( - precision_score(y_true, y_pred, average="macro"), 4 - ), - "recall_macro": round(recall_score(y_true, y_pred, average="macro"), 4), - "f1_score_macro": round(f1_score(y_true, y_pred, average="macro"), 4), - } - - def export_model( model: Any, X_train: pd.DataFrame, diff --git a/mango/models/metrics.py b/mango/models/metrics.py new file mode 100644 index 00000000..cded8c37 --- /dev/null +++ b/mango/models/metrics.py @@ -0,0 +1,302 @@ +from typing import Dict + +import pandas as pd + + +# Define the metrics without sklearn +def r2_score(y_true: pd.Series, y_pred: pd.Series) -> float: + """ + Calculate the R2 score for regression. + + :param y_true: The true values. + :type y_true: :class:`pandas.Series` + :param y_pred: The predicted values. + :type y_pred: :class:`pandas.Series` + :return: The R2 score. + :rtype: :class:`float` + + Usage + ----- + >>> y_true = pd.Series([3, -0.5, 2, 7]) + >>> y_pred = pd.Series([2.5, 0.0, 2, 8]) + >>> r2_score(y_true, y_pred) + 0.9486 + """ + mean_y_true = y_true.mean() + ss_tot = ((y_true - mean_y_true) ** 2).sum() + ss_res = ((y_true - y_pred) ** 2).sum() + return 1 - ss_res / ss_tot + + +def mean_absolute_error(y_true: pd.Series, y_pred: pd.Series) -> float: + """ + Calculate the mean absolute error for regression. + + :param y_true: The true values. 
+ :type y_true: :class:`pandas.Series` + :param y_pred: The predicted values. + :type y_pred: :class:`pandas.Series` + :return: The mean absolute error. + :rtype: :class:`float` + + Usage + ----- + >>> y_true = pd.Series([3, -0.5, 2, 7]) + >>> y_pred = pd.Series([2.5, 0.0, 2, 8]) + >>> mean_absolute_error(y_true, y_pred) + 0.5 + """ + return (y_true - y_pred).abs().mean() + + +def mean_squared_error( + y_true: pd.Series, y_pred: pd.Series, squared: bool = True +) -> float: + """ + Calculate the mean squared error for regression. + + :param y_true: The true values. + :type y_true: :class:`pandas.Series` + :param y_pred: The predicted values. + :type y_pred: :class:`pandas.Series` + :param squared: Whether to return the squared error or not. + :type squared: :class:`bool` + :return: The mean squared error. + :rtype: :class:`float` + + Usage + ----- + >>> y_true = pd.Series([3, -0.5, 2, 7]) + >>> y_pred = pd.Series([2.5, 0.0, 2, 8]) + >>> mean_squared_error(y_true, y_pred) + 0.375 + """ + mse = ((y_true - y_pred) ** 2).mean() + if squared: + return mse + else: + return mse**0.5 + + +def median_absolute_error(y_true: pd.Series, y_pred: pd.Series) -> float: + """ + Calculate the median absolute error for regression. + + :param y_true: The true values. + :type y_true: :class:`pandas.Series` + :param y_pred: The predicted values. + :type y_pred: :class:`pandas.Series` + :return: The median absolute error. + :rtype: :class:`float` + + Usage + ----- + >>> y_true = pd.Series([3, -0.5, 2, 7]) + >>> y_pred = pd.Series([2.5, 0.0, 2, 8]) + >>> median_absolute_error(y_true, y_pred) + 0.5 + """ + return (y_true - y_pred).abs().median() + + +def confusion_matrix(y_true: pd.Series, y_pred: pd.Series) -> pd.DataFrame: + """ + Calculate the confusion matrix for classification. + + :param y_true: The true values. + :type y_true: :class:`pandas.Series` + :param y_pred: The predicted values. + :type y_pred: :class:`pandas.Series` + :return: The confusion matrix. + :rtype: :class:`pandas.DataFrame` + + Usage + ----- + >>> y_true = pd.Series([0, 1, 1, 0]) + >>> y_pred = pd.Series([0, 0, 1, 1]) + >>> confusion_matrix(y_true, y_pred) + array([[1, 1], + [1, 1]]) + """ + return pd.crosstab(y_true, y_pred) + + +def precision_score( + y_true: pd.Series, y_pred: pd.Series, average: str = "binary" +) -> float: + """ + Calculate the precision score for classification. + + :param y_true: The true values. + :type y_true: :class:`pandas.Series` + :param y_pred: The predicted values. + :type y_pred: :class:`pandas.Series` + :param average: The type of averaging performed. + :type average: :class:`str` + :return: The precision score. + :rtype: :class:`float` + + Usage + ----- + >>> y_true = pd.Series([0, 1, 1, 0]) + >>> y_pred = pd.Series([0, 0, 1, 1]) + >>> precision_score(y_true, y_pred) + 0.5 + """ + if average == "binary": + return ((y_true == 1) & (y_pred == 1)).sum() / (y_pred == 1).sum() + elif average == "macro": + return ( + ((y_true == 1) & (y_pred == 1)).sum() + + ((y_true == 0) & (y_pred == 0)).sum() + ) / len(y_true) + else: + raise ValueError(f"{average} is not a valid value for average.") + + +def recall_score( + y_true: pd.Series, y_pred: pd.Series, average: str = "binary" +) -> float: + """ + Calculate the recall score for classification. + + :param y_true: The true values. + :type y_true: :class:`pandas.Series` + :param y_pred: The predicted values. + :type y_pred: :class:`pandas.Series` + :param average: The type of averaging performed. + :type average: :class:`str` + :return: The recall score. 
+ :rtype: :class:`float` + + Usage + ----- + >>> y_true = pd.Series([0, 1, 1, 0]) + >>> y_pred = pd.Series([0, 0, 1, 1]) + >>> recall_score(y_true, y_pred) + 0.5 + """ + if average == "binary": + return ((y_true == 1) & (y_pred == 1)).sum() / (y_true == 1).sum() + elif average == "macro": + return ( + ((y_true == 1) & (y_pred == 1)).sum() + + ((y_true == 0) & (y_pred == 0)).sum() + ) / len(y_true) + else: + raise ValueError(f"{average} is not a valid value for average.") + + +def f1_score(y_true: pd.Series, y_pred: pd.Series, average: str = "binary") -> float: + """ + Calculate the F1 score for classification. + + :param y_true: The true values. + :type y_true: :class:`pandas.Series` + :param y_pred: The predicted values. + :type y_pred: :class:`pandas.Series` + :param average: The type of averaging performed. + :type average: :class:`str` + :return: The F1 score. + :rtype: :class:`float` + + Usage + ----- + >>> y_true = pd.Series([0, 1, 1, 0]) + >>> y_pred = pd.Series([0, 0, 1, 1]) + >>> f1_score(y_true, y_pred) + 0.5 + """ + precision = precision_score(y_true, y_pred, average=average) + recall = recall_score(y_true, y_pred, average=average) + return 2 * (precision * recall) / (precision + recall) + + +def generate_metrics_regression( + y_true: pd.Series, y_pred: pd.Series +) -> Dict[str, float]: + """ + Generate common metrics for regression and return them in a dictionary. The metrics are: + - R2 score + - Mean absolute error + - Mean squared error + - Root mean squared error + - Median absolute error + + :param y_true: The true values. + :type y_true: :class:`pandas.Series` + :param y_pred: The predicted values. + :type y_pred: :class:`pandas.Series` + :return: A dictionary of metrics. + :rtype: :class:`dict` + + Usage + ----- + >>> y_true = pd.Series([3, -0.5, 2, 7]) + >>> y_pred = pd.Series([2.5, 0.0, 2, 8]) + >>> metrics = generate_metrics_regression(y_true, y_pred) + >>> print(metrics) + {'r2_score': 0.9486, 'mean_absolute_error': 0.5, 'mean_squared_error': 0.375, 'root_mean_squared_error': 0.6124, 'median_absolute_error': 0.5} + """ + return { + "r2_score": round(r2_score(y_true, y_pred), 4), + "mean_absolute_error": round(mean_absolute_error(y_true, y_pred), 4), + "mean_squared_error": round(mean_squared_error(y_true, y_pred), 4), + "root_mean_squared_error": round( + mean_squared_error(y_true, y_pred, squared=False), 4 + ), + "median_absolute_error": round(median_absolute_error(y_true, y_pred), 4), + } + + +def generate_metrics_classification( + y_true: pd.Series, y_pred: pd.Series +) -> Dict[str, float]: + """ + Generate common metrics for classification and return them in a dictionary. The metrics for binary classification are: + - Confusion matrix + - Accuracy + - Precision + - Recall + - F1 score + + In case It is a multiclass classification, the metrics are: + - Confusion matrix + - Accuracy + - Precision macro + - Recall macro + - F1 score macro + + :param y_true: The true values. + :type y_true: :class:`pandas.Series` + :param y_pred: The predicted values. + :type y_pred: :class:`pandas.Series` + :return: A dictionary of metrics. 
+ :rtype: :class:`dict` + + Usage + ----- + >>> y_true = pd.Series([0, 1, 1, 0]) + >>> y_pred = pd.Series([0, 0, 1, 1]) + >>> metrics = generate_metrics_classification(y_true, y_pred) + >>> print(metrics) + {'confusion_matrix': [[1, 1], [1, 1]], 'accuracy': 0.5, 'precision': 0.5, 'recall': 0.5, 'f1_score': 0.5} + """ + if len(y_true.unique()) == 2: + return { + "confusion_matrix": confusion_matrix(y_true, y_pred).tolist(), + "accuracy": round((y_true == y_pred).sum() / len(y_true), 4), + "precision": round(precision_score(y_true, y_pred), 4), + "recall": round(recall_score(y_true, y_pred), 4), + "f1_score": round(f1_score(y_true, y_pred), 4), + } + else: + return { + "confusion_matrix": confusion_matrix(y_true, y_pred).tolist(), + "accuracy": round((y_true == y_pred).sum() / len(y_true), 4), + "precision_macro": round( + precision_score(y_true, y_pred, average="macro"), 4 + ), + "recall_macro": round(recall_score(y_true, y_pred, average="macro"), 4), + "f1_score_macro": round(f1_score(y_true, y_pred, average="macro"), 4), + } From 0816e6734b1a0f88de6c4af568221721c2943005 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?AntonioGonz=C3=A1lez?= Date: Thu, 11 Jan 2024 15:49:59 +0100 Subject: [PATCH 05/14] Added metrics.py to avoid sklearn dependency --- mango/models/experiment_tracking.py | 13 ++++++++++--- mango/models/metrics.py | 5 +++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/mango/models/experiment_tracking.py b/mango/models/experiment_tracking.py index b4fb8324..aa0d34a9 100644 --- a/mango/models/experiment_tracking.py +++ b/mango/models/experiment_tracking.py @@ -7,9 +7,11 @@ from typing import Any import pandas as pd -from sklearn.base import BaseEstimator -from mango.models.metrics import generate_metrics_regression, generate_metrics_classification +from mango.models.metrics import ( + generate_metrics_regression, + generate_metrics_classification, +) class ProblemType(Enum): @@ -79,7 +81,12 @@ def export_model( >>> print(output_folder) # /my_experiments_folder/experiment_LogisticRegression_YYYYMMDD-HHMMSS """ _SUPPORTED_LIBRARIES_CLASSES = {} - _SUPPORTED_LIBRARIES_CLASSES[ModelLibrary.SCIKIT_LEARN] = BaseEstimator + try: + from sklearn.base import BaseEstimator + + _SUPPORTED_LIBRARIES_CLASSES[ModelLibrary.SCIKIT_LEARN] = BaseEstimator + except ImportError: + pass try: from catboost import CatBoost diff --git a/mango/models/metrics.py b/mango/models/metrics.py index cded8c37..726e9536 100644 --- a/mango/models/metrics.py +++ b/mango/models/metrics.py @@ -1,5 +1,6 @@ from typing import Dict +import numpy as np import pandas as pd @@ -99,7 +100,7 @@ def median_absolute_error(y_true: pd.Series, y_pred: pd.Series) -> float: return (y_true - y_pred).abs().median() -def confusion_matrix(y_true: pd.Series, y_pred: pd.Series) -> pd.DataFrame: +def confusion_matrix(y_true: pd.Series, y_pred: pd.Series) -> np.ndarray: """ Calculate the confusion matrix for classification. @@ -118,7 +119,7 @@ def confusion_matrix(y_true: pd.Series, y_pred: pd.Series) -> pd.DataFrame: array([[1, 1], [1, 1]]) """ - return pd.crosstab(y_true, y_pred) + return pd.crosstab(y_true, y_pred).to_numpy() def precision_score( From de3815fc8d7901247e52f4cc027e8cc66eb7a1e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?AntonioGonz=C3=A1lez?= Date: Fri, 12 Jan 2024 17:34:42 +0100 Subject: [PATCH 06/14] Missing documentation. 
Added first version of MLTracking --- mango/models/experiment_tracking.py | 511 ++++++++++++++++-- .../models_module/test_experiment_tracking.py | 32 +- 2 files changed, 496 insertions(+), 47 deletions(-) diff --git a/mango/models/experiment_tracking.py b/mango/models/experiment_tracking.py index aa0d34a9..15a633a6 100644 --- a/mango/models/experiment_tracking.py +++ b/mango/models/experiment_tracking.py @@ -1,4 +1,6 @@ import json +import logging + import os import pickle import shutil @@ -7,8 +9,10 @@ from typing import Any import pandas as pd +from sklearn.compose import ColumnTransformer +from sklearn.pipeline import Pipeline -from mango.models.metrics import ( +from .metrics import ( generate_metrics_regression, generate_metrics_classification, ) @@ -22,6 +26,14 @@ class ProblemType(Enum): REGRESSION = "regression" CLASSIFICATION = "classification" + # When creating a new one convert to lowercase + @classmethod + def _missing_(cls, value: str): + for member in cls: + if member.value.lower() == value.lower(): + return member + return super()._missing_(value) + class ModelLibrary(Enum): """ @@ -32,6 +44,20 @@ class ModelLibrary(Enum): CATBOOST = "catboost" LIGHTGBM = "lightgbm" +def _json_serializable(value): + try: + json.dumps(value) + return True + except (TypeError, OverflowError): + return False + +def _clean_hyperparameters(hyperparameters): + for key, value in hyperparameters.items(): + if isinstance(value, dict): + _clean_hyperparameters(value) + elif not _json_serializable(value): + hyperparameters[key] = str(value) + return hyperparameters def export_model( model: Any, @@ -40,13 +66,14 @@ def export_model( X_test: pd.DataFrame, y_test: pd.Series, base_path: str, + description: str = None, + custom_folder_name: str = None, save_model: bool = True, save_datasets: bool = False, zip_files: bool = True, ): """ Register model and metrics in a json file and save the model and datasets in a folder. - :param model: A model from one of the supported libraries. :type model: :class:`Any` :param X_train: Training data as a pandas dataframe. @@ -57,8 +84,12 @@ def export_model( :type X_test: :class:`pandas.DataFrame` :param y_test: Test target as a pandas series. :type y_test: :class:`pandas.Series` + :param description: Description of the experiment. + :type description: :class:`str` :param base_path: Path to the base folder where the model and datasets will be saved in a subfolder structure. :type base_path: :class:`str` + :param custom_folder_name: Custom name for the folder where the model and datasets will be saved. + :type custom_folder_name: :class:`str` :param zip_files: Whether to zip the files or not. :type zip_files: :class:`bool` :param save_datasets: Whether to save the datasets or not. @@ -67,7 +98,6 @@ def export_model( :type save_model: :class:`bool` :return: The path to the subfolder inside base_path where the model and datasets have been saved. 
:rtype: :class:`str` - Usage ----- >>> from sklearn.datasets import fetch_california_housing @@ -118,6 +148,7 @@ def export_model( summary = {} extra_params = [] # Fill structure + summary["description"] = description summary["model"] = {} summary["model"]["name"] = model_name summary["model"]["problem_type"] = problem_type.value @@ -133,6 +164,9 @@ def export_model( summary["model"]["input"] = list(model.feature_name_) summary["model"]["hyperparameters"] = model.get_params(deep=True) + # Clean hyperparameters for the sklearn pipeline or other non-serializable objects + _clean_hyperparameters(summary["model"]["hyperparameters"]) + # Sort keys in summary["model"] if problem_type == ProblemType.CLASSIFICATION: summary["model"]["num_classes"] = len(y_train.unique()) @@ -195,10 +229,9 @@ def export_model( } # Prepare environment to save files - folder_name = os.path.join( - base_path, - f"experiment_{model_name}_{datetime.now().strftime('%Y%m%d-%H%M%S')}", - ) + folder_name_default = f"experiment_{model_name}_{datetime.now().strftime('%Y%m%d-%H%M%S')}" + folder_name = custom_folder_name or folder_name_default + folder_name = os.path.join(base_path, folder_name) # Compress model and save if save_model: @@ -230,40 +263,40 @@ def export_model( shutil.rmtree(os.path.join(folder_name, "model")) if save_datasets: - os.makedirs(os.path.join(folder_name, "data")) + os.makedirs(os.path.join(folder_name, "datasets")) if not "files" in summary: summary["files"] = {} - if not "data" in summary["files"]: - summary["files"]["data"] = {} - X_train_path = os.path.join(folder_name, "data", "X_train.csv") - summary["files"]["data"]["X_train"] = {} - summary["files"]["data"]["X_train"]["path"] = os.path.abspath(X_train_path) - summary["files"]["data"]["X_train"]["shape"] = X_train.shape + if not "datasets" in summary["files"]: + summary["files"]["datasets"] = {} + X_train_path = os.path.join(folder_name, "datasets", "X_train.csv") + summary["files"]["datasets"]["X_train"] = {} + summary["files"]["datasets"]["X_train"]["path"] = os.path.abspath(X_train_path) + summary["files"]["datasets"]["X_train"]["shape"] = X_train.shape X_train.to_csv(X_train_path, index=False) - y_train_path = os.path.join(folder_name, "data", "y_train.csv") - summary["files"]["data"]["y_train"] = {} - summary["files"]["data"]["y_train"]["path"] = os.path.abspath(y_train_path) - summary["files"]["data"]["y_train"]["shape"] = y_train.shape + y_train_path = os.path.join(folder_name, "datasets", "y_train.csv") + summary["files"]["datasets"]["y_train"] = {} + summary["files"]["datasets"]["y_train"]["path"] = os.path.abspath(y_train_path) + summary["files"]["datasets"]["y_train"]["shape"] = y_train.shape y_train.to_csv(y_train_path, index=False) - X_test_path = os.path.join(folder_name, "data", "X_test.csv") - summary["files"]["data"]["X_test"] = {} - summary["files"]["data"]["X_test"]["path"] = os.path.abspath(X_test_path) - summary["files"]["data"]["X_test"]["shape"] = X_test.shape + X_test_path = os.path.join(folder_name, "datasets", "X_test.csv") + summary["files"]["datasets"]["X_test"] = {} + summary["files"]["datasets"]["X_test"]["path"] = os.path.abspath(X_test_path) + summary["files"]["datasets"]["X_test"]["shape"] = X_test.shape X_test.to_csv(X_test_path, index=False) - y_test_path = os.path.join(folder_name, "data", "y_test.csv") - summary["files"]["data"]["y_test"] = {} - summary["files"]["data"]["y_test"]["path"] = os.path.abspath(y_test_path) - summary["files"]["data"]["y_test"]["shape"] = y_test.shape + y_test_path = 
os.path.join(folder_name, "datasets", "y_test.csv") + summary["files"]["datasets"]["y_test"] = {} + summary["files"]["datasets"]["y_test"]["path"] = os.path.abspath(y_test_path) + summary["files"]["datasets"]["y_test"]["shape"] = y_test.shape y_test.to_csv(y_test_path, index=False) if zip_files: # Compress data and save - zip_path = os.path.join(folder_name, "data.zip") - summary["files"]["data"]["zip"] = {} - summary["files"]["data"]["zip"]["path"] = os.path.abspath(zip_path) + zip_path = os.path.join(folder_name, "datasets.zip") + summary["files"]["datasets"]["zip"] = {} + summary["files"]["datasets"]["zip"]["path"] = os.path.abspath(zip_path) shutil.make_archive( - zip_path.rstrip(".zip"), "zip", os.path.join(folder_name, "data") + zip_path.rstrip(".zip"), "zip", os.path.join(folder_name, "datasets") ) - shutil.rmtree(os.path.join(folder_name, "data")) + shutil.rmtree(os.path.join(folder_name, "datasets")) # Save json json_path = os.path.join(folder_name, "summary.json") @@ -271,3 +304,419 @@ def export_model( json.dump(summary, f, indent=4, ensure_ascii=False) return folder_name + + +def _find_saving_parameters_from_structure(experiment_folder): + """ + Find the saving parameters from the structure of the experiment folder. + :param experiment_folder: + :return: + """ + # Walk around the folder and find the files. + # Should return the following dictionary: + # { + # "save_datasets": True, if inside folder datasets.zip or data/ exist. + # "save_model": True, if inside folder model.zip or model/ exist. + # "zip_files": True, if inside folder model.zip or data.zip exist. + # } + if not os.path.exists(experiment_folder): + raise FileNotFoundError(f"The folder {experiment_folder} does not exist.") + + for root, dirs, files in os.walk(experiment_folder): + assert ( + "summary.json" in files + ), "The summary.json file is missing. Check if folder is a valid experiment folder." + # Filter possible new files in new versions of experiments. + files = [ + file + for file in files + if file in ["summary.json", "model.zip", "datasets.zip"] + ] + # Check if the files are in the root folder. + if "model.zip" in files or "datasets.zip" in files: + return { + "save_datasets": True if "datasets.zip" in files else False, + "save_model": True if "model.zip" in files else False, + "zip_files": True, + } + # Check if subfolders exist. + if "model" in dirs or "data" in dirs: + return { + "save_datasets": True if "datasets" in dirs else False, + "save_model": True if "model" in dirs else False, + "zip_files": False, + } + + +def _unzip_experiment_folder(experiment_path): + files = [ + file + for file in os.listdir(experiment_path) + if file in ["model.zip", "datasets.zip"] + ] + for file in files: + # Think of a better way to do this with shutil. 
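        # One caveat worth noting here: str.rstrip(".zip") strips any trailing '.', 'z', 'i'
        # or 'p' characters rather than the literal suffix. It happens to work for the
        # "model.zip" / "datasets.zip" names used in this module, but something like
        # file[:-len(".zip")] or pathlib.Path(file).stem would drop the extension safely
        # for arbitrary folder names.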
+ shutil.unpack_archive( + os.path.join(experiment_path, file), + os.path.join(experiment_path, file.rstrip(".zip")), + ) + os.remove(os.path.join(experiment_path, file)) + + +def _zip_experiment_folder(experiment_path): + unzipped_folders = [ + folder + for folder in os.listdir(experiment_path) + if folder in ["model", "datasets"] + ] + for folder in unzipped_folders: + shutil.make_archive( + os.path.join(experiment_path, folder), + "zip", + os.path.join(experiment_path, folder), + ) + shutil.rmtree(os.path.join(experiment_path, folder)) + + +class MLExperiment: + __VALID_MODELS = { + "sklearn": { + "regression": "sklearn.linear_model.LinearRegression", + "classification": "sklearn.linear_model.LogisticRegression", + }, + "lightgbm": { + "regression": "lightgbm.LGBMRegressor", + "classification": "lightgbm.LGBMClassifier", + }, + "catboost": { + "regression": "catboost.CatBoostRegressor", + "classification": "catboost.CatBoostClassifier", + }, + } + + def __init__( + self, + config=None, + X_train=None, + y_train=None, + X_test=None, + y_test=None, + model=None, + problem_type=None, + name=None, + description=None, + ): + if config: + self._config = config + self._name = self._config("name") + self._description = self._config("description") + self._problem_type = self._config("problem_type") + self._model = self._load_model_from_config(self._config) + self._X_train = pd.read_csv(self._config("X_train")) + self._y_train = pd.read_csv(self._config("y_train")) + self._X_test = pd.read_csv(self._config("X_test")) + self._y_test = pd.read_csv(self._config("y_test")) + else: + self._config = None + self._name = name + self._description = description + self._problem_type = ProblemType(problem_type) + self._model = model + assert X_train is not None, "X_train cannot be None." + self._X_train = X_train + assert y_train is not None, "y_train cannot be None." + self._y_train = y_train + assert X_test is not None, "X_test cannot be None." + self._X_test = X_test + assert y_test is not None, "y_test cannot be None." + self._y_test = y_test + + self._metrics = None + self._column_transformer = None + self._base_model_library = None + self._base_model = None + self._num_preprocessors = None + self._num_features = None + self._cat_preprocessors = None + self._cat_features = None + self._model_params = None + + @property + def name(self): + return self._name + + @name.setter + def name(self, value): + self._name = value + + @property + def description(self): + return self._description + + @description.setter + def description(self, value): + self._description = value + + @property + def model(self): + return self._model + + @model.setter + def model(self, value): + self._model = value + + @property + def X_train(self): + return self._X_train + + @X_train.setter + def X_train(self, value): + self._X_train = value + + @property + def y_train(self): + return self._y_train + + @y_train.setter + def y_train(self, value): + self._y_train = value + + @property + def X_test(self): + return self._X_test + + @X_test.setter + def X_test(self, value): + self._X_test = value + + @property + def y_test(self): + return self._y_test + + @y_test.setter + def y_test(self, value): + self._y_test = value + + @property + def metrics(self): + if self._metrics is None: + logging.warning("Metrics have not been calculated yet. 
Calculating now.") + if self._problem_type == ProblemType.REGRESSION: + self._metrics = generate_metrics_regression( + self._y_test, self._model.predict(self._X_test) + ) + elif self._problem_type == ProblemType.CLASSIFICATION: + self._metrics = generate_metrics_classification( + self._y_test, self._model.predict(self._X_test) + ) + return self._metrics + + @metrics.setter + def metrics(self, value): + self._metrics = value + + def _check_model_is_fitted(self): + """ + Check if the model is fitted. + :return: + """ + pass + + def _get_model_from_string(self, model_string): + """ + Get the model from the string. + :param model_string: + :return: + """ + pass + + def _load_model_from_config(self, config): + """ + Load the model from the config. + :param config: + :return: + """ + self.base_model_library = ModelLibrary( + config("model_library") + ) # This should be a string. + self.base_model = config( + "model" + ) # This should be a class string equal to the class name. + self.base_model = None + + # This would be strings and we need to somehow convert them to the actual objects. + self.num_preprocessors = config( + "numeric_preprocessors" + ) # This should be a dictionary of classes. + self.num_features = config("numeric_features") # This should be a list. + self.cat_preprocessors = config( + "categorical_preprocessors" + ) # This should be a list. + self.cat_features = config("categorical_features") # This should be a list. + self.model_params = config("model_params") # This should be a dictionary. + + # Create Pipeline from sklearn. + # Create the numeric pipeline. + if self.base_model_library == ModelLibrary.SCIKIT_LEARN: + self._column_transformer = ColumnTransformer( + transformers=[ + ( + "numeric_pipeline", + Pipeline(steps=[]), + self.num_features, + ), + ( + "categorical_pipeline", + Pipeline(steps=[]), + self.cat_features, + ), + ] + ) + + return Pipeline( + steps=[ + ("column_transformer", self._column_transformer), + ("model", self.base_model(**self.model_params)), + ] + ) + elif self.base_model_library == ModelLibrary.LIGHTGBM: + pass + elif self.base_model_library == ModelLibrary.CATBOOST: + pass + else: + raise ValueError(f"{self.base_model_library} is not a valid model library.") + + def register_experiment( + self, base_path, save_model=True, save_datasets=True, zip_files=True + ): + """ + Register the experiment and save it as a zip file. + :param base_path: + :param save_model: + :param save_datasets: + :param zip_files: + :return: + """ + return export_model( + self.model, + self.X_train, + self.y_train, + self.X_test, + self.y_test, + base_path=base_path, + custom_folder_name=self.name, + save_model=save_model, + save_datasets=save_datasets, + zip_files=zip_files, + ) + + @classmethod + def from_registered_experiment(cls, experiment_path): + """ + Load the experiment from a registered experiment. + :param experiment_path: + :return: + """ + # Read files in the folder and load them. 
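+        # summary.json provides the experiment metadata; the pickled model and the dataset CSVs are read from their subfolders.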
+ # Get saving params + saving_params = _find_saving_parameters_from_structure(experiment_path) + try: + if saving_params["zip_files"]: + _unzip_experiment_folder(experiment_path) + with open(os.path.join(experiment_path, "summary.json"), "r") as f: + summary = json.load(f) + # Set params + with open(os.path.join(experiment_path, "model", "model.pkl"), "rb") as f: + model = pickle.load(f) + if saving_params["save_datasets"]: + X_train = pd.read_csv( + os.path.join(experiment_path, "datasets", "X_train.csv") + ) + y_train = pd.read_csv( + os.path.join(experiment_path, "datasets", "y_train.csv") + ) + X_test = pd.read_csv( + os.path.join(experiment_path, "datasets", "X_test.csv") + ) + y_test = pd.read_csv( + os.path.join(experiment_path, "datasets", "y_test.csv") + ) + else: + X_train = None + y_train = None + X_test = None + y_test = None + + experiment = cls( + name=summary["model"]["name"], + description=summary["model"].get("description", ""), + problem_type=ProblemType(summary["model"]["problem_type"]), + model=model, + X_train=X_train, + y_train=y_train, + X_test=X_test, + y_test=y_test, + ) + experiment.metrics = summary["results"] + except Exception as e: + raise e + finally: + # Raise exception but make sure we zip the folder again if it was unzipped. + if saving_params["zip_files"]: + # Zip the folder again. + _zip_experiment_folder(experiment_path) + return experiment + + +class MLTracker: + def __init__(self, experiment_folder): + self._experiment_folder = experiment_folder + self._experiments = {} + + @property + def experiment_folder(self): + return self._experiment_folder + + @property + def experiments(self): + return self._experiments + + def scan_for_experiments(self): + """ + Scan the experiment folder for experiments. + :return: + """ + for experiments_folders in os.listdir(self.experiment_folder): + if os.path.isdir(os.path.join(self.experiment_folder, experiments_folders)): + try: + exp = MLExperiment.from_registered_experiment( + os.path.join(self.experiment_folder, experiments_folders) + ) + if not experiments_folders in self._experiments: + self._experiments[experiments_folders] = exp + else: + logging.warning( + f"Experiment {experiments_folders} already exists in the tracker. Skipping." + ) + except Exception as e: + logging.error(f"Could not load experiment {experiments_folders}.") + logging.error(e, exc_info=True) + logging.info(f"Found {len(self._experiments)} experiments.") + + def add_experiment(self, exp, register=True): + """ + Add an experiment to the tracker. + :param exp: + :return: + """ + # Make sure exp.name is not in self._experiments. + if exp.name in self._experiments or exp.name in os.listdir(self.experiment_folder): + logging.warning("Experiment name already exists. Creating with suffix.") + for i in range(1, 1000): + if f"{exp.name} ({i})" not in self._experiments and f"{exp.name} ({i})" not in os.listdir(self.experiment_folder): + exp.name = f"{exp.name} ({i})" + break + self._experiments[exp.name] = exp + logging.info(f"Added experiment {exp.name} to the tracker. 
Current experiments: {len(self._experiments)}.") + if register: + exp.register_experiment(self.experiment_folder) diff --git a/mango/tests/models_module/test_experiment_tracking.py b/mango/tests/models_module/test_experiment_tracking.py index ef011364..1cde2058 100644 --- a/mango/tests/models_module/test_experiment_tracking.py +++ b/mango/tests/models_module/test_experiment_tracking.py @@ -82,7 +82,7 @@ def _check_model_with_zip(self, output_folder): """ # Assert zip files are saved self.assertTrue(os.path.exists(os.path.join(output_folder, "model.zip"))) - self.assertTrue(os.path.exists(os.path.join(output_folder, "data.zip"))) + self.assertTrue(os.path.exists(os.path.join(output_folder, "datasets.zip"))) # Assert files are saved correctly self.assertTrue(os.path.exists(os.path.join(output_folder, "summary.json"))) @@ -95,21 +95,21 @@ def _check_model_with_zip(self, output_folder): os.path.exists(os.path.join(output_folder, "model", "hyperparameters.json")) ) self.assertFalse( - os.path.exists(os.path.join(output_folder, "data", "X_train.csv")) + os.path.exists(os.path.join(output_folder, "datasets", "X_train.csv")) ) self.assertFalse( - os.path.exists(os.path.join(output_folder, "data", "y_train.csv")) + os.path.exists(os.path.join(output_folder, "datasets", "y_train.csv")) ) self.assertFalse( - os.path.exists(os.path.join(output_folder, "data", "X_test.csv")) + os.path.exists(os.path.join(output_folder, "datasets", "X_test.csv")) ) self.assertFalse( - os.path.exists(os.path.join(output_folder, "data", "y_test.csv")) + os.path.exists(os.path.join(output_folder, "datasets", "y_test.csv")) ) # Assert subfolder not saved self.assertFalse(os.path.exists(os.path.join(output_folder, "model"))) - self.assertFalse(os.path.exists(os.path.join(output_folder, "data"))) + self.assertFalse(os.path.exists(os.path.join(output_folder, "datasets"))) def _check_model_without_zip(self, model, output_folder, problem_type): """ @@ -117,7 +117,7 @@ def _check_model_without_zip(self, model, output_folder, problem_type): """ # Assert folders are saved correctly self.assertTrue(os.path.exists(os.path.join(output_folder, "model"))) - self.assertTrue(os.path.exists(os.path.join(output_folder, "data"))) + self.assertTrue(os.path.exists(os.path.join(output_folder, "datasets"))) # Assert files are saved correctly self.assertTrue(os.path.exists(os.path.join(output_folder, "summary.json"))) self.assertTrue( @@ -127,25 +127,25 @@ def _check_model_without_zip(self, model, output_folder, problem_type): os.path.exists(os.path.join(output_folder, "model", "model.pkl")) ) self.assertTrue( - os.path.exists(os.path.join(output_folder, "data", "X_train.csv")) + os.path.exists(os.path.join(output_folder, "datasets", "X_train.csv")) ) self.assertTrue( - os.path.exists(os.path.join(output_folder, "data", "y_train.csv")) + os.path.exists(os.path.join(output_folder, "datasets", "y_train.csv")) ) self.assertTrue( - os.path.exists(os.path.join(output_folder, "data", "X_test.csv")) + os.path.exists(os.path.join(output_folder, "datasets", "X_test.csv")) ) self.assertTrue( - os.path.exists(os.path.join(output_folder, "data", "y_test.csv")) + os.path.exists(os.path.join(output_folder, "datasets", "y_test.csv")) ) # Assert zip files are not saved self.assertFalse(os.path.exists(os.path.join(output_folder, "model.zip"))) - self.assertFalse(os.path.exists(os.path.join(output_folder, "data.zip"))) + self.assertFalse(os.path.exists(os.path.join(output_folder, "datasets.zip"))) # Assert files are valid for data folder - X_train = 
pd.read_csv(os.path.join(output_folder, "data", "X_train.csv")) - y_train = pd.read_csv(os.path.join(output_folder, "data", "y_train.csv")).values - X_test = pd.read_csv(os.path.join(output_folder, "data", "X_test.csv")) - y_test = pd.read_csv(os.path.join(output_folder, "data", "y_test.csv")).values + X_train = pd.read_csv(os.path.join(output_folder, "datasets", "X_train.csv")) + y_train = pd.read_csv(os.path.join(output_folder, "datasets", "y_train.csv")).values + X_test = pd.read_csv(os.path.join(output_folder, "datasets", "X_test.csv")) + y_test = pd.read_csv(os.path.join(output_folder, "datasets", "y_test.csv")).values if problem_type == ProblemType.CLASSIFICATION: assert_frame_equal(X_train, self.X_train_clf) self.assertListEqual(list([y for y in y_train.reshape(-1)]), list([y for y in self.y_train_clf.values])) From fc65ba91f402b8ca470db4051fe870cb4e3c08d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?AntonioGonz=C3=A1lez?= Date: Tue, 23 Jan 2024 10:49:10 +0100 Subject: [PATCH 07/14] Added docs. Fix small errors in MLExperiment --- docs/source/dev/models.rst | 43 +- docs/source/experiment_tracking.rst | 116 ++ docs/source/index.rst | 1 + mango/models/__init__.py | 1 + mango/models/enums.py | 28 + mango/models/experiment_tracking.py | 1369 ++++++++++--- mango/models/metrics.py | 10 +- .../models_module/test_experiment_tracking.py | 1825 ++++++++++++++++- 8 files changed, 3059 insertions(+), 334 deletions(-) create mode 100644 docs/source/experiment_tracking.rst create mode 100644 mango/models/enums.py diff --git a/docs/source/dev/models.rst b/docs/source/dev/models.rst index 2b29543d..62fc76b0 100644 --- a/docs/source/dev/models.rst +++ b/docs/source/dev/models.rst @@ -70,12 +70,46 @@ As a part of mango we have implemented some metrics that are used to evaluate th .. automodule:: mango.models.metrics -Model tracking -~~~~~~~~~~~~~~~ +Enumerations +~~~~~~~~~~~~ + +The enumerations are used to define the type of problem and the type of model. + +.. automodule:: mango.models.enums + +Experiment tracking +~~~~~~~~~~~~~~~~~~~~ + +During the training of the models, the user may develop many models and it is important to keep track of the results. +For this purpose, we have implemented several classes that can be used to keep track of the experiments. The classes +are implemented in the following module. + +The main class is the MLExperiment class. This class is used to keep track of the results of the experiments. The +MLExperiment class is used to save the results of the experiments in a folder structure and provides some methods to +analyze the results. + +.. autoclass:: mango.models.experiment_tracking.MLExperiment + :members: + :undoc-members: + :private-members: + :show-inheritance: + +MLTracker is a class that can be used to keep track of the experiments. It is a simple manager that uses the folder +where all the experiments are saved. It provides some methods to analyze the results and compare the experiments. + +.. autoclass:: mango.models.experiment_tracking.MLTracker + :members: + :undoc-members: + :private-members: + :show-inheritance: + -The following fucntion attempts to export the model and the data used to train it. The model is saved as a pickle file and the data is saved as csv files. The function also saves a summary of the model in a json file. This way many models (experiments) can be saved in the same folder and the user can easily compare them. 
+In case does not want to use the MLExperiment class, the user can use the following function to save the results of the +trained model into a folder structure. The model is saved as a pickle file and the +data is saved as csv files. The function also saves a summary of the model in a json file. This way many models +(experiments) can be saved in the same folder and the user can easily compare them. -.. autofunction:: mango.models.experiment_tracking.export_model +.. autofunction:: mango.models.export_model The subfolder structure after running export_model is the following: @@ -175,3 +209,4 @@ If save_dataset is set to True, the JSON file will also contain the following: } } +Model experiments \ No newline at end of file diff --git a/docs/source/experiment_tracking.rst b/docs/source/experiment_tracking.rst new file mode 100644 index 00000000..09781dca --- /dev/null +++ b/docs/source/experiment_tracking.rst @@ -0,0 +1,116 @@ +Experiment Tracking +------------------- + +This section describes how to use the experiment tracking system. + +We will use the california housing dataset from sklearn as an example. + +.. code-block:: python + + from sklearn.datasets import fetch_california_housing + X, y = fetch_california_housing(return_X_y=True, as_frame=True) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + +Now we will create a simple pipeline to train a linear regression model and wrap it in an instance of :class:`MLExperiment` + +.. code-block:: python + + from sklearn.linear_model import LinearRegression + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import StandardScaler + from mango.models import MLExperiment + pipeline = Pipeline([ + ('scaler', StandardScaler()), + ('regressor', LinearRegression()) + ]) + + pipeline.fit(X_train, y_train) + experiment = MLExperiment( + model=pipeline, + name='California Housing LinearRegression', + description='LinearRegression on California Housing dataset', + problem_type='regression', + X_train=X_train, + X_test=X_test, + y_train=y_train, + y_test=y_test + ) + +Once the model is wrapped several metrics are pre-computed and stored in the experiment object. + +.. code-block:: python + + print(experiment.metrics) + + { + "train_score":{ + "r2_score":0.606, + "mean_squared_error":0.524, + "mean_absolute_error":0.524, + "median_absolute_error":0.524, + "explained_variance_score":0.606 + }, + "test_score":{ + "r2_score":0.606, + "mean_squared_error":0.524, + "mean_absolute_error":0.524, + "median_absolute_error":0.524, + "explained_variance_score":0.606 + } + } + +This experiment can be registered with the experiment tracking system by calling the :meth:`register` method. + +.. code-block:: python + + experiments_folder = "/home/user/experiments" + experiment.register_experiment(experiments_folder) + + +The experiment is now registered and can be viewed in the experiment tracking system. + +The tracking system is used in python with :class:`MLTracker`. + +.. code-block:: python + + from mango.models import MLTracker + tracker = MLTracker(experiments_folder) + traker.scan_for_experiments(experiment_folder) + +If we now create another experiment using a RandomForestRegressor, we can register it with the tracking system and view it. Now we will show another +way of adding the experiment to the tracking system. We will use the :meth:`add_experiment` method. +that adds the experiment to the tracking system and also registers (saves into a subfolder) it for future use. + +.. 
code-block:: python + + from sklearn.ensemble import RandomForestRegressor + pipeline = Pipeline([ + ('scaler', StandardScaler()), + ('regressor', RandomForestRegressor()) + ]) + + pipeline.fit(X_train, y_train) + experiment = MLExperiment( + model=pipeline, + name='California Housing RandomForestRegressor', + description='RandomForestRegressor on California Housing dataset', + problem_type='regression', + X_train=X_train, + X_test=X_test, + y_train=y_train, + y_test=y_test + ) + tracker.add_experiment(experiment, experiments_folder) + + +Once we added different experiments to the tracking system we can use the :meth:`create_compare_df` +to create a dataframe that compares the different experiments and shows their metrics. + +.. code-block:: python + + tracker.create_compare_df() + +For more information about other methods and usages go to :class:`MLTracker`. + +.. note:: + This module is still under development and some of the features described in this documentation may not be implemented yet. If you find any bug or have any suggestion, please, open an issue in the `GitHub repository `_. \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index cca1bbc9..710c1b9d 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -12,6 +12,7 @@ Welcome to mango's documentation! readme changelog + experiment_tracking genetic/index dev/index bib diff --git a/mango/models/__init__.py b/mango/models/__init__.py index 12ca5522..565ae9c0 100644 --- a/mango/models/__init__.py +++ b/mango/models/__init__.py @@ -1,2 +1,3 @@ from .neural_networks import calculate_network_output from .activations import sigmoid, tanh +from .experiment_tracking import MLExperiment, MLTracker, export_model diff --git a/mango/models/enums.py b/mango/models/enums.py new file mode 100644 index 00000000..cbf2fc1e --- /dev/null +++ b/mango/models/enums.py @@ -0,0 +1,28 @@ +from enum import Enum + + +class ProblemType(Enum): + """ + Enum to represent the problem type. + """ + + REGRESSION = "regression" + CLASSIFICATION = "classification" + + # When creating a new one convert to lowercase + @classmethod + def _missing_(cls, value: str): + for member in cls: + if member.value.lower() == value.lower(): + return member + return super()._missing_(value) + + +class ModelLibrary(Enum): + """ + Enum to represent the model library. + """ + + SCIKIT_LEARN = "scikit-learn" + CATBOOST = "catboost" + LIGHTGBM = "lightgbm" diff --git a/mango/models/experiment_tracking.py b/mango/models/experiment_tracking.py index 15a633a6..cc316f25 100644 --- a/mango/models/experiment_tracking.py +++ b/mango/models/experiment_tracking.py @@ -1,57 +1,45 @@ import json import logging - import os import pickle import shutil from datetime import datetime -from enum import Enum -from typing import Any +from typing import Any, Optional, Union, Tuple import pandas as pd -from sklearn.compose import ColumnTransformer -from sklearn.pipeline import Pipeline +from matplotlib import pyplot as plt +from .enums import ProblemType, ModelLibrary from .metrics import ( generate_metrics_regression, generate_metrics_classification, ) +from mango.config import BaseConfig +from pandas.testing import assert_frame_equal, assert_series_equal -class ProblemType(Enum): - """ - Basic enum to represent the problem type. 
- """ - REGRESSION = "regression" - CLASSIFICATION = "classification" +class _DummyPipeline: + pass - # When creating a new one convert to lowercase - @classmethod - def _missing_(cls, value: str): - for member in cls: - if member.value.lower() == value.lower(): - return member - return super()._missing_(value) +class _DummyLinearRegression: + pass -class ModelLibrary(Enum): - """ - Basic enum to represent the model library. - """ - SCIKIT_LEARN = "scikit-learn" - CATBOOST = "catboost" - LIGHTGBM = "lightgbm" +class _DummyLogisticRegression: + pass + -def _json_serializable(value): +def _json_serializable(value: Any) -> bool: try: json.dumps(value) return True except (TypeError, OverflowError): return False -def _clean_hyperparameters(hyperparameters): + +def _clean_hyperparameters(hyperparameters: dict) -> dict: for key, value in hyperparameters.items(): if isinstance(value, dict): _clean_hyperparameters(value) @@ -59,6 +47,7 @@ def _clean_hyperparameters(hyperparameters): hyperparameters[key] = str(value) return hyperparameters + def export_model( model: Any, X_train: pd.DataFrame, @@ -66,14 +55,16 @@ def export_model( X_test: pd.DataFrame, y_test: pd.Series, base_path: str, + custom_metrics: dict = None, description: str = None, - custom_folder_name: str = None, + base_folder_name: str = None, save_model: bool = True, save_datasets: bool = False, zip_files: bool = True, -): +) -> str: """ Register model and metrics in a json file and save the model and datasets in a folder. + :param model: A model from one of the supported libraries. :type model: :class:`Any` :param X_train: Training data as a pandas dataframe. @@ -88,8 +79,8 @@ def export_model( :type description: :class:`str` :param base_path: Path to the base folder where the model and datasets will be saved in a subfolder structure. :type base_path: :class:`str` - :param custom_folder_name: Custom name for the folder where the model and datasets will be saved. - :type custom_folder_name: :class:`str` + :param base_folder_name: Custom name for the folder where the model and datasets will be saved. + :type base_folder_name: :class:`str` :param zip_files: Whether to zip the files or not. :type zip_files: :class:`bool` :param save_datasets: Whether to save the datasets or not. @@ -98,6 +89,7 @@ def export_model( :type save_model: :class:`bool` :return: The path to the subfolder inside base_path where the model and datasets have been saved. 
:rtype: :class:`str` + Usage ----- >>> from sklearn.datasets import fetch_california_housing @@ -113,10 +105,13 @@ def export_model( _SUPPORTED_LIBRARIES_CLASSES = {} try: from sklearn.base import BaseEstimator + from sklearn.pipeline import Pipeline + + pipeline_class = Pipeline _SUPPORTED_LIBRARIES_CLASSES[ModelLibrary.SCIKIT_LEARN] = BaseEstimator except ImportError: - pass + pipeline_class = _DummyPipeline try: from catboost import CatBoost @@ -132,7 +127,16 @@ def export_model( if not os.path.exists(base_path): raise FileNotFoundError(f"Folder {base_path} does not exist.") + model_name = model.__class__.__name__ + + if isinstance(model, pipeline_class): + pipeline = model + col_transformer = model[0] + model = model[-1] + else: + pipeline = None + col_transformer = None model_library = None for library, class_name in _SUPPORTED_LIBRARIES_CLASSES.items(): if isinstance(model, class_name): @@ -149,20 +153,35 @@ def export_model( extra_params = [] # Fill structure summary["description"] = description + summary["name"] = base_folder_name or model_name + summary["training_date"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S") summary["model"] = {} summary["model"]["name"] = model_name summary["model"]["problem_type"] = problem_type.value summary["model"]["target"] = y_train.name summary["model"]["library"] = model_library.value if model_library == ModelLibrary.CATBOOST: - summary["model"]["input"] = list(model.feature_names_) - summary["model"]["hyperparameters"] = model.get_all_params() + if pipeline is not None: + summary["model"]["input"] = list(col_transformer.get_feature_names_out()) + summary["model"]["hyperparameters"] = pipeline.get_params(deep=True) + else: + summary["model"]["hyperparameters"] = model.get_all_params() + summary["model"]["input"] = list(model.feature_names_) + elif model_library == ModelLibrary.SCIKIT_LEARN: - summary["model"]["input"] = list(model.feature_names_in_) - summary["model"]["hyperparameters"] = model.get_params(deep=True) + if pipeline is not None: + summary["model"]["input"] = list(col_transformer.get_feature_names_out()) + summary["model"]["hyperparameters"] = pipeline.get_params(deep=True) + else: + summary["model"]["input"] = list(model.feature_names_in_) + summary["model"]["hyperparameters"] = model.get_params(deep=True) elif model_library == ModelLibrary.LIGHTGBM: - summary["model"]["input"] = list(model.feature_name_) - summary["model"]["hyperparameters"] = model.get_params(deep=True) + if pipeline is not None: + summary["model"]["input"] = list(col_transformer.get_feature_names_out()) + summary["model"]["hyperparameters"] = pipeline.get_params(deep=True) + else: + summary["model"]["input"] = list(model.feature_name_) + summary["model"]["hyperparameters"] = model.get_params(deep=True) # Clean hyperparameters for the sklearn pipeline or other non-serializable objects _clean_hyperparameters(summary["model"]["hyperparameters"]) @@ -197,6 +216,10 @@ def export_model( ] } + # Restore pipeline to model variable + if pipeline: + model = pipeline + # Generate metrics if model_library == ModelLibrary.CATBOOST: y_train_pred = pd.Series(model.predict(X_train).reshape(-1)).reset_index( @@ -210,14 +233,17 @@ def export_model( y_test_pred = pd.Series(model.predict(X_test)).reset_index(drop=True) if problem_type == ProblemType.CLASSIFICATION: - summary["results"] = { - "train_score": generate_metrics_classification( - y_train.reset_index(drop=True), y_train_pred - ), - "test_score": generate_metrics_classification( - y_test.reset_index(drop=True), y_test_pred - 
), - } + if not custom_metrics: + summary["results"] = { + "train_score": generate_metrics_classification( + y_train.reset_index(drop=True), y_train_pred + ), + "test_score": generate_metrics_classification( + y_test.reset_index(drop=True), y_test_pred + ), + } + else: + summary["results"] = custom_metrics elif problem_type == ProblemType.REGRESSION: summary["results"] = { "train_score": generate_metrics_regression( @@ -229,9 +255,13 @@ def export_model( } # Prepare environment to save files - folder_name_default = f"experiment_{model_name}_{datetime.now().strftime('%Y%m%d-%H%M%S')}" - folder_name = custom_folder_name or folder_name_default - folder_name = os.path.join(base_path, folder_name) + folder_name_default = ( + f"{datetime.now().strftime('%Y%m%d-%H%M%S')}_experiment_{model_name}" + ) + folder_name = base_folder_name or folder_name_default + folder_name = os.path.join( + base_path, f"{datetime.now().strftime('%Y%m%d-%H%M%S')}_{folder_name}" + ) # Compress model and save if save_model: @@ -306,143 +336,161 @@ def export_model( return folder_name -def _find_saving_parameters_from_structure(experiment_folder): - """ - Find the saving parameters from the structure of the experiment folder. - :param experiment_folder: - :return: +class MLExperiment: """ - # Walk around the folder and find the files. - # Should return the following dictionary: - # { - # "save_datasets": True, if inside folder datasets.zip or data/ exist. - # "save_model": True, if inside folder model.zip or model/ exist. - # "zip_files": True, if inside folder model.zip or data.zip exist. - # } - if not os.path.exists(experiment_folder): - raise FileNotFoundError(f"The folder {experiment_folder} does not exist.") - - for root, dirs, files in os.walk(experiment_folder): - assert ( - "summary.json" in files - ), "The summary.json file is missing. Check if folder is a valid experiment folder." - # Filter possible new files in new versions of experiments. - files = [ - file - for file in files - if file in ["summary.json", "model.zip", "datasets.zip"] - ] - # Check if the files are in the root folder. - if "model.zip" in files or "datasets.zip" in files: - return { - "save_datasets": True if "datasets.zip" in files else False, - "save_model": True if "model.zip" in files else False, - "zip_files": True, - } - # Check if subfolders exist. - if "model" in dirs or "data" in dirs: - return { - "save_datasets": True if "datasets" in dirs else False, - "save_model": True if "model" in dirs else False, - "zip_files": False, - } - + MLExperiment is a class that represents a machine learning experiment. It provides functionalities to initialize metrics, + get feature importance, plot ROC curve, plot precision recall curve, plot feature importance, register an experiment, + predict using the model, and load an experiment from a registered experiment. + + Currently, the following libraries are supported both for regression and classification problems: + - scikit-learn + - lightgbm + - catboost + + Attributes: + - **config:** Configuration for the experiment. (Not implemented yet) + - **X_train:** Training data. + - **y_train:** Training target. + - **X_test:** Test data. + - **y_test:** Test target. + - **model:** A model from one of the supported libraries. + - **problem_type:** Type of the problem (classification or regression). + - **name:** Name of the experiment. + - **description:** Description of the experiment. + + Methods: + - **get_feature_importance():** Returns the feature importance of the model. 
If linear model, returns the coefficients. + - **plot_roc_curve(show=False):** Plots the ROC curve of the experiment. If show is True, it displays the plot. + - **plot_precision_recall_curve(show=False):** Plots the precision recall curve of the experiment. If show is True, it displays the plot. + - **plot_feature_importance(show=False):** Plots the feature importance of the experiment. If show is True, it displays the plot. + - **register_experiment(base_path, save_model=True, save_datasets=True, zip_files=True):** Registers the experiment and saves it as a zip file. + - **from_registered_experiment(experiment_path):** Loads the experiment from a registered experiment. -def _unzip_experiment_folder(experiment_path): - files = [ - file - for file in os.listdir(experiment_path) - if file in ["model.zip", "datasets.zip"] - ] - for file in files: - # Think of a better way to do this with shutil. - shutil.unpack_archive( - os.path.join(experiment_path, file), - os.path.join(experiment_path, file.rstrip(".zip")), - ) - os.remove(os.path.join(experiment_path, file)) - - -def _zip_experiment_folder(experiment_path): - unzipped_folders = [ - folder - for folder in os.listdir(experiment_path) - if folder in ["model", "datasets"] - ] - for folder in unzipped_folders: - shutil.make_archive( - os.path.join(experiment_path, folder), - "zip", - os.path.join(experiment_path, folder), - ) - shutil.rmtree(os.path.join(experiment_path, folder)) - - -class MLExperiment: - __VALID_MODELS = { - "sklearn": { - "regression": "sklearn.linear_model.LinearRegression", - "classification": "sklearn.linear_model.LogisticRegression", - }, - "lightgbm": { - "regression": "lightgbm.LGBMRegressor", - "classification": "lightgbm.LGBMClassifier", - }, - "catboost": { - "regression": "catboost.CatBoostRegressor", - "classification": "catboost.CatBoostClassifier", - }, - } + Usage + ----- + >>> from sklearn.datasets import fetch_california_housing + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import train_test_split + >>> X, y = fetch_california_housing(return_X_y=True, as_frame=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y) + >>> model = LogisticRegression() + >>> model.fit(X_train, y_train) + >>> experiment = MLExperiment(model=model, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, problem_type=ProblemType.CLASSIFICATION, name="Logistic Regression Experiment", description="This is a logistic regression experiment.") + >>> experiment.plot_roc_curve(show=True) + >>> experiment.plot_precision_recall_curve(show=True) + >>> experiment.plot_feature_importance(show=True) + >>> experiment.register_experiment(base_path="/my_experiments_folder") + >>> loaded_experiment = MLExperiment.from_registered_experiment(experiment_path="/my_experiments_folder/Logistic Regression Experiment") + """ def __init__( self, - config=None, - X_train=None, - y_train=None, - X_test=None, - y_test=None, - model=None, - problem_type=None, - name=None, - description=None, + *, + config: BaseConfig = None, + X_train: Optional[pd.DataFrame] = None, + y_train: Optional[pd.Series] = None, + X_test: Optional[pd.DataFrame] = None, + y_test: Optional[pd.Series] = None, + model: Any = None, + problem_type: Union[str, ProblemType] = None, + name: str = None, + description: Optional[str] = None, ): + """ + Initializes an instance of the MLExperiment class. + + :param config: Configuration for the experiment. Not implemented yet. 
+ :type config: :class:`BaseConfig`, optional + :param X_train: Training data. + :type X_train: :class:`pd.DataFrame`, optional + :param y_train: Training target. + :type y_train: :class:`pd.Series`, optional + :param X_test: Test data. + :type X_test: :class:`pd.DataFrame`, optional + :param y_test: Test target. + :type y_test: :class:`pd.Series`, optional + :param model: A model from one of the supported libraries. + :type model: Any, optional + :param problem_type: Type of the problem (classification or regression). + :type problem_type: Union[str, ProblemType], optional + :param name: Name of the experiment. + :type name: str, optional + :param description: Description of the experiment. + :type description: str, optional + + :raises NotImplementedError: If the config parameter is provided, as it's not implemented yet. + """ + # For this version not implement config setup of the experiment if config: - self._config = config - self._name = self._config("name") - self._description = self._config("description") - self._problem_type = self._config("problem_type") - self._model = self._load_model_from_config(self._config) - self._X_train = pd.read_csv(self._config("X_train")) - self._y_train = pd.read_csv(self._config("y_train")) - self._X_test = pd.read_csv(self._config("X_test")) - self._y_test = pd.read_csv(self._config("y_test")) - else: - self._config = None - self._name = name - self._description = description - self._problem_type = ProblemType(problem_type) - self._model = model - assert X_train is not None, "X_train cannot be None." - self._X_train = X_train - assert y_train is not None, "y_train cannot be None." - self._y_train = y_train - assert X_test is not None, "X_test cannot be None." - self._X_test = X_test - assert y_test is not None, "y_test cannot be None." - self._y_test = y_test - - self._metrics = None - self._column_transformer = None - self._base_model_library = None - self._base_model = None - self._num_preprocessors = None - self._num_features = None - self._cat_preprocessors = None - self._cat_features = None - self._model_params = None + raise NotImplementedError("Config usage is not implemented yet.") + + # Search for supported libraries + self._search_for_supported_libraries() + + # Public properties (Not defined in the if config block) + self.name = name + self.description = description + self.problem_type = problem_type + self.model = model + self.base_model = None + self.num_classes = None + self.imbalance = None + self.metrics = None + self.best_threshold_roc_curve = self.best_threshold_pr_curve = 0.5 + self.base_model_library = None + + # Setup datasets + self.X_train = X_train + self.y_train = y_train + self.X_test = X_test + self.y_test = y_test + + if self.problem_type == ProblemType.CLASSIFICATION: + self.num_classes = len(self.y_test.unique()) + # Consider imbalance if for every 5 positive examples there are 25 negative examples. 
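+            # i.e. flag the dataset as imbalanced when the ratio between the two largest class counts falls below 0.2.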
+ self.imbalance = ( + self.y_train.value_counts().values[1] + / self.y_train.value_counts().values[0] + < 0.2 + ) + # Private properties + self._tpr_list = None + self._fpr_list = None + self._precision_list = None + self._recall_list = None + self._config = None + self._is_pipeline = isinstance(self.model, self.pipeline_class) + + # Final Setup + self._set_base_model_and_library() + self._set_datasets_dtypes() + self._init_metrics() + + def __eq__(self, other): + assert isinstance(other, MLExperiment), "Can only compare with MLExperiment" + assert_frame_equal(self.X_train, other.X_train, check_dtype=False) + assert_series_equal(self.y_train, other.y_train, check_dtype=False) + assert_frame_equal(self.X_test, other.X_test, check_dtype=False) + assert_series_equal(self.y_test, other.y_test, check_dtype=False) + return ( + self.name == other.name + and self.description == other.description + # and self.model == other.model # Cannot compare models + and self.metrics == other.metrics + and self.problem_type == other.problem_type + and self.num_classes == other.num_classes + # and self.base_model == other.base_model # Cannot compare models + and self.base_model_library == other.base_model_library + and self.imbalance == other.imbalance + ) + + # Properties @property - def name(self): + def name(self) -> str: + """ + Name of the experiment. + """ return self._name @name.setter @@ -450,15 +498,23 @@ def name(self, value): self._name = value @property - def description(self): + def description(self) -> str: + """ + Description of the experiment. + """ return self._description @description.setter def description(self, value): + if value is None: + logging.warning("Description is empty.") self._description = value @property - def model(self): + def model(self) -> Any: + """ + The full model from the supported libraries. + """ return self._model @model.setter @@ -466,190 +522,679 @@ def model(self, value): self._model = value @property - def X_train(self): + def X_train(self) -> pd.DataFrame: + """ + Training data. + """ return self._X_train @X_train.setter def X_train(self, value): + if value is None: + raise ValueError("X_train cannot be None.") self._X_train = value @property - def y_train(self): + def y_train(self) -> pd.Series: + """ + Training target. + """ return self._y_train @y_train.setter def y_train(self, value): + if value is None: + raise ValueError("y_train cannot be None.") + if isinstance(value, pd.DataFrame): + if value.shape[1] == 1: + value = value.iloc[:, 0] + else: + raise ValueError("y_train must be a pandas Series.") + if not isinstance(value, pd.Series): + raise ValueError("y_train must be a pandas Series.") self._y_train = value @property - def X_test(self): + def X_test(self) -> pd.DataFrame: + """ + Test data. + """ return self._X_test @X_test.setter def X_test(self, value): + if value is None: + raise ValueError("X_test cannot be None.") self._X_test = value @property - def y_test(self): + def y_test(self) -> pd.Series: + """ + Test target. + """ return self._y_test @y_test.setter def y_test(self, value): + if value is None: + raise ValueError("y_test cannot be None.") + if isinstance(value, pd.DataFrame): + if value.shape[1] == 1: + value = value.iloc[:, 0] + else: + raise ValueError("y_train must be a pandas Series.") + if not isinstance(value, pd.Series): + raise ValueError("y_train must be a pandas Series.") self._y_test = value @property - def metrics(self): - if self._metrics is None: - logging.warning("Metrics have not been calculated yet. 
Calculating now.") - if self._problem_type == ProblemType.REGRESSION: - self._metrics = generate_metrics_regression( - self._y_test, self._model.predict(self._X_test) - ) - elif self._problem_type == ProblemType.CLASSIFICATION: - self._metrics = generate_metrics_classification( - self._y_test, self._model.predict(self._X_test) - ) + def metrics(self) -> dict: + """ + Dictionary with the metrics of the experiment. + """ return self._metrics @metrics.setter def metrics(self, value): self._metrics = value - def _check_model_is_fitted(self): + @property + def problem_type(self) -> ProblemType: + """ + Type of the problem (classification or regression). + """ + return self._problem_type + + @problem_type.setter + def problem_type(self, value): + if value is None: + raise ValueError("problem_type cannot be None.") + # Check if is already an enum + if isinstance(value, ProblemType): + self._problem_type = value + else: + self._problem_type = ProblemType(value) + + @property + def num_classes(self) -> Optional[int]: + """ + Number of classes in the classification problem. If it's a regression problem, it's None. + """ + return self._num_classes + + @num_classes.setter + def num_classes(self, value): + self._num_classes = value + + @property + def base_model(self) -> Any: + """ + The base model from the supported libraries. If model is a pipeline, it's the last step of the pipeline, + otherwise it's the model. + """ + return self._base_model + + @base_model.setter + def base_model(self, value): + self._base_model = value + + @property + def base_model_library(self) -> ModelLibrary: + """ + The library of the base model. + """ + return self._base_model_library + + @base_model_library.setter + def base_model_library(self, value): + self._base_model_library = value + + @property + def imbalance(self) -> Optional[bool]: + """ + Whether the problem is imbalanced or not. If it's a regression problem, it's None. + """ + return self._imbalance + + @imbalance.setter + def imbalance(self, value): + self._imbalance = value + + # Utility methods + def _search_for_supported_libraries(self): + """ + Search if libraries are installed and lazy import them. + """ + self._SUPPORTED_LIBRARIES_CLASSES = {} + try: + from sklearn.base import BaseEstimator + from sklearn.pipeline import Pipeline + from sklearn.linear_model import LogisticRegression, LinearRegression + + self.pipeline_class = Pipeline + self.sklearn_linear_regression_class = LinearRegression + self.sklearn_logistic_regression_class = LogisticRegression + + self._SUPPORTED_LIBRARIES_CLASSES[ModelLibrary.SCIKIT_LEARN] = BaseEstimator + except ImportError: + self.pipeline_class = _DummyPipeline + self.sklearn_linear_regression_class = _DummyLinearRegression + self.sklearn_logistic_regression_class = _DummyLogisticRegression + try: + from catboost import CatBoost + + self._SUPPORTED_LIBRARIES_CLASSES[ModelLibrary.CATBOOST] = CatBoost + except ImportError: + pass + try: + from lightgbm import LGBMModel + + self._SUPPORTED_LIBRARIES_CLASSES[ModelLibrary.LIGHTGBM] = LGBMModel + except ImportError: + pass + + def _set_datasets_dtypes(self): + """ + Set the datasets dtypes to the correct ones so that CatBoost works. 
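+        CatBoost expects categorical features as strings (or integers), so the columns listed in ``cat_features`` are cast to ``str``.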
+ """ + # Set X_train dtypes + if self.base_model_library == ModelLibrary.CATBOOST: + for col_idx in self.base_model.get_param("cat_features") or []: + self.X_train.iloc[:, col_idx] = self.X_train.iloc[:, col_idx].astype( + str + ) + self.X_test.iloc[:, col_idx] = self.X_test.iloc[:, col_idx].astype(str) + + def _generate_classification_metrics_with_threshold(self): + """ + Helper function to generate the classification metrics with different thresholds. + """ + self.metrics = {"train_score": {}, "test_score": {}} + if self.num_classes == 2: + y_pred_train = self.model.predict_proba(self.X_train)[:, 1] + y_pred_test = self.model.predict_proba(self.X_test)[:, 1] + for threshold in [i / 100 for i in range(1, 101)]: + self.metrics["train_score"][ + threshold + ] = generate_metrics_classification( + self.y_train, y_pred_train >= threshold + ) + self.metrics["test_score"][threshold] = generate_metrics_classification( + self.y_test, y_pred_test >= threshold + ) + else: + self.metrics = {} + y_pred_train = self.model.predict(self.X_train) + y_pred_test = self.model.predict(self.X_test) + self.metrics["train_score"] = generate_metrics_classification( + self.y_train, y_pred_train + ) + self.metrics["test_score"] = generate_metrics_classification( + self.y_test, y_pred_test + ) + + @staticmethod + def _find_saving_parameters_from_structure(experiment_folder: str) -> dict: + """ + Find the paramrters used to export the experiment from the structure of the experiment folder. + Walk around the folder and find the files. + Returns a dictionary with the following keys: + + - save_datasets: Whether the datasets were saved or not. + - save_model: Whether the model was saved or not. + - zip_files: Whether the files were zipped or not. + + :param experiment_folder: Path to the experiment folder. + :type experiment_folder: str + :return: A dictionary with the saving parameters. + """ + if not os.path.exists(experiment_folder): + raise FileNotFoundError(f"The folder {experiment_folder} does not exist.") + + for root, dirs, files in os.walk(experiment_folder): + assert ( + "summary.json" in files + ), "The summary.json file is missing. Check if folder is a valid experiment folder." + # Filter possible new files in new versions of experiments. + files = [ + file + for file in files + if file in ["summary.json", "model.zip", "datasets.zip"] + ] + # Check if the files are in the root folder. + if "model.zip" in files or "datasets.zip" in files: + return { + "save_datasets": True if "datasets.zip" in files else False, + "save_model": True if "model.zip" in files else False, + "zip_files": True, + } + # Check if subfolders exist. + if "model" in dirs or "data" in dirs: + return { + "save_datasets": True if "datasets" in dirs else False, + "save_model": True if "model" in dirs else False, + "zip_files": False, + } + + @staticmethod + def _unzip_experiment_folder(experiment_path: str): """ - Check if the model is fitted. + Unzip the experiment folder. + :param experiment_path: Path to the experiment folder. + :type experiment_path: str + :return: None + """ + files = [ + file + for file in os.listdir(experiment_path) + if file in ["model.zip", "datasets.zip"] + ] + for file in files: + # Think of a better way to do this with shutil. + shutil.unpack_archive( + os.path.join(experiment_path, file), + os.path.join(experiment_path, file.rstrip(".zip")), + ) + os.remove(os.path.join(experiment_path, file)) + + @staticmethod + def _zip_experiment_folder(experiment_path: str): + """ + Zip the experiment folder. 
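+        Used to restore the zipped on-disk layout after a temporary unzip, e.g. at the end of ``from_registered_experiment``.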
+ :param experiment_path: Path to the experiment folder. + :type experiment_path: str :return: """ - pass + unzipped_folders = [ + folder + for folder in os.listdir(experiment_path) + if folder in ["model", "datasets"] + ] + for folder in unzipped_folders: + shutil.make_archive( + os.path.join(experiment_path, folder), + "zip", + os.path.join(experiment_path, folder), + ) + shutil.rmtree(os.path.join(experiment_path, folder)) - def _get_model_from_string(self, model_string): + def _load_model_from_config(self): """ - Get the model from the string. - :param model_string: + Load the model from the config. :return: """ pass - def _load_model_from_config(self, config): + def _set_base_model_and_library(self): """ - Load the model from the config. - :param config: + Get the model library from the model or pipeline. + Sets the following attributes: + + - base_model + - base_model_library + + :return: None + """ + # Detect if pipeline or model + if self._is_pipeline: + # Get the last step + model = self.model[-1] + self.base_model = model + else: + model = self.model + self.base_model = model + + # Get the library + matching_libraries = [] + for library, class_name in self._SUPPORTED_LIBRARIES_CLASSES.items(): + if isinstance(model, class_name): + matching_libraries.append(library) + # Some models inherit from sklearn hence if len(matching_libraries) > 1 and sklearn is one of them pop it + if len(matching_libraries) == 1: + pass + elif ( + len(matching_libraries) == 2 + and ModelLibrary.SCIKIT_LEARN in matching_libraries + ): + matching_libraries.remove(ModelLibrary.SCIKIT_LEARN) + else: + raise ValueError( + f"Could not detect library or is not installed. Model name {model.__class__.__name__}" + ) + self.base_model_library = matching_libraries[0] + + def _calc_precision_recall_curve_data(self): + """ + Get the data to plot the precision recall curve. + Sets the following attributes: + + - _precision_list + - _recall_list + - best_threshold_pr_curve + :return: """ - self.base_model_library = ModelLibrary( - config("model_library") - ) # This should be a string. - self.base_model = config( - "model" - ) # This should be a class string equal to the class name. - self.base_model = None + if self.num_classes is not None and self.num_classes > 2: + raise NotImplementedError( + "Precision recall curve is only supported for binary classification." + ) + elif self.num_classes is None: + raise ValueError( + "Precision recall curve is only for classification problems" + ) + precision_list = [] + recall_list = [] + best_distance = 9999 + best_threshold = None + for threshold, metric in self.metrics["test_score"].items(): + precision = metric["precision"] + recall = metric["recall"] + precision_list.append(precision) + recall_list.append(recall) + distance = (precision - 1) ** 2 + (recall - 1) ** 2 + if distance <= best_distance: + best_distance = distance + best_threshold = threshold + self.best_threshold_pr_curve = best_threshold + self._precision_list = precision_list + self._recall_list = recall_list + + def _calc_roc_curve_data(self): + """ + Get the data to plot the roc curve. + Sets the following attributes: + + - _tpr_list + - _fpr_list + - best_threshold_roc_curve + + :return: + """ + if self.num_classes is not None and self.num_classes > 2: + raise NotImplementedError( + "ROC curve is only supported for binary classification." 
+ ) + elif self.num_classes is None: + raise ValueError("ROC curve is only for classification problems") + tpr_list = [] + fpr_list = [] + best_distance = 9999 + best_threshold = None + for threshold, metric in self.metrics["test_score"].items(): + (tn, fp), (fn, tp) = metric["confusion_matrix"] + tpr = tp / (tp + fn) + fpr = fp / (fp + tn) + tpr_list.append(tpr) + fpr_list.append(fpr) + distance = (tpr - 1) ** 2 + (fpr - 0) ** 2 + if distance <= best_distance: + best_distance = distance + best_threshold = threshold + self.best_threshold_roc_curve = best_threshold + self._tpr_list = tpr_list + self._fpr_list = fpr_list + + # Public methods + def _init_metrics(self): + """ + Initialize the metrics for the experiment. + Sets the following attributes: + + - metrics - # This would be strings and we need to somehow convert them to the actual objects. - self.num_preprocessors = config( - "numeric_preprocessors" - ) # This should be a dictionary of classes. - self.num_features = config("numeric_features") # This should be a list. - self.cat_preprocessors = config( - "categorical_preprocessors" - ) # This should be a list. - self.cat_features = config("categorical_features") # This should be a list. - self.model_params = config("model_params") # This should be a dictionary. - - # Create Pipeline from sklearn. - # Create the numeric pipeline. - if self.base_model_library == ModelLibrary.SCIKIT_LEARN: - self._column_transformer = ColumnTransformer( - transformers=[ - ( - "numeric_pipeline", - Pipeline(steps=[]), - self.num_features, - ), - ( - "categorical_pipeline", - Pipeline(steps=[]), - self.cat_features, - ), - ] + :return: + """ + if self.problem_type == ProblemType.REGRESSION: + self.metrics = generate_metrics_regression( + self.y_test, self.model.predict(self.X_test) ) + elif self.problem_type == ProblemType.CLASSIFICATION: + self._generate_classification_metrics_with_threshold() + if self.num_classes == 2: + self._calc_precision_recall_curve_data() + self._calc_roc_curve_data() - return Pipeline( - steps=[ - ("column_transformer", self._column_transformer), - ("model", self.base_model(**self.model_params)), - ] + def get_feature_importance(self) -> pd.Series: + """ + Get the feature importance of the model. In case of a linear model, it returns the coefficients. + :return: A pandas Series with the feature importance. + :rtype: :class:`pd.Series` + :raises NotImplementedError: If the model does not support feature importance. 
+ """ + + is_linear_model = isinstance( + self.base_model, + ( + self.sklearn_linear_regression_class, + self.sklearn_logistic_regression_class, + ), + ) + + if self._is_pipeline: + # Assume first step is the column transformer + feature_names = self.model[0].get_feature_names_out() + else: + feature_names = self.X_train.columns + + if is_linear_model: + # Linear model from sklearn + feature_importance = self.base_model.coef_[0] + return pd.Series(feature_importance, index=feature_names).sort_values( + ascending=False ) - elif self.base_model_library == ModelLibrary.LIGHTGBM: - pass - elif self.base_model_library == ModelLibrary.CATBOOST: - pass + + if hasattr(self.base_model, "feature_importances_"): + # Feature importance from model + feature_importance = self.base_model.feature_importances_ + return pd.Series(feature_importance, index=feature_names).sort_values( + ascending=False + ) + raise NotImplementedError( + f"Feature importance is not supported for model {self.base_model.__class__.__name__}" + ) + + def plot_roc_curve( + self, show: bool = False + ) -> Optional[Tuple[plt.Figure, plt.Axes]]: + """ + Plot the ROC curve. If show is True, it displays the plot. + :param show: Whether to display the plot or not. + :type show: bool, optional + :return: A tuple with the matplotlib Figure and Axes. + :rtype: Tuple[plt.Figure, plt.Axes] + :raises ValueError: If the problem is not classification. + :raises NotImplementedError: If the problem is not binary classification. + """ + if self.num_classes is None: + raise ValueError("ROC curve is only for classification problems") + elif self.num_classes > 2: + raise NotImplementedError( + "ROC curve is only supported for binary classification." + ) + fig, ax = plt.subplots(figsize=(15, 10)) + # Scatter and show cmap legend + thresholds = list(self.metrics["test_score"].keys()) + ax.scatter(self._fpr_list, self._tpr_list, c=thresholds, cmap="viridis") + ax.set_title(f"ROC Curve, best threshold {self.best_threshold_roc_curve:.2f}") + ax.set_xlabel("False Positive Rate") + ax.set_ylabel("True Positive Rate") + # Add circle around best threshold + best_threshold_idx = int( + self.best_threshold_roc_curve * 100 - 1 + ) # Due to how the thresholds are generated + ax.scatter( + self._fpr_list[best_threshold_idx], + self._tpr_list[best_threshold_idx], + s=100, + facecolors="none", + edgecolors="r", + ) + fig.add_axes(ax) + fig.colorbar(ax.collections[0], ax=ax) + fig.tight_layout() + if show: + plt.show() + else: + return fig, ax + + def plot_precision_recall_curve( + self, show=False + ) -> Optional[Tuple[plt.Figure, plt.Axes]]: + """ + Plot the precision recall curve. If show is True, it displays the plot. + :param show: Whether to display the plot or not. + :type show: bool, optional + :return: A tuple with the matplotlib Figure and Axes. + :rtype: Tuple[plt.Figure, plt.Axes] + :raises ValueError: If the problem is not classification. + :raises NotImplementedError: If the problem is not binary classification. + """ + if self.num_classes is None: + raise ValueError( + "Precision recall curve is only for classification problems" + ) + elif self.num_classes > 2: + raise NotImplementedError( + "Precision recall curve is only supported for binary classification." 
+ ) + fig, ax = plt.subplots(figsize=(15, 10)) + # Scatter and show cmap legend + thresholds = list(self.metrics["test_score"].keys()) + ax.scatter( + self._recall_list, self._precision_list, c=thresholds, cmap="viridis" + ) + ax.set_title( + f"Precision Recall Curve, best threshold {self.best_threshold_pr_curve:.2f}" + ) + ax.set_xlabel("Recall") + ax.set_ylabel("Precision") + # Add circle around best threshold + best_threshold_idx = int(self.best_threshold_pr_curve * 100 - 1) + ax.scatter( + self._recall_list[best_threshold_idx], + self._precision_list[best_threshold_idx], + s=100, + facecolors="none", + edgecolors="r", + ) + fig.add_axes(ax) + fig.colorbar(ax.collections[0], ax=ax) + fig.tight_layout() + if show: + plt.show() + else: + return fig, ax + + def plot_feature_importance( + self, show=False + ) -> Optional[Tuple[plt.Figure, plt.Axes]]: + """ + Plot the feature importance. If show is True, it displays the plot. + :param show: Whether to display the plot or not. + :type show: bool, optional + :return: A tuple with the matplotlib Figure and Axes. + :rtype: Tuple[plt.Figure, plt.Axes] + """ + importance = self.get_feature_importance() + fig, ax = plt.subplots(figsize=(20, 40)) + # Sort importance + importance = importance.sort_values(ascending=True) + ax.barh(importance.index, importance.values) + ax.set_title("Feature Importance") + ax.set_xlabel("Importance") + ax.set_ylabel("Feature") + # Tight layout + fig.tight_layout() + if show: + plt.show() else: - raise ValueError(f"{self.base_model_library} is not a valid model library.") + return fig, ax def register_experiment( - self, base_path, save_model=True, save_datasets=True, zip_files=True - ): + self, + base_path, + zip_files: bool = True, + ) -> str: """ - Register the experiment and save it as a zip file. - :param base_path: - :param save_model: - :param save_datasets: - :param zip_files: - :return: + Register the experiment and save it + :param base_path: Path to the folder where the experiment will be saved. + :type base_path: str + :param zip_files: Whether to zip the files or not. + :type zip_files: bool, optional + :return: The path to the experiment folder. + :rtype: str """ + custom_metrics = self.metrics + if self.num_classes == 2: + # Make sure is inserted at the beginning of the dictionary. + threshold = ( + self.best_threshold_pr_curve + if self.imbalance + else self.best_threshold_roc_curve + ) + custom_metrics = { + "best_threshold": { + "value": threshold, + "train_score": self.metrics["train_score"][threshold], + "test_score": self.metrics["test_score"][threshold], + }, + **custom_metrics, + } return export_model( self.model, self.X_train, self.y_train, self.X_test, self.y_test, + description=self.description, + custom_metrics=custom_metrics, base_path=base_path, - custom_folder_name=self.name, - save_model=save_model, - save_datasets=save_datasets, + base_folder_name=self.name, + save_model=True, + save_datasets=True, zip_files=zip_files, ) @classmethod - def from_registered_experiment(cls, experiment_path): + def from_registered_experiment(cls, experiment_path: str): """ Load the experiment from a registered experiment. - :param experiment_path: - :return: + :param experiment_path: Path to the experiment folder. + :return: An instance of MLExperiment. """ - # Read files in the folder and load them. 
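(For orientation, a minimal round-trip sketch of the two methods above; it is not part of the patch. The names model, X_train, y_train, X_test and y_test are assumed to be a fitted binary classifier and its splits, and "experiments" is an arbitrary output folder; only register_experiment and from_registered_experiment come from the code shown here.)

    # Sketch only: assumes a fitted classifier and its train/test splits.
    experiment = MLExperiment(
        model=model,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        problem_type="classification",
        name="demo_experiment",
        description="Round-trip example",
    )
    # Writes summary.json, the pickled model and the datasets under base_path,
    # prepending best_threshold to the metrics for binary problems, and returns
    # the experiment folder.
    experiment_path = experiment.register_experiment(base_path="experiments")
    # Reloads the model, datasets and metrics from that folder, re-zipping it
    # afterwards if it was stored zipped.
    restored = MLExperiment.from_registered_experiment(experiment_path=experiment_path)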
- # Get saving params - saving_params = _find_saving_parameters_from_structure(experiment_path) + saving_params = cls._find_saving_parameters_from_structure(experiment_path) + + # Try-except-finally to make sure we zip the folder again if it was unzipped and an exception is raised. try: if saving_params["zip_files"]: - _unzip_experiment_folder(experiment_path) + cls._unzip_experiment_folder(experiment_path) with open(os.path.join(experiment_path, "summary.json"), "r") as f: summary = json.load(f) # Set params with open(os.path.join(experiment_path, "model", "model.pkl"), "rb") as f: model = pickle.load(f) - if saving_params["save_datasets"]: - X_train = pd.read_csv( - os.path.join(experiment_path, "datasets", "X_train.csv") - ) - y_train = pd.read_csv( - os.path.join(experiment_path, "datasets", "y_train.csv") - ) - X_test = pd.read_csv( - os.path.join(experiment_path, "datasets", "X_test.csv") - ) - y_test = pd.read_csv( - os.path.join(experiment_path, "datasets", "y_test.csv") - ) - else: - X_train = None - y_train = None - X_test = None - y_test = None + + X_train = pd.read_csv( + os.path.join(experiment_path, "datasets", "X_train.csv"), + low_memory=False, + ) + y_train = pd.read_csv( + os.path.join(experiment_path, "datasets", "y_train.csv") + ) + X_test = pd.read_csv( + os.path.join(experiment_path, "datasets", "X_test.csv"), + low_memory=False, + ) + y_test = pd.read_csv( + os.path.join(experiment_path, "datasets", "y_test.csv") + ) + # Make sure is a pd.Series + y_train = y_train.iloc[:, 0] + y_test = y_test.iloc[:, 0] experiment = cls( - name=summary["model"]["name"], - description=summary["model"].get("description", ""), + name=summary.get("name", experiment_path.split("-", 1)[1].rstrip("/")), + description=summary.get("description", ""), problem_type=ProblemType(summary["model"]["problem_type"]), model=model, X_train=X_train, @@ -657,34 +1202,75 @@ def from_registered_experiment(cls, experiment_path): X_test=X_test, y_test=y_test, ) - experiment.metrics = summary["results"] except Exception as e: raise e finally: # Raise exception but make sure we zip the folder again if it was unzipped. if saving_params["zip_files"]: # Zip the folder again. - _zip_experiment_folder(experiment_path) + cls._zip_experiment_folder(experiment_path) return experiment class MLTracker: + """ + MLTracker is a class that manages multiple machine learning experiments. It provides functionalities to scan for + existing experiments, add new experiments, compare experiments, update experiment metrics, and generate comparison + dataframes and hyperparameters json. + + Attributes: + **experiment_folder:** The folder where the experiments are stored. + **experiments:** A dictionary of the experiments. + + Methods: + **scan_for_experiments():** Scans the experiment folder for existing experiments. + **add_experiment(exp: MLExperiment):** Adds a new experiment to the tracker. + **compare_experiments(experiments=None, show_plots=False):** Compares the experiments. + **update_experiments_metrics():** Only use to update old versions of experiments from MLExperiment. + **create_comparison_df(save=True):** Creates a comparison dataframe of the experiments. + **create_hyperparameters_json(save=True):** Creates a json file of the hyperparameters of the experiments. 
+ + Usage + ----- + >>> from mango.models.experiment_tracking import MLExperiment, MLTracker + >>> tracker = MLTracker(experiment_folder="/path/to/experiments") + >>> tracker.scan_for_experiments() + >>> experiment = MLExperiment.from_registered_experiment(experiment_path="/path/to/experiment") + >>> tracker.add_experiment(experiment) + >>> tracker.compare_experiments() + >>> tracker.create_comparison_df(save=True) + >>> tracker.create_hyperparameters_json(save=True) + """ + def __init__(self, experiment_folder): - self._experiment_folder = experiment_folder - self._experiments = {} + """ + Initializes an instance of the MLTracker class. + :param experiment_folder: The folder where the experiments are stored. + :type experiment_folder: str + """ + self.experiment_folder = experiment_folder + self.experiments = {} @property - def experiment_folder(self): + def experiment_folder(self) -> str: return self._experiment_folder + @experiment_folder.setter + def experiment_folder(self, value): + self._experiment_folder = value + @property - def experiments(self): + def experiments(self) -> dict: return self._experiments + @experiments.setter + def experiments(self, value): + self._experiments = value + def scan_for_experiments(self): """ - Scan the experiment folder for experiments. - :return: + Scan the experiment folder for experiments and load them. + :return: None """ for experiments_folders in os.listdir(self.experiment_folder): if os.path.isdir(os.path.join(self.experiment_folder, experiments_folders)): @@ -703,20 +1289,191 @@ def scan_for_experiments(self): logging.error(e, exc_info=True) logging.info(f"Found {len(self._experiments)} experiments.") - def add_experiment(self, exp, register=True): + def add_experiment(self, experiment: MLExperiment): """ Add an experiment to the tracker. - :param exp: - :return: + :param experiment: An instance of MLExperiment. + :type experiment: :class:`mango.models.experiment_tracking.MLExperiment` + :return: None """ + # Make sure exp.name is not in self._experiments. - if exp.name in self._experiments or exp.name in os.listdir(self.experiment_folder): - logging.warning("Experiment name already exists. Creating with suffix.") - for i in range(1, 1000): - if f"{exp.name} ({i})" not in self._experiments and f"{exp.name} ({i})" not in os.listdir(self.experiment_folder): - exp.name = f"{exp.name} ({i})" - break - self._experiments[exp.name] = exp - logging.info(f"Added experiment {exp.name} to the tracker. Current experiments: {len(self._experiments)}.") - if register: - exp.register_experiment(self.experiment_folder) + exp_folder_name = experiment.register_experiment(self.experiment_folder) + self._experiments[os.path.basename(exp_folder_name)] = experiment + logging.info( + f"Added experiment {exp_folder_name} to the tracker. Current experiments: {len(self._experiments)}." + ) + + def create_plots( + self, show_plots: bool = False + ) -> Optional[Tuple[plt.Figure, plt.Axes]]: + """ + Create plots for the experiments. In classification problems, it creates the ROC curve, precision recall curve, + and feature importance plots. + + In regression problems, it creates the feature importance plot only. + + :param show_plots: If True, it displays the plots. + :return: figures and axes of the plots if show_plots is False. 
+ """ + for experiment_name, experiment in self._experiments.items(): + if experiment.problem_type == ProblemType.CLASSIFICATION: + fig, ax = experiment.plot_roc_curve() + ax.set_title(experiment_name + "_" + ax.get_title()) + fig.savefig( + os.path.join( + self.experiment_folder, experiment_name, "roc_curve.png" + ) + ) + fig, ax = experiment.plot_precision_recall_curve() + ax.set_title(experiment_name + "_" + ax.get_title()) + fig.savefig( + os.path.join( + self.experiment_folder, experiment_name, "precision_recall.png" + ) + ) + if show_plots: + fig.show() + fig.close() + fig, ax = experiment.plot_feature_importance() + ax.set_title(experiment_name + "_" + ax.get_title()) + fig.savefig( + os.path.join( + self.experiment_folder, experiment_name, "feature_importance.png" + ) + ) + if show_plots: + fig.show() + fig.close() + + return None + + def update_experiments_metrics(self): + """ + Update the metrics of the experiments. Only use to update old versions of experiments. + """ + for experiment_name, experiment in self._experiments.items(): + # Make sure metrics are updated. + json_path = os.path.join( + self.experiment_folder, experiment_name, "summary.json" + ) + with open(json_path, "r") as f: + summary = json.load(f) + custom_metrics = experiment.metrics + if experiment.num_classes == 2: + # Make sure is inserted at the beginning of the dictionary. + threshold = ( + experiment.best_threshold_pr_curve + if experiment.imbalance + else experiment.best_threshold_roc_curve + ) + custom_metrics = { + "best_threshold": { + "value": threshold, + "train_score": experiment.metrics["train_score"][threshold], + "test_score": experiment.metrics["test_score"][threshold], + }, + **custom_metrics, + } + summary["results"] = custom_metrics + with open(json_path, "w") as f: + json.dump(summary, f, indent=4, ensure_ascii=False) + logging.info(f"Updated experiment {experiment_name}.") + + def create_comparison_df(self, save: bool = True) -> pd.DataFrame: + """ + Create a comparison dataframe. + :param save: If True, it saves the dataframe to an excel file or csv file if openpyxl is not installed. + :type save: bool, optional + :return: A pandas dataframe. + """ + row_index = [] + metrics_row = [] + for experiment_name, experiment in self.experiments.items(): + metrics = experiment.metrics + metadata = { + "experiment_name": experiment_name, + "description": experiment.description, + "date": pd.to_datetime(experiment_name.split("_")[0]), + } + if experiment.problem_type == ProblemType.CLASSIFICATION: + metrics = { + "train_score": metrics["train_score"][ + experiment.best_threshold_pr_curve + ], + "test_score": metrics["test_score"][ + experiment.best_threshold_pr_curve + ], + } + metadata["best_threshold"] = ( + experiment.best_threshold_pr_curve + if experiment.imbalance + else experiment.best_threshold_roc_curve + ) + else: + metrics = { + "train_score": metrics["train_score"], + "test_score": metrics["test_score"], + } + row_index.append( + {**metadata, **metrics}, + ) + metrics_row.append(metrics) + # Make a dataframe with multilevel column for the train and test scores which are dictionaries. + df = pd.DataFrame(row_index).drop(columns=["train_score", "test_score"]) + metrics_train = pd.DataFrame([row["train_score"] for row in metrics_row]) + metrics_test = pd.DataFrame([row["test_score"] for row in metrics_row]) + # Concatenate the dataframes in a way that one from train next from test and so on. 
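+ # For example, with train/test metric columns ["r2_score", "mean_absolute_error"],
+ # the loop below produces the column order r2_score(train), r2_score(test),
+ # mean_absolute_error(train), mean_absolute_error(test), which matches the
+ # MultiIndex of (metric, train/test) pairs assigned right after it.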
+ metrics = pd.DataFrame() + for col in metrics_train.columns: + metrics = pd.concat( + [metrics, metrics_train[col], metrics_test[col]], axis=1 + ).copy() + + metrics.columns = pd.MultiIndex.from_product( + [metrics_train.columns, ["train", "test"]] + ) + df = pd.concat([df, metrics], axis=1) + # Set multilevel index + df = df.set_index(["experiment_name", "description", "date", "best_threshold"]) + # df = df.reset_index() + # level_3 must be a subindex of train_score and test_score + if save: + try: + import openpyxl + + df.to_excel( + os.path.join(self.experiment_folder, "comparison.xlsx"), + index=True, + ) + except ImportError: + logging.warning( + "Could not import openpyxl. Saving to excel will not work. Will save to csv." + ) + df.to_csv( + os.path.join(self.experiment_folder, "comparison.csv"), index=True + ) + return df + + def create_hyperparameters_json(self, save: bool = True) -> dict: + """ + Create a json with the hyperparameters of the experiments. + :param save: If True, it saves the json to a file. + :type save: bool, optional + :return: + """ + hyperparameters = {} + for experiment_name, experiment in self.experiments.items(): + with open( + os.path.join(self.experiment_folder, experiment_name, "summary.json"), + "r", + ) as f: + summary = json.load(f) + hyperparameters[experiment_name] = summary["model"]["hyperparameters"] + if save: + with open( + os.path.join(self.experiment_folder, "hyperparameters_summary.json"), + "w", + ) as f: + json.dump(hyperparameters, f, indent=4, ensure_ascii=False) + return hyperparameters diff --git a/mango/models/metrics.py b/mango/models/metrics.py index 726e9536..1f61a807 100644 --- a/mango/models/metrics.py +++ b/mango/models/metrics.py @@ -145,6 +145,8 @@ def precision_score( 0.5 """ if average == "binary": + if (y_pred == 1).sum() == 0: + return 0 return ((y_true == 1) & (y_pred == 1)).sum() / (y_pred == 1).sum() elif average == "macro": return ( @@ -210,6 +212,8 @@ def f1_score(y_true: pd.Series, y_pred: pd.Series, average: str = "binary") -> f """ precision = precision_score(y_true, y_pred, average=average) recall = recall_score(y_true, y_pred, average=average) + if precision + recall == 0: + return 0 return 2 * (precision * recall) / (precision + recall) @@ -284,8 +288,12 @@ def generate_metrics_classification( {'confusion_matrix': [[1, 1], [1, 1]], 'accuracy': 0.5, 'precision': 0.5, 'recall': 0.5, 'f1_score': 0.5} """ if len(y_true.unique()) == 2: + tp = int(((y_true == 1) & (y_pred == 1)).sum()) + tn = int(((y_true == 0) & (y_pred == 0)).sum()) + fp = int(((y_true == 0) & (y_pred == 1)).sum()) + fn = int(((y_true == 1) & (y_pred == 0)).sum()) return { - "confusion_matrix": confusion_matrix(y_true, y_pred).tolist(), + "confusion_matrix": [[tn, fp], [fn, tp]], "accuracy": round((y_true == y_pred).sum() / len(y_true), 4), "precision": round(precision_score(y_true, y_pred), 4), "recall": round(recall_score(y_true, y_pred), 4), diff --git a/mango/tests/models_module/test_experiment_tracking.py b/mango/tests/models_module/test_experiment_tracking.py index 1cde2058..c0dbfd0a 100644 --- a/mango/tests/models_module/test_experiment_tracking.py +++ b/mango/tests/models_module/test_experiment_tracking.py @@ -6,20 +6,27 @@ import numpy as np import pandas as pd from catboost import CatBoostClassifier, CatBoostRegressor -from pandas.testing import assert_frame_equal +from lightgbm import LGBMClassifier, LGBMRegressor +from pandas.testing import assert_frame_equal, assert_series_equal +from sklearn.compose import ColumnTransformer from 
sklearn.datasets import make_classification, make_regression from sklearn.linear_model import LinearRegression, LogisticRegression -from lightgbm import LGBMClassifier, LGBMRegressor +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import StandardScaler from mango.models.experiment_tracking import ( - export_model, ProblemType, + export_model, + MLExperiment, + MLTracker, ) +from mango.models.enums import ProblemType, ModelLibrary class InvalidModel: """ Dummy class to test errors """ + pass @@ -27,6 +34,7 @@ class TestExperimentTracking(TestCase): """ Tes suite for the experiment tracking module inside models. """ + folder_name = "test_experiment_tracking" @classmethod @@ -34,7 +42,6 @@ def setUpClass(cls): """ Create data for the tests and needed folders. """ - os.makedirs(cls.folder_name, exist_ok=True) # Classification X_clf, y_clf = make_classification( @@ -50,8 +57,8 @@ def setUpClass(cls): # Split cls.X_train_clf = X_clf[: int(len(X_clf) * 0.8)].reset_index(drop=True) cls.y_train_clf = y_clf[: int(len(y_clf) * 0.8)].reset_index(drop=True) - cls.X_test_clf = X_clf[int(len(X_clf) * 0.8):].reset_index(drop=True) - cls.y_test_clf = y_clf[int(len(y_clf) * 0.8):].reset_index(drop=True) + cls.X_test_clf = X_clf[int(len(X_clf) * 0.8) :].reset_index(drop=True) + cls.y_test_clf = y_clf[int(len(y_clf) * 0.8) :].reset_index(drop=True) # Regression X_reg, y_reg = make_regression(n_samples=1000, n_features=10, random_state=42) @@ -65,16 +72,1343 @@ def setUpClass(cls): # Split cls.X_train_reg = X_reg[: int(len(X_reg) * 0.8)].reset_index(drop=True) cls.y_train_reg = y_reg[: int(len(y_reg) * 0.8)].reset_index(drop=True) - cls.X_test_reg = X_reg[int(len(X_reg) * 0.8):].reset_index(drop=True) - cls.y_test_reg = y_reg[int(len(y_reg) * 0.8):].reset_index(drop=True) + cls.X_test_reg = X_reg[int(len(X_reg) * 0.8) :].reset_index(drop=True) + cls.y_test_reg = y_reg[int(len(y_reg) * 0.8) :].reset_index(drop=True) - @classmethod - def tearDownClass(cls): + # Binary Classification + X_bin_clf, y_bin_clf = make_classification( + n_samples=1000, n_features=10, random_state=42, n_classes=2, n_informative=5 + ) + X_bin_clf = pd.DataFrame(X_bin_clf, columns=[f"feature_{i}" for i in range(10)]) + y_bin_clf = pd.Series(y_bin_clf, name="target") + + # Shuffle + X_bin_clf = X_bin_clf.sample(frac=1, random_state=42) + y_bin_clf = y_bin_clf[X_bin_clf.index] + + # Split + cls.X_train_bin_clf = X_bin_clf[: int(len(X_bin_clf) * 0.8)].reset_index( + drop=True + ) + cls.y_train_bin_clf = y_bin_clf[: int(len(y_bin_clf) * 0.8)].reset_index( + drop=True + ) + cls.X_test_bin_clf = X_bin_clf[int(len(X_bin_clf) * 0.8) :].reset_index( + drop=True + ) + cls.y_test_bin_clf = y_bin_clf[int(len(y_bin_clf) * 0.8) :].reset_index( + drop=True + ) + + # Expected values for roc curve + cls.expected_tpr_logistic = [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.9797979797979798, + 0.9797979797979798, + 0.9797979797979798, + 0.9696969696969697, + 0.9696969696969697, + 0.9696969696969697, + 0.9595959595959596, + 0.9494949494949495, + 0.9393939393939394, + 0.9393939393939394, + 0.9393939393939394, + 0.9191919191919192, + 0.9090909090909091, + 0.9090909090909091, + 0.9090909090909091, + 0.898989898989899, + 0.8888888888888888, + 
0.8787878787878788, + 0.8787878787878788, + 0.8787878787878788, + 0.8686868686868687, + 0.8686868686868687, + 0.8686868686868687, + 0.8686868686868687, + 0.8484848484848485, + 0.8282828282828283, + 0.8080808080808081, + 0.797979797979798, + 0.797979797979798, + 0.7878787878787878, + 0.7878787878787878, + 0.7878787878787878, + 0.7878787878787878, + 0.7878787878787878, + 0.7878787878787878, + 0.7575757575757576, + 0.7373737373737373, + 0.7272727272727273, + 0.7272727272727273, + 0.7272727272727273, + 0.7171717171717171, + 0.6868686868686869, + 0.6868686868686869, + 0.6565656565656566, + 0.6565656565656566, + 0.6464646464646465, + 0.6363636363636364, + 0.6262626262626263, + 0.5959595959595959, + 0.5757575757575758, + 0.5656565656565656, + 0.5555555555555556, + 0.5454545454545454, + 0.5353535353535354, + 0.5252525252525253, + 0.5252525252525253, + 0.5151515151515151, + 0.47474747474747475, + 0.45454545454545453, + 0.4444444444444444, + 0.40404040404040403, + 0.37373737373737376, + 0.3333333333333333, + 0.3333333333333333, + 0.3333333333333333, + 0.29292929292929293, + 0.26262626262626265, + 0.25252525252525254, + 0.21212121212121213, + 0.1919191919191919, + 0.1717171717171717, + 0.1414141414141414, + 0.12121212121212122, + 0.09090909090909091, + 0.0707070707070707, + 0.04040404040404041, + 0.0, + ] + cls.expected_fpr_logistic = [ + 0.9504950495049505, + 0.8811881188118812, + 0.8415841584158416, + 0.7821782178217822, + 0.7425742574257426, + 0.7128712871287128, + 0.6831683168316832, + 0.6831683168316832, + 0.6435643564356436, + 0.6237623762376238, + 0.594059405940594, + 0.594059405940594, + 0.5841584158415841, + 0.5643564356435643, + 0.5445544554455446, + 0.5445544554455446, + 0.5148514851485149, + 0.504950495049505, + 0.4752475247524752, + 0.46534653465346537, + 0.45544554455445546, + 0.45544554455445546, + 0.43564356435643564, + 0.42574257425742573, + 0.39603960396039606, + 0.37623762376237624, + 0.36633663366336633, + 0.36633663366336633, + 0.3564356435643564, + 0.3465346534653465, + 0.33663366336633666, + 0.33663366336633666, + 0.32673267326732675, + 0.32673267326732675, + 0.31683168316831684, + 0.297029702970297, + 0.2871287128712871, + 0.27722772277227725, + 0.27722772277227725, + 0.26732673267326734, + 0.25742574257425743, + 0.25742574257425743, + 0.25742574257425743, + 0.2376237623762376, + 0.22772277227722773, + 0.22772277227722773, + 0.22772277227722773, + 0.21782178217821782, + 0.2079207920792079, + 0.2079207920792079, + 0.2079207920792079, + 0.2079207920792079, + 0.2079207920792079, + 0.19801980198019803, + 0.19801980198019803, + 0.19801980198019803, + 0.19801980198019803, + 0.1782178217821782, + 0.1782178217821782, + 0.16831683168316833, + 0.15841584158415842, + 0.15841584158415842, + 0.15841584158415842, + 0.1485148514851485, + 0.1485148514851485, + 0.1485148514851485, + 0.1485148514851485, + 0.1485148514851485, + 0.13861386138613863, + 0.13861386138613863, + 0.13861386138613863, + 0.13861386138613863, + 0.1188118811881188, + 0.10891089108910891, + 0.10891089108910891, + 0.09900990099009901, + 0.0891089108910891, + 0.0891089108910891, + 0.0891089108910891, + 0.0891089108910891, + 0.0891089108910891, + 0.0891089108910891, + 0.0891089108910891, + 0.07920792079207921, + 0.06930693069306931, + 0.0594059405940594, + 0.04950495049504951, + 0.039603960396039604, + 0.039603960396039604, + 0.039603960396039604, + 0.0297029702970297, + 0.0297029702970297, + 0.019801980198019802, + 0.019801980198019802, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + ] + cls.expected_precision_logistic = [ + 
0.5077, + 0.5266, + 0.538, + 0.5562, + 0.569, + 0.5789, + 0.5893, + 0.5893, + 0.6037, + 0.6087, + 0.6203, + 0.6203, + 0.6242, + 0.6323, + 0.6405, + 0.6405, + 0.6533, + 0.6577, + 0.6712, + 0.6759, + 0.6806, + 0.6806, + 0.6901, + 0.6929, + 0.708, + 0.7185, + 0.7218, + 0.7218, + 0.7273, + 0.7308, + 0.7344, + 0.7323, + 0.7381, + 0.7381, + 0.7398, + 0.75, + 0.7563, + 0.7627, + 0.7607, + 0.7652, + 0.7699, + 0.7699, + 0.7699, + 0.7818, + 0.789, + 0.789, + 0.789, + 0.7925, + 0.7961, + 0.7921, + 0.79, + 0.79, + 0.7879, + 0.7959, + 0.7959, + 0.7959, + 0.7959, + 0.8125, + 0.8065, + 0.8111, + 0.8182, + 0.8182, + 0.8182, + 0.8256, + 0.8193, + 0.8193, + 0.8125, + 0.8125, + 0.8205, + 0.8182, + 0.8158, + 0.8082, + 0.8261, + 0.8358, + 0.8333, + 0.8438, + 0.8548, + 0.8525, + 0.8525, + 0.85, + 0.8393, + 0.8333, + 0.8302, + 0.8333, + 0.8409, + 0.8462, + 0.8684, + 0.8919, + 0.8788, + 0.8667, + 0.8929, + 0.875, + 0.9048, + 0.8947, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 0, + ] + cls.expected_recall_logistic = [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9798, + 0.9798, + 0.9798, + 0.9697, + 0.9697, + 0.9697, + 0.9596, + 0.9495, + 0.9394, + 0.9394, + 0.9394, + 0.9192, + 0.9091, + 0.9091, + 0.9091, + 0.899, + 0.8889, + 0.8788, + 0.8788, + 0.8788, + 0.8687, + 0.8687, + 0.8687, + 0.8687, + 0.8485, + 0.8283, + 0.8081, + 0.798, + 0.798, + 0.7879, + 0.7879, + 0.7879, + 0.7879, + 0.7879, + 0.7879, + 0.7576, + 0.7374, + 0.7273, + 0.7273, + 0.7273, + 0.7172, + 0.6869, + 0.6869, + 0.6566, + 0.6566, + 0.6465, + 0.6364, + 0.6263, + 0.596, + 0.5758, + 0.5657, + 0.5556, + 0.5455, + 0.5354, + 0.5253, + 0.5253, + 0.5152, + 0.4747, + 0.4545, + 0.4444, + 0.404, + 0.3737, + 0.3333, + 0.3333, + 0.3333, + 0.2929, + 0.2626, + 0.2525, + 0.2121, + 0.1919, + 0.1717, + 0.1414, + 0.1212, + 0.0909, + 0.0707, + 0.0404, + 0.0, + ] + cls.expected_threshold_roc_curve_logistic = 0.47 + cls.expected_threshold_pr_logistic = 0.47 + + cls.expected_tpr_catboost = [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.9797979797979798, + 0.9797979797979798, + 0.9797979797979798, + 0.9797979797979798, + 0.9797979797979798, + 0.9797979797979798, + 0.9696969696969697, + 0.9696969696969697, + 0.9696969696969697, + 0.9696969696969697, + 0.9696969696969697, + 0.9696969696969697, + 0.9696969696969697, + 0.9595959595959596, + 0.9595959595959596, + 0.9595959595959596, + 0.9494949494949495, + 0.9494949494949495, + 0.9393939393939394, + 0.9292929292929293, + 0.9292929292929293, + 0.9292929292929293, + 0.9292929292929293, + 0.9191919191919192, + 0.9191919191919192, + 0.9090909090909091, + 0.8888888888888888, + 0.8787878787878788, + 0.8686868686868687, + 0.8383838383838383, + 0.8080808080808081, + 0.7777777777777778, + 0.7676767676767676, + 0.7373737373737373, + 0.7272727272727273, + 0.696969696969697, + 0.696969696969697, + 0.6666666666666666, + 
0.6464646464646465, + 0.6363636363636364, + 0.6161616161616161, + 0.5858585858585859, + 0.494949494949495, + 0.46464646464646464, + 0.3939393939393939, + 0.35353535353535354, + 0.31313131313131315, + 0.2828282828282828, + 0.1919191919191919, + 0.0707070707070707, + 0.010101010101010102, + 0.0, + 0.0, + ] + cls.expected_fpr_catboost = [ + 1.0, + 1.0, + 0.9306930693069307, + 0.8415841584158416, + 0.7128712871287128, + 0.594059405940594, + 0.5445544554455446, + 0.48514851485148514, + 0.44554455445544555, + 0.40594059405940597, + 0.38613861386138615, + 0.3465346534653465, + 0.33663366336633666, + 0.297029702970297, + 0.297029702970297, + 0.25742574257425743, + 0.2376237623762376, + 0.21782178217821782, + 0.21782178217821782, + 0.19801980198019803, + 0.18811881188118812, + 0.18811881188118812, + 0.1782178217821782, + 0.16831683168316833, + 0.16831683168316833, + 0.16831683168316833, + 0.16831683168316833, + 0.16831683168316833, + 0.1485148514851485, + 0.1485148514851485, + 0.1485148514851485, + 0.1485148514851485, + 0.1485148514851485, + 0.1485148514851485, + 0.13861386138613863, + 0.13861386138613863, + 0.13861386138613863, + 0.12871287128712872, + 0.12871287128712872, + 0.1188118811881188, + 0.10891089108910891, + 0.10891089108910891, + 0.09900990099009901, + 0.0891089108910891, + 0.0891089108910891, + 0.0891089108910891, + 0.0891089108910891, + 0.07920792079207921, + 0.07920792079207921, + 0.07920792079207921, + 0.06930693069306931, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.04950495049504951, + 0.039603960396039604, + 0.039603960396039604, + 0.039603960396039604, + 0.039603960396039604, + 0.0297029702970297, + 0.0297029702970297, + 0.0297029702970297, + 0.0297029702970297, + 0.0297029702970297, + 0.0297029702970297, + 0.0297029702970297, + 0.019801980198019802, + 0.009900990099009901, + 0.009900990099009901, + 0.009900990099009901, + 0.009900990099009901, + 0.009900990099009901, + 0.009900990099009901, + 0.009900990099009901, + 0.009900990099009901, + 0.009900990099009901, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + ] + cls.expected_precision_catboost = [ + 0.495, + 0.495, + 0.513, + 0.538, + 0.5789, + 0.6226, + 0.6429, + 0.6689, + 0.6875, + 0.7071, + 0.7174, + 0.7388, + 0.7444, + 0.7674, + 0.7674, + 0.792, + 0.8049, + 0.8182, + 0.8182, + 0.8319, + 0.839, + 0.839, + 0.8462, + 0.8534, + 0.8534, + 0.8522, + 0.8522, + 0.8522, + 0.8673, + 0.8673, + 0.8673, + 0.8673, + 0.8673, + 0.8673, + 0.875, + 0.875, + 0.875, + 0.8829, + 0.8829, + 0.8909, + 0.8991, + 0.8991, + 0.9074, + 0.9159, + 0.9159, + 0.9159, + 0.9159, + 0.9238, + 0.9238, + 0.9238, + 0.9327, + 0.9417, + 0.9417, + 0.9412, + 0.9412, + 0.9412, + 0.9412, + 0.9412, + 0.9412, + 0.9412, + 0.9406, + 0.9406, + 0.9406, + 0.94, + 0.94, + 0.9394, + 0.9388, + 0.9388, + 0.9388, + 0.9388, + 0.9381, + 0.9381, + 0.9375, + 0.9462, + 0.956, + 0.9556, + 0.954, + 0.9524, + 0.9625, + 0.962, + 0.9605, + 0.96, + 0.9583, + 0.9583, + 0.9565, + 0.9697, + 0.9844, + 0.9839, + 0.9831, + 0.98, + 0.9787, + 0.975, + 0.9722, + 0.9688, + 0.9655, + 1.0, + 1.0, + 1.0, + 0, + 0, + ] + cls.expected_recall_catboost = [ + 1.0, + 1.0, + 1.0, + 1.0, 
+ 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9798, + 0.9798, + 0.9798, + 0.9798, + 0.9798, + 0.9798, + 0.9697, + 0.9697, + 0.9697, + 0.9697, + 0.9697, + 0.9697, + 0.9697, + 0.9596, + 0.9596, + 0.9596, + 0.9495, + 0.9495, + 0.9394, + 0.9293, + 0.9293, + 0.9293, + 0.9293, + 0.9192, + 0.9192, + 0.9091, + 0.8889, + 0.8788, + 0.8687, + 0.8384, + 0.8081, + 0.7778, + 0.7677, + 0.7374, + 0.7273, + 0.697, + 0.697, + 0.6667, + 0.6465, + 0.6364, + 0.6162, + 0.5859, + 0.4949, + 0.4646, + 0.3939, + 0.3535, + 0.3131, + 0.2828, + 0.1919, + 0.0707, + 0.0101, + 0.0, + 0.0, + ] + cls.expected_threshold_pr_catboost = 0.53 + cls.expected_threshold_roc_curve_catboost = 0.53 + + cls.expected_tpr_lightgbm = [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.98989898989899, + 0.9797979797979798, + 0.9797979797979798, + 0.9797979797979798, + 0.9797979797979798, + 0.9797979797979798, + 0.9797979797979798, + 0.9797979797979798, + 0.9797979797979798, + 0.9797979797979798, + 0.9797979797979798, + 0.9797979797979798, + 0.9696969696969697, + 0.9595959595959596, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9494949494949495, + 0.9393939393939394, + 0.9393939393939394, + 0.9393939393939394, + 0.9393939393939394, + 0.9292929292929293, + 0.9292929292929293, + 0.9292929292929293, + 0.9191919191919192, + 0.9191919191919192, + 0.9191919191919192, + 0.9191919191919192, + 0.9191919191919192, + 0.9090909090909091, + 0.9090909090909091, + 0.9090909090909091, + 0.9090909090909091, + 0.9090909090909091, + 0.898989898989899, + 0.8888888888888888, + 0.8888888888888888, + 0.8787878787878788, + 0.8787878787878788, + 0.8787878787878788, + 0.8787878787878788, + 0.8686868686868687, + 0.8686868686868687, + 0.8686868686868687, + 0.8484848484848485, + 0.8383838383838383, + 0.7878787878787878, + 0.7171717171717171, + 0.0, + ] + cls.expected_fpr_lightgbm = [ + 0.2376237623762376, + 0.21782178217821782, + 0.1782178217821782, + 0.16831683168316833, + 0.1485148514851485, + 0.13861386138613863, + 0.13861386138613863, + 0.12871287128712872, + 0.1188118811881188, + 0.1188118811881188, + 0.09900990099009901, + 0.09900990099009901, + 0.09900990099009901, + 0.09900990099009901, + 0.0891089108910891, + 0.0891089108910891, + 0.0891089108910891, + 0.0891089108910891, + 0.0891089108910891, + 0.0891089108910891, + 0.0891089108910891, + 0.0891089108910891, + 0.0891089108910891, + 
0.0891089108910891, + 0.0891089108910891, + 0.0891089108910891, + 0.0891089108910891, + 0.0891089108910891, + 0.0891089108910891, + 0.0891089108910891, + 0.0891089108910891, + 0.07920792079207921, + 0.07920792079207921, + 0.06930693069306931, + 0.06930693069306931, + 0.06930693069306931, + 0.06930693069306931, + 0.06930693069306931, + 0.06930693069306931, + 0.06930693069306931, + 0.06930693069306931, + 0.06930693069306931, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.0594059405940594, + 0.04950495049504951, + 0.04950495049504951, + 0.04950495049504951, + 0.04950495049504951, + 0.04950495049504951, + 0.04950495049504951, + 0.04950495049504951, + 0.04950495049504951, + 0.04950495049504951, + 0.04950495049504951, + 0.04950495049504951, + 0.04950495049504951, + 0.04950495049504951, + 0.04950495049504951, + 0.04950495049504951, + 0.039603960396039604, + 0.039603960396039604, + 0.039603960396039604, + 0.039603960396039604, + 0.039603960396039604, + 0.039603960396039604, + 0.039603960396039604, + 0.039603960396039604, + 0.039603960396039604, + 0.039603960396039604, + 0.039603960396039604, + 0.039603960396039604, + 0.039603960396039604, + 0.039603960396039604, + 0.039603960396039604, + 0.039603960396039604, + 0.039603960396039604, + 0.039603960396039604, + 0.039603960396039604, + 0.039603960396039604, + 0.039603960396039604, + 0.039603960396039604, + 0.039603960396039604, + 0.039603960396039604, + 0.0297029702970297, + 0.0297029702970297, + 0.0297029702970297, + 0.0297029702970297, + 0.019801980198019802, + 0.019801980198019802, + 0.0, + ] + cls.expected_precision_lightgbm = [ + 0.8049, + 0.8182, + 0.8462, + 0.8534, + 0.8684, + 0.8761, + 0.8761, + 0.8839, + 0.8919, + 0.8919, + 0.9083, + 0.9083, + 0.9074, + 0.9074, + 0.9159, + 0.9159, + 0.9159, + 0.9159, + 0.9159, + 0.9159, + 0.9159, + 0.9159, + 0.9159, + 0.9159, + 0.9151, + 0.9151, + 0.9151, + 0.9151, + 0.9151, + 0.9151, + 0.9151, + 0.9238, + 0.9238, + 0.9327, + 0.9327, + 0.932, + 0.9314, + 0.9307, + 0.9307, + 0.9307, + 0.9307, + 0.9307, + 0.94, + 0.94, + 0.94, + 0.94, + 0.94, + 0.94, + 0.94, + 0.94, + 0.94, + 0.94, + 0.94, + 0.94, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.949, + 0.9588, + 0.9588, + 0.9588, + 0.9583, + 0.9583, + 0.9583, + 0.9579, + 0.9579, + 0.9579, + 0.9579, + 0.9579, + 0.9574, + 0.9574, + 0.9574, + 0.9574, + 0.9574, + 0.957, + 0.9565, + 0.9565, + 0.956, + 0.956, + 0.956, + 0.956, + 0.9556, + 0.9663, + 0.9663, + 0.9655, + 0.9651, + 0.975, + 0.9726, + 0, + ] + cls.expected_recall_lightgbm = [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9899, + 0.9798, + 0.9798, + 0.9798, + 0.9798, + 0.9798, + 0.9798, + 0.9798, + 0.9798, + 0.9798, + 0.9798, + 0.9798, + 0.9697, + 0.9596, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9495, + 0.9394, + 0.9394, + 0.9394, + 0.9394, + 0.9293, + 0.9293, + 0.9293, + 0.9192, + 0.9192, + 0.9192, + 0.9192, + 
0.9192, + 0.9091, + 0.9091, + 0.9091, + 0.9091, + 0.9091, + 0.899, + 0.8889, + 0.8889, + 0.8788, + 0.8788, + 0.8788, + 0.8788, + 0.8687, + 0.8687, + 0.8687, + 0.8485, + 0.8384, + 0.7879, + 0.7172, + 0.0, + ] + cls.expected_threshold_pr_lightgbm = 0.35 + cls.expected_threshold_roc_curve_lightgbm = 0.68 + + # Feature importance + cls.expected_feature_importance_logistic = pd.Series( + { + "feature_3": 0.878184846, + "feature_0": 0.6807735248, + "feature_8": 0.0515334867, + "feature_2": 0.0358143277, + "feature_9": -0.0381204653, + "feature_6": -0.0883332814, + "feature_7": -0.1301885331, + "feature_1": -0.275092971, + "feature_4": -0.6430405339, + "feature_5": -0.7505753324, + } + ) + cls.expected_feature_importance_catboost = pd.Series( + { + "feature_4": 44.3253726823, + "feature_5": 20.1107110351, + "feature_0": 15.1218134278, + "feature_3": 10.9642748095, + "feature_9": 3.3279302184, + "feature_1": 2.6251848816, + "feature_6": 2.178141271, + "feature_7": 0.8941374171, + "feature_8": 0.2557275556, + "feature_2": 0.1967067016, + } + ) + cls.expected_feature_importance_lightgbm = pd.Series( + { + "feature_0": 544, + "feature_4": 515, + "feature_5": 412, + "feature_9": 317, + "feature_3": 288, + "feature_1": 215, + "feature_7": 186, + "feature_6": 185, + "feature_2": 178, + "feature_8": 142, + } + ) + + cls.expected_feature_importance_catboost_regression = pd.Series( + { + "feature_1": 28.8642418698, + "feature_5": 26.0319957349, + "feature_9": 21.1700903921, + "feature_2": 8.0373088615, + "feature_0": 6.5587814183, + "feature_6": 5.7264275476, + "feature_7": 1.4749623714, + "feature_3": 1.0968849531, + "feature_8": 0.8971516205, + "feature_4": 0.1421552306, + } + ) + cls.expected_metrics_catboost_regression = { + "r2_score": 0.908, + "mean_absolute_error": 30.17, + "mean_squared_error": 1602.8024, + "root_mean_squared_error": 40.035, + "median_absolute_error": 24.5409, + } + + def setUp(self): + os.makedirs(self.folder_name, exist_ok=True) + + def tearDown(self): """ Delete the folders created for the tests. 
""" - if os.path.exists(cls.folder_name): - shutil.rmtree(cls.folder_name) + if os.path.exists(self.folder_name): + shutil.rmtree(self.folder_name) def _check_model_with_zip(self, output_folder): """ @@ -143,21 +1477,35 @@ def _check_model_without_zip(self, model, output_folder, problem_type): self.assertFalse(os.path.exists(os.path.join(output_folder, "datasets.zip"))) # Assert files are valid for data folder X_train = pd.read_csv(os.path.join(output_folder, "datasets", "X_train.csv")) - y_train = pd.read_csv(os.path.join(output_folder, "datasets", "y_train.csv")).values + y_train = pd.read_csv( + os.path.join(output_folder, "datasets", "y_train.csv") + ).values X_test = pd.read_csv(os.path.join(output_folder, "datasets", "X_test.csv")) - y_test = pd.read_csv(os.path.join(output_folder, "datasets", "y_test.csv")).values + y_test = pd.read_csv( + os.path.join(output_folder, "datasets", "y_test.csv") + ).values if problem_type == ProblemType.CLASSIFICATION: assert_frame_equal(X_train, self.X_train_clf) - self.assertListEqual(list([y for y in y_train.reshape(-1)]), list([y for y in self.y_train_clf.values])) + self.assertListEqual( + list([y for y in y_train.reshape(-1)]), + list([y for y in self.y_train_clf.values]), + ) assert_frame_equal(X_test, self.X_test_clf) - self.assertListEqual(list([y for y in y_test.reshape(-1)]), list([y for y in self.y_test_clf.values])) + self.assertListEqual( + list([y for y in y_test.reshape(-1)]), + list([y for y in self.y_test_clf.values]), + ) elif problem_type == ProblemType.REGRESSION: assert_frame_equal(X_train, self.X_train_reg) - self.assertListEqual(list([round(y, 4) for y in y_train.reshape(-1)]), - list([round(y, 4) for y in self.y_train_reg.values])) + self.assertListEqual( + list([round(y, 4) for y in y_train.reshape(-1)]), + list([round(y, 4) for y in self.y_train_reg.values]), + ) assert_frame_equal(X_test, self.X_test_reg) - self.assertListEqual(list([round(y, 4) for y in y_test.reshape(-1)]), - list([round(y, 4) for y in self.y_test_reg.values])) + self.assertListEqual( + list([round(y, 4) for y in y_test.reshape(-1)]), + list([round(y, 4) for y in self.y_test_reg.values]), + ) else: raise ValueError("Problem type not supported") # Assert model is the same @@ -189,7 +1537,11 @@ def test_serialize_sklearn(self): save_datasets=True, zip_files=False, ) - self._check_model_without_zip(output_folder=output_folder, model=model, problem_type=ProblemType.REGRESSION) + self._check_model_without_zip( + output_folder=output_folder, + model=model, + problem_type=ProblemType.REGRESSION, + ) # Assert works for classification with Zip model = LogisticRegression() model.fit(self.X_train_clf, self.y_train_clf) @@ -223,7 +1575,11 @@ def test_serialize_catboost(self): save_datasets=True, zip_files=False, ) - self._check_model_without_zip(output_folder=output_folder, model=model, problem_type=ProblemType.CLASSIFICATION) + self._check_model_without_zip( + output_folder=output_folder, + model=model, + problem_type=ProblemType.CLASSIFICATION, + ) # Assert works for regression with Zip model = CatBoostRegressor(allow_writing_files=False, verbose=5, iterations=10) @@ -241,6 +1597,44 @@ def test_serialize_catboost(self): ) self._check_model_with_zip(output_folder=output_folder) + def test_serialize_pipeline_with_catboost(self): + """ + Test serialization of a pipeline with CatBoost model. 
+ """ + col_transformer = ColumnTransformer( + [ + ("num", StandardScaler(), self.X_train_clf.columns), + ] + ) + model = Pipeline( + [ + ("col_transformer", col_transformer), + ( + "model", + CatBoostClassifier( + allow_writing_files=False, verbose=5, iterations=10 + ), + ), + ] + ) + model.fit(self.X_train_clf, self.y_train_clf) + output_folder = export_model( + model, + self.X_train_clf, + self.y_train_clf, + self.X_test_clf, + self.y_test_clf, + self.folder_name, + save_model=True, + save_datasets=True, + zip_files=False, + ) + self._check_model_without_zip( + output_folder=output_folder, + model=model, + problem_type=ProblemType.CLASSIFICATION, + ) + def test_serialize_lightgbm(self): """ Test serialization of a LightGBM model. @@ -258,7 +1652,11 @@ def test_serialize_lightgbm(self): save_datasets=True, zip_files=False, ) - self._check_model_without_zip(output_folder=output_folder, model=model, problem_type=ProblemType.CLASSIFICATION) + self._check_model_without_zip( + output_folder=output_folder, + model=model, + problem_type=ProblemType.CLASSIFICATION, + ) # Assert works for regression with Zip model = LGBMRegressor() @@ -308,3 +1706,384 @@ def test_errors(self): save_datasets=True, zip_files=False, ) + + def assert_ml_experiment_init_correct( + self, + experiment, + full_model, + X_train, + y_train, + X_test, + y_test, + name, + description, + problem_type, + base_library, + base_model, + num_classes=None, + config=None, + ): + self.assertEqual(experiment.model, full_model) + assert_frame_equal(experiment.X_train, X_train) + assert_frame_equal(experiment.X_test, X_test) + assert_series_equal(experiment.y_train, y_train) + assert_series_equal(experiment.y_test, y_test) + self.assertEqual(experiment.problem_type, problem_type) + self.assertEqual(experiment.name, name) + self.assertEqual(experiment.description, description) + self.assertEqual(experiment.base_model, base_model) + self.assertEqual(experiment.base_model_library, base_library) + self.assertEqual(experiment.num_classes, num_classes) + self.assertEqual(experiment._config, config) + + def test_ml_experiment_errors(self): + # Create Logistic Regression model + model = LogisticRegression() + model.fit(self.X_train_clf, self.y_train_clf) + # Create experiment + experiment = MLExperiment( + model=model, + X_train=self.X_train_clf, + y_train=self.y_train_clf, + X_test=self.X_test_clf, + y_test=self.y_test_clf, + problem_type="classification", + name="Test sklearn experiment", + description="Test sklearn experiment", + ) + # Assert experiment is created correctly + self.assert_ml_experiment_init_correct( + experiment=experiment, + full_model=model, + X_train=self.X_train_clf, + y_train=self.y_train_clf, + X_test=self.X_test_clf, + y_test=self.y_test_clf, + name="Test sklearn experiment", + description="Test sklearn experiment", + problem_type=ProblemType.CLASSIFICATION, + base_library=ModelLibrary.SCIKIT_LEARN, + base_model=model, # Not a pipeline, hence, same as full model + num_classes=3, + ) + # Not implemented for n_classes > 2 + self.assertRaises(NotImplementedError, experiment.plot_roc_curve, show=False) + self.assertRaises(NotImplementedError, experiment._calc_roc_curve_data) + + def _check_threshold_calculation( + self, + experiment, + expected_tpr, + expected_fpr, + expected_precision, + expected_recall, + expected_threshold_pr, + expected_threshold_roc_curve, + ): + """ + Test the threshold calculation for the roc curve or pr + :param experiment: + :return: + """ + self.assertEqual(experiment._tpr_list, expected_tpr) + 
self.assertEqual(experiment._fpr_list, expected_fpr) + self.assertEqual(experiment._precision_list, expected_precision) + self.assertEqual(experiment._recall_list, expected_recall) + self.assertEqual(experiment.best_threshold_pr_curve, expected_threshold_pr) + self.assertEqual( + experiment.best_threshold_roc_curve, expected_threshold_roc_curve + ) + + def _check_feature_importance(self, experiment, expected_feature_importance): + feature_importance = experiment.get_feature_importance() + assert_series_equal( + feature_importance, expected_feature_importance, check_dtype=False + ) + + def test_ml_experiment_sklearn(self): + # Create Logistic Regression model + model = LogisticRegression(random_state=33) + model.fit(self.X_train_bin_clf, self.y_train_bin_clf) + # Create experiment + experiment = MLExperiment( + model=model, + X_train=self.X_train_bin_clf, + y_train=self.y_train_bin_clf, + X_test=self.X_test_bin_clf, + y_test=self.y_test_bin_clf, + problem_type="classification", + name="Test sklearn experiment", + description="Test sklearn experiment", + ) + # Assert experiment is created correctly + self.assert_ml_experiment_init_correct( + experiment=experiment, + full_model=model, + X_train=self.X_train_bin_clf, + y_train=self.y_train_bin_clf, + X_test=self.X_test_bin_clf, + y_test=self.y_test_bin_clf, + name="Test sklearn experiment", + description="Test sklearn experiment", + problem_type=ProblemType.CLASSIFICATION, + base_library=ModelLibrary.SCIKIT_LEARN, + base_model=model, # Not a pipeline, hence, same as full model + num_classes=2, + ) + + # Roc curve + self._check_threshold_calculation( + experiment=experiment, + expected_tpr=self.expected_tpr_logistic, + expected_fpr=self.expected_fpr_logistic, + expected_precision=self.expected_precision_logistic, + expected_recall=self.expected_recall_logistic, + expected_threshold_pr=self.expected_threshold_pr_logistic, + expected_threshold_roc_curve=self.expected_threshold_roc_curve_logistic, + ) + + # Feature importance + self._check_feature_importance( + experiment, + expected_feature_importance=self.expected_feature_importance_logistic, + ) + + def test_ml_experiment_catboost(self): + # Create CatBoost model + model = CatBoostClassifier( + random_state=33, verbose=5, iterations=10, allow_writing_files=False + ) + model.fit(self.X_train_bin_clf, self.y_train_bin_clf) + # Create experiment + experiment = MLExperiment( + model=model, + X_train=self.X_train_bin_clf, + y_train=self.y_train_bin_clf, + X_test=self.X_test_bin_clf, + y_test=self.y_test_bin_clf, + problem_type="classification", + name="Test catboost experiment", + description="Test catboost experiment", + ) + # Assert experiment is created correctly + self.assert_ml_experiment_init_correct( + experiment=experiment, + full_model=model, + X_train=self.X_train_bin_clf, + y_train=self.y_train_bin_clf, + X_test=self.X_test_bin_clf, + y_test=self.y_test_bin_clf, + name="Test catboost experiment", + description="Test catboost experiment", + problem_type=ProblemType.CLASSIFICATION, + base_library=ModelLibrary.CATBOOST, + base_model=model, # Not a pipeline, hence, same as full model + num_classes=2, + ) + + # Roc curve + self._check_threshold_calculation( + experiment, + expected_tpr=self.expected_tpr_catboost, + expected_fpr=self.expected_fpr_catboost, + expected_precision=self.expected_precision_catboost, + expected_recall=self.expected_recall_catboost, + expected_threshold_pr=self.expected_threshold_pr_catboost, + expected_threshold_roc_curve=self.expected_threshold_roc_curve_catboost, + 
) + + # Feature importance + self._check_feature_importance( + experiment, + expected_feature_importance=self.expected_feature_importance_catboost, + ) + + # Test with regression + # Create CatBoost model + model = CatBoostRegressor( + random_state=33, verbose=5, iterations=10, allow_writing_files=False + ) + model.fit(self.X_train_reg, self.y_train_reg) + # Create experiment + experiment = MLExperiment( + model=model, + X_train=self.X_train_reg, + y_train=self.y_train_reg, + X_test=self.X_test_reg, + y_test=self.y_test_reg, + problem_type="regression", + name="Test catboost experiment", + description="Test catboost experiment", + ) + # Assert experiment is created correctly + self.assert_ml_experiment_init_correct( + experiment=experiment, + full_model=model, + X_train=self.X_train_reg, + y_train=self.y_train_reg, + X_test=self.X_test_reg, + y_test=self.y_test_reg, + name="Test catboost experiment", + description="Test catboost experiment", + problem_type=ProblemType.REGRESSION, + base_library=ModelLibrary.CATBOOST, + base_model=model, # Not a pipeline, hence, same as full model + ) + + # Feature importance + self._check_feature_importance( + experiment, + expected_feature_importance=self.expected_feature_importance_catboost_regression, + ) + + # Metrics + self.assertDictEqual( + experiment.metrics, + self.expected_metrics_catboost_regression, + ) + + def test_ml_experiment_pipeline_with_catboost(self): + # Create CatBoost model + model = CatBoostClassifier( + random_state=33, verbose=5, iterations=10, allow_writing_files=False + ) + col_transformer = ColumnTransformer( + [ + ("num", "passthrough", self.X_train_bin_clf.columns), + ], + verbose_feature_names_out=False, + ) + # Create pipeline + pipeline = Pipeline( + [ + ("col_transformer", col_transformer), + ("model", model), + ] + ) + pipeline.fit(self.X_train_bin_clf, self.y_train_bin_clf) + # Create experiment + experiment = MLExperiment( + model=pipeline, + X_train=self.X_train_bin_clf, + y_train=self.y_train_bin_clf, + X_test=self.X_test_bin_clf, + y_test=self.y_test_bin_clf, + problem_type="classification", + name="Test catboost experiment", + description="Test catboost experiment", + ) + # Assert experiment is created correctly + self.assert_ml_experiment_init_correct( + experiment=experiment, + full_model=pipeline, + X_train=self.X_train_bin_clf, + y_train=self.y_train_bin_clf, + X_test=self.X_test_bin_clf, + y_test=self.y_test_bin_clf, + name="Test catboost experiment", + description="Test catboost experiment", + problem_type=ProblemType.CLASSIFICATION, + base_library=ModelLibrary.CATBOOST, + base_model=model, + num_classes=2, + ) + + # Roc curve + self._check_threshold_calculation( + experiment, + expected_tpr=self.expected_tpr_catboost, + expected_fpr=self.expected_fpr_catboost, + expected_precision=self.expected_precision_catboost, + expected_recall=self.expected_recall_catboost, + expected_threshold_pr=self.expected_threshold_pr_catboost, + expected_threshold_roc_curve=self.expected_threshold_roc_curve_catboost, + ) + + # Feature importance + self._check_feature_importance( + experiment, + expected_feature_importance=self.expected_feature_importance_catboost, + ) + + def test_ml_experiment_lightgbm(self): + # Create LightGBM model + model = LGBMClassifier(random_state=33) + model.fit(self.X_train_bin_clf, self.y_train_bin_clf) + # Create experiment + experiment = MLExperiment( + model=model, + X_train=self.X_train_bin_clf, + y_train=self.y_train_bin_clf, + X_test=self.X_test_bin_clf, + y_test=self.y_test_bin_clf, + 
problem_type="classification", + name="Test lightgbm experiment", + description="Test lightgbm experiment", + ) + # Assert experiment is created correctly + self.assert_ml_experiment_init_correct( + experiment=experiment, + full_model=model, + X_train=self.X_train_bin_clf, + y_train=self.y_train_bin_clf, + X_test=self.X_test_bin_clf, + y_test=self.y_test_bin_clf, + name="Test lightgbm experiment", + description="Test lightgbm experiment", + problem_type=ProblemType.CLASSIFICATION, + base_library=ModelLibrary.LIGHTGBM, + base_model=model, # Not a pipeline, hence, same as full model + num_classes=2, + ) + + # Roc curve + self._check_threshold_calculation( + experiment, + expected_tpr=self.expected_tpr_lightgbm, + expected_fpr=self.expected_fpr_lightgbm, + expected_precision=self.expected_precision_lightgbm, + expected_recall=self.expected_recall_lightgbm, + expected_threshold_pr=self.expected_threshold_pr_lightgbm, + expected_threshold_roc_curve=self.expected_threshold_roc_curve_lightgbm, + ) + + # Feature importance + self._check_feature_importance( + experiment, + expected_feature_importance=self.expected_feature_importance_lightgbm, + ) + + def test_ml_tracker_add_experiment(self): + ml_tracker = MLTracker(experiment_folder=self.folder_name) + + # Create a couple of experiments for the binary classification problem + # Create Logistic Regression model + model = LogisticRegression(random_state=33) + model.fit(self.X_train_bin_clf, self.y_train_bin_clf) + + # Create experiment + experiment = MLExperiment( + model=model, + X_train=self.X_train_bin_clf, + y_train=self.y_train_bin_clf, + X_test=self.X_test_bin_clf, + y_test=self.y_test_bin_clf, + problem_type="classification", + name="Test sklearn experiment", + description="Test sklearn experiment", + ) + + ml_tracker.add_experiment(experiment) + + # Assert experiment in tracker + self.assertEqual(len(ml_tracker.experiments), 1) + + # Test scan for experiments + ml_tracker_new = MLTracker(experiment_folder=self.folder_name) + + ml_tracker_new.scan_for_experiments() + + self.assertEqual(len(ml_tracker_new.experiments), 1) + + # Assert experiments are equal + self.assertEqual(list(ml_tracker_new.experiments.values())[0], experiment) From b6da2a5d21a66deff709476c7e7b646d01bab7a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?AntonioGonz=C3=A1lez?= Date: Tue, 23 Jan 2024 11:45:31 +0100 Subject: [PATCH 08/14] Bugfix with fig.close() --- mango/models/experiment_tracking.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mango/models/experiment_tracking.py b/mango/models/experiment_tracking.py index cc316f25..c9af45a3 100644 --- a/mango/models/experiment_tracking.py +++ b/mango/models/experiment_tracking.py @@ -1334,7 +1334,7 @@ def create_plots( ) if show_plots: fig.show() - fig.close() + plt.close() fig, ax = experiment.plot_feature_importance() ax.set_title(experiment_name + "_" + ax.get_title()) fig.savefig( @@ -1344,7 +1344,7 @@ def create_plots( ) if show_plots: fig.show() - fig.close() + plt.close() return None From 5f0991e31875e6ea810c14556b1f2e308df57bc0 Mon Sep 17 00:00:00 2001 From: AntonioGonzalezSuarez <112699773+AntonioGonzalezSuarez@users.noreply.github.com> Date: Mon, 5 Feb 2024 10:50:51 +0100 Subject: [PATCH 09/14] Update experiment_tracking.py --- mango/models/experiment_tracking.py | 304 +++++++++++++++++++++++++--- 1 file changed, 272 insertions(+), 32 deletions(-) diff --git a/mango/models/experiment_tracking.py b/mango/models/experiment_tracking.py index c9af45a3..10197dcb 100644 --- 
a/mango/models/experiment_tracking.py +++ b/mango/models/experiment_tracking.py @@ -54,6 +54,8 @@ def export_model( y_train: pd.Series, X_test: pd.DataFrame, y_test: pd.Series, + X_validation: pd.DataFrame, + y_validation: pd.Series, base_path: str, custom_metrics: dict = None, description: str = None, @@ -75,6 +77,10 @@ def export_model( :type X_test: :class:`pandas.DataFrame` :param y_test: Test target as a pandas series. :type y_test: :class:`pandas.Series` + :param X_validation: Validation data as a pandas dataframe. + :type X_validation: :class:`pandas.DataFrame` + :param y_validation: Validation target as a pandas series. + :type y_validation: :class:`pandas.Series` :param description: Description of the experiment. :type description: :class:`str` :param base_path: Path to the base folder where the model and datasets will be saved in a subfolder structure. @@ -162,7 +168,7 @@ def export_model( summary["model"]["library"] = model_library.value if model_library == ModelLibrary.CATBOOST: if pipeline is not None: - summary["model"]["input"] = list(col_transformer.get_feature_names_out()) + summary["model"]["input"] = list(col_transformer.feature_names_in_) summary["model"]["hyperparameters"] = pipeline.get_params(deep=True) else: summary["model"]["hyperparameters"] = model.get_all_params() @@ -170,14 +176,14 @@ def export_model( elif model_library == ModelLibrary.SCIKIT_LEARN: if pipeline is not None: - summary["model"]["input"] = list(col_transformer.get_feature_names_out()) + summary["model"]["input"] = list(col_transformer.feature_names_in_) summary["model"]["hyperparameters"] = pipeline.get_params(deep=True) else: summary["model"]["input"] = list(model.feature_names_in_) summary["model"]["hyperparameters"] = model.get_params(deep=True) elif model_library == ModelLibrary.LIGHTGBM: if pipeline is not None: - summary["model"]["input"] = list(col_transformer.get_feature_names_out()) + summary["model"]["input"] = list(col_transformer.feature_names_in_) summary["model"]["hyperparameters"] = pipeline.get_params(deep=True) else: summary["model"]["input"] = list(model.feature_name_) @@ -228,9 +234,15 @@ def export_model( y_test_pred = pd.Series(model.predict(X_test).reshape(-1)).reset_index( drop=True ) + y_validation_pred = pd.Series( + model.predict(X_validation).reshape(-1) + ).reset_index(drop=True) elif model_library in [ModelLibrary.SCIKIT_LEARN, ModelLibrary.LIGHTGBM]: y_train_pred = pd.Series(model.predict(X_train)).reset_index(drop=True) y_test_pred = pd.Series(model.predict(X_test)).reset_index(drop=True) + y_validation_pred = pd.Series(model.predict(X_validation)).reset_index( + drop=True + ) if problem_type == ProblemType.CLASSIFICATION: if not custom_metrics: @@ -241,6 +253,9 @@ def export_model( "test_score": generate_metrics_classification( y_test.reset_index(drop=True), y_test_pred ), + "validation_score": generate_metrics_classification( + y_validation.reset_index(drop=True), y_validation_pred + ), } else: summary["results"] = custom_metrics @@ -252,6 +267,9 @@ def export_model( "test_score": generate_metrics_regression( y_test.reset_index(drop=True), y_test_pred ), + "validation_score": generate_metrics_regression( + y_validation.reset_index(drop=True), y_validation_pred + ), } # Prepare environment to save files @@ -318,6 +336,20 @@ def export_model( summary["files"]["datasets"]["y_test"]["path"] = os.path.abspath(y_test_path) summary["files"]["datasets"]["y_test"]["shape"] = y_test.shape y_test.to_csv(y_test_path, index=False) + X_validation_path = 
os.path.join(folder_name, "datasets", "X_validation.csv") + summary["files"]["datasets"]["X_validation"] = {} + summary["files"]["datasets"]["X_validation"]["path"] = os.path.abspath( + X_validation_path + ) + summary["files"]["datasets"]["X_validation"]["shape"] = X_validation.shape + X_validation.to_csv(X_validation_path, index=False) + y_validation_path = os.path.join(folder_name, "datasets", "y_validation.csv") + summary["files"]["datasets"]["y_validation"] = {} + summary["files"]["datasets"]["y_validation"]["path"] = os.path.abspath( + y_validation_path + ) + summary["files"]["datasets"]["y_validation"]["shape"] = y_validation.shape + y_validation.to_csv(y_validation_path, index=False) if zip_files: # Compress data and save zip_path = os.path.join(folder_name, "datasets.zip") @@ -391,6 +423,8 @@ def __init__( y_train: Optional[pd.Series] = None, X_test: Optional[pd.DataFrame] = None, y_test: Optional[pd.Series] = None, + X_validation: Optional[pd.DataFrame] = None, + y_validation: Optional[pd.Series] = None, model: Any = None, problem_type: Union[str, ProblemType] = None, name: str = None, @@ -444,6 +478,8 @@ def __init__( self.y_train = y_train self.X_test = X_test self.y_test = y_test + self.X_validation = X_validation + self.y_validation = y_validation if self.problem_type == ProblemType.CLASSIFICATION: self.num_classes = len(self.y_test.unique()) @@ -451,7 +487,7 @@ def __init__( self.imbalance = ( self.y_train.value_counts().values[1] / self.y_train.value_counts().values[0] - < 0.2 + < 0.35 ) # Private properties @@ -461,6 +497,7 @@ def __init__( self._recall_list = None self._config = None self._is_pipeline = isinstance(self.model, self.pipeline_class) + self._model_input_cols = self._get_model_input_cols() # Final Setup self._set_base_model_and_library() @@ -587,6 +624,39 @@ def y_test(self, value): raise ValueError("y_train must be a pandas Series.") self._y_test = value + @property + def X_validation(self) -> pd.DataFrame: + """ + Test data. + """ + return self._X_validation + + @X_validation.setter + def X_validation(self, value): + if value is None: + raise ValueError("X_test cannot be None.") + self._X_validation = value + + @property + def y_validation(self) -> pd.Series: + """ + Test target. + """ + return self._y_validation + + @y_validation.setter + def y_validation(self, value): + if value is None: + raise ValueError("y_test cannot be None.") + if isinstance(value, pd.DataFrame): + if value.shape[1] == 1: + value = value.iloc[:, 0] + else: + raise ValueError("y_train must be a pandas Series.") + if not isinstance(value, pd.Series): + raise ValueError("y_train must be a pandas Series.") + self._y_validation = value + @property def metrics(self) -> dict: """ @@ -704,15 +774,26 @@ def _set_datasets_dtypes(self): str ) self.X_test.iloc[:, col_idx] = self.X_test.iloc[:, col_idx].astype(str) + self.X_validation.iloc[:, col_idx] = self.X_validation.iloc[ + :, col_idx + ].astype(str) def _generate_classification_metrics_with_threshold(self): """ Helper function to generate the classification metrics with different thresholds. 
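        For binary problems the resulting structure is keyed first by dataset
        ("train_score", "test_score", "validation_score") and then by threshold,
        swept from 0.01 to 1.00 in steps of 0.01. An illustrative entry (values
        are made up):

            self.metrics["validation_score"][0.5]
            # {"confusion_matrix": [[85, 10], [7, 98]], "accuracy": 0.915,
            #  "precision": 0.9074, "recall": 0.9333, "f1_score": 0.9202}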
""" - self.metrics = {"train_score": {}, "test_score": {}} + self.metrics = {"train_score": {}, "test_score": {}, "validation_score": {}} if self.num_classes == 2: - y_pred_train = self.model.predict_proba(self.X_train)[:, 1] - y_pred_test = self.model.predict_proba(self.X_test)[:, 1] + y_pred_train = self.model.predict_proba( + self.X_train[self._model_input_cols] + )[:, 1] + y_pred_test = self.model.predict_proba(self.X_test[self._model_input_cols])[ + :, 1 + ] + y_pred_validation = self.model.predict_proba( + self.X_validation[self._model_input_cols] + )[:, 1] + for threshold in [i / 100 for i in range(1, 101)]: self.metrics["train_score"][ threshold @@ -722,16 +803,27 @@ def _generate_classification_metrics_with_threshold(self): self.metrics["test_score"][threshold] = generate_metrics_classification( self.y_test, y_pred_test >= threshold ) + self.metrics["validation_score"][ + threshold + ] = generate_metrics_classification( + self.y_validation, y_pred_validation >= threshold + ) else: self.metrics = {} - y_pred_train = self.model.predict(self.X_train) - y_pred_test = self.model.predict(self.X_test) + y_pred_train = self.model.predict(self.X_train[self._model_input_cols]) + y_pred_test = self.model.predict(self.X_test[self._model_input_cols]) + y_pred_validation = self.model.predict( + self.X_validation[self._model_input_cols] + ) self.metrics["train_score"] = generate_metrics_classification( self.y_train, y_pred_train ) self.metrics["test_score"] = generate_metrics_classification( self.y_test, y_pred_test ) + self.metrics["validation_score"] = generate_metrics_classification( + self.y_validation, y_pred_validation + ) @staticmethod def _find_saving_parameters_from_structure(experiment_folder: str) -> dict: @@ -886,7 +978,7 @@ def _calc_precision_recall_curve_data(self): recall_list = [] best_distance = 9999 best_threshold = None - for threshold, metric in self.metrics["test_score"].items(): + for threshold, metric in self.metrics["validation_score"].items(): precision = metric["precision"] recall = metric["recall"] precision_list.append(precision) @@ -920,7 +1012,7 @@ def _calc_roc_curve_data(self): fpr_list = [] best_distance = 9999 best_threshold = None - for threshold, metric in self.metrics["test_score"].items(): + for threshold, metric in self.metrics["validation_score"].items(): (tn, fp), (fn, tp) = metric["confusion_matrix"] tpr = tp / (tp + fn) fpr = fp / (fp + tn) @@ -945,9 +1037,28 @@ def _init_metrics(self): :return: """ if self.problem_type == ProblemType.REGRESSION: - self.metrics = generate_metrics_regression( - self.y_test, self.model.predict(self.X_test) + # Metrics for the training set (optional, depending on your needs) + train_metrics = generate_metrics_regression( + self.y_train, self.model.predict(self.X_train[self._model_input_cols]) + ) + # Metrics for the test set + test_metrics = generate_metrics_regression( + self.y_test, self.model.predict(self.X_test[self._model_input_cols]) ) + + # Metrics for the validation set + validation_metrics = generate_metrics_regression( + self.y_validation, + self.model.predict(self.X_validation[self._model_input_cols]), + ) + + # Store metrics in a dictionary + self.metrics = { + "train": train_metrics, + "validation": validation_metrics, + "test": test_metrics, + } + elif self.problem_type == ProblemType.CLASSIFICATION: self._generate_classification_metrics_with_threshold() if self.num_classes == 2: @@ -989,9 +1100,10 @@ def get_feature_importance(self) -> pd.Series: return pd.Series(feature_importance, 
index=feature_names).sort_values( ascending=False ) - raise NotImplementedError( + logging.warning( f"Feature importance is not supported for model {self.base_model.__class__.__name__}" ) + return pd.Series(index=feature_names) def plot_roc_curve( self, show: bool = False @@ -1013,7 +1125,7 @@ def plot_roc_curve( ) fig, ax = plt.subplots(figsize=(15, 10)) # Scatter and show cmap legend - thresholds = list(self.metrics["test_score"].keys()) + thresholds = list(self.metrics["validation_score"].keys()) ax.scatter(self._fpr_list, self._tpr_list, c=thresholds, cmap="viridis") ax.set_title(f"ROC Curve, best threshold {self.best_threshold_roc_curve:.2f}") ax.set_xlabel("False Positive Rate") @@ -1059,7 +1171,7 @@ def plot_precision_recall_curve( ) fig, ax = plt.subplots(figsize=(15, 10)) # Scatter and show cmap legend - thresholds = list(self.metrics["test_score"].keys()) + thresholds = list(self.metrics["validation_score"].keys()) ax.scatter( self._recall_list, self._precision_list, c=thresholds, cmap="viridis" ) @@ -1110,6 +1222,37 @@ def plot_feature_importance( else: return fig, ax + def plot_probabilities_histogram( + self, show=False, bins=20, dataset: str = "test" + ) -> Optional[Tuple[plt.Figure, plt.Axes]]: + X = getattr(self, f"X_{dataset}") + y = getattr(self, f"y_{dataset}") + preds = self.get_probabilities(X, y) + fig, ax = plt.subplots(figsize=(20, 20)) + ax.hist( + preds[preds["target"] == 0]["prediction_probability"], + bins=bins, + alpha=0.5, + label="0", + density=True, + ) + ax.hist( + preds[preds["target"] == 1]["prediction_probability"], + bins=bins, + alpha=0.5, + label="1", + density=True, + ) + ax.set_title(f"Probabilities Histogram {dataset} set") + ax.set_xlabel("Probability") + ax.set_ylabel("Density (%)") + ax.legend() + fig.tight_layout() + if show: + plt.show() + else: + return fig, ax + def register_experiment( self, base_path, @@ -1136,7 +1279,7 @@ def register_experiment( "best_threshold": { "value": threshold, "train_score": self.metrics["train_score"][threshold], - "test_score": self.metrics["test_score"][threshold], + "test_score": self.metrics["validation_score"][threshold], }, **custom_metrics, } @@ -1146,6 +1289,8 @@ def register_experiment( self.y_train, self.X_test, self.y_test, + self.X_validation, + self.y_validation, description=self.description, custom_metrics=custom_metrics, base_path=base_path, @@ -1188,9 +1333,17 @@ def from_registered_experiment(cls, experiment_path: str): y_test = pd.read_csv( os.path.join(experiment_path, "datasets", "y_test.csv") ) + X_validation = pd.read_csv( + os.path.join(experiment_path, "datasets", "X_validation.csv"), + low_memory=False, + ) + y_validation = pd.read_csv( + os.path.join(experiment_path, "datasets", "y_validation.csv") + ) # Make sure is a pd.Series y_train = y_train.iloc[:, 0] y_test = y_test.iloc[:, 0] + y_validation = y_validation.iloc[:, 0] experiment = cls( name=summary.get("name", experiment_path.split("-", 1)[1].rstrip("/")), @@ -1201,6 +1354,8 @@ def from_registered_experiment(cls, experiment_path: str): y_train=y_train, X_test=X_test, y_test=y_test, + X_validation=X_validation, + y_validation=y_validation, ) except Exception as e: raise e @@ -1211,6 +1366,32 @@ def from_registered_experiment(cls, experiment_path: str): cls._zip_experiment_folder(experiment_path) return experiment + def get_probabilities(self, X=None, y=None): + if X is not None and y is not None: + return pd.DataFrame( + { + "prediction_probability": self.model.predict_proba( + X[self._model_input_cols] + )[:, 1], + "target": y, + 
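                    # Each row pairs the positive-class probability with the true
                    # label; plot_probabilities_histogram() splits this frame on
                    # "target" to draw the two per-class density histograms.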
} + ) + return pd.DataFrame( + { + "prediction_probability": self.model.predict_proba( + self.X_test[self._model_input_cols] + )[:, 1], + "target": self.y_test, + } + ) + + def _get_model_input_cols(self): + if self._is_pipeline: + # Assume first step is the column transformer and get feature names in + return self.model[0].feature_names_in_ + else: + return self.model.feature_names_in_ + class MLTracker: """ @@ -1303,6 +1484,7 @@ def add_experiment(self, experiment: MLExperiment): logging.info( f"Added experiment {exp_folder_name} to the tracker. Current experiments: {len(self._experiments)}." ) + return exp_folder_name def create_plots( self, show_plots: bool = False @@ -1317,7 +1499,33 @@ def create_plots( :return: figures and axes of the plots if show_plots is False. """ for experiment_name, experiment in self._experiments.items(): - if experiment.problem_type == ProblemType.CLASSIFICATION: + roc_exists = os.path.exists( + os.path.join(self.experiment_folder, experiment_name, "roc_curve.png") + ) + precision_recall_exists = os.path.exists( + os.path.join( + self.experiment_folder, experiment_name, "precision_recall.png" + ) + ) + prob_histogram_exists = os.path.exists( + os.path.join( + self.experiment_folder, + experiment_name, + "probabilities_histogram.png", + ) + ) + clf_plots_exist = ( + roc_exists and precision_recall_exists and prob_histogram_exists + ) + feature_importance_exists = os.path.exists( + os.path.join( + self.experiment_folder, experiment_name, "feature_importance.png" + ) + ) + if ( + experiment.problem_type == ProblemType.CLASSIFICATION + and not clf_plots_exist + ): fig, ax = experiment.plot_roc_curve() ax.set_title(experiment_name + "_" + ax.get_title()) fig.savefig( @@ -1325,6 +1533,8 @@ def create_plots( self.experiment_folder, experiment_name, "roc_curve.png" ) ) + if show_plots: + fig.show() fig, ax = experiment.plot_precision_recall_curve() ax.set_title(experiment_name + "_" + ax.get_title()) fig.savefig( @@ -1332,21 +1542,33 @@ def create_plots( self.experiment_folder, experiment_name, "precision_recall.png" ) ) + if show_plots: + fig.show() + fig, ax = experiment.plot_probabilities_histogram() + ax.set_title(experiment_name + "_" + ax.get_title()) + fig.savefig( + os.path.join( + self.experiment_folder, + experiment_name, + "probabilities_histogram.png", + ) + ) if show_plots: fig.show() plt.close() - fig, ax = experiment.plot_feature_importance() - ax.set_title(experiment_name + "_" + ax.get_title()) - fig.savefig( - os.path.join( - self.experiment_folder, experiment_name, "feature_importance.png" + if not feature_importance_exists: + fig, ax = experiment.plot_feature_importance() + ax.set_title(experiment_name + "_" + ax.get_title()) + fig.savefig( + os.path.join( + self.experiment_folder, + experiment_name, + "feature_importance.png", + ) ) - ) - if show_plots: - fig.show() - plt.close() - - return None + if show_plots: + fig.show() + plt.close() def update_experiments_metrics(self): """ @@ -1371,6 +1593,9 @@ def update_experiments_metrics(self): "best_threshold": { "value": threshold, "train_score": experiment.metrics["train_score"][threshold], + "validation_score": experiment.metrics["validation_score"][ + threshold + ], "test_score": experiment.metrics["test_score"][threshold], }, **custom_metrics, @@ -1401,6 +1626,9 @@ def create_comparison_df(self, save: bool = True) -> pd.DataFrame: "train_score": metrics["train_score"][ experiment.best_threshold_pr_curve ], + "validation_score": metrics["validation_score"][ + experiment.best_threshold_pr_curve + 
], "test_score": metrics["test_score"][ experiment.best_threshold_pr_curve ], @@ -1413,6 +1641,7 @@ def create_comparison_df(self, save: bool = True) -> pd.DataFrame: else: metrics = { "train_score": metrics["train_score"], + "validation_score": metrics["validation_score"], "test_score": metrics["test_score"], } row_index.append( @@ -1420,18 +1649,29 @@ def create_comparison_df(self, save: bool = True) -> pd.DataFrame: ) metrics_row.append(metrics) # Make a dataframe with multilevel column for the train and test scores which are dictionaries. - df = pd.DataFrame(row_index).drop(columns=["train_score", "test_score"]) + df = pd.DataFrame(row_index).drop( + columns=["train_score", "validation_score", "test_score"] + ) metrics_train = pd.DataFrame([row["train_score"] for row in metrics_row]) + metrics_validation = pd.DataFrame( + [row["validation_score"] for row in metrics_row] + ) metrics_test = pd.DataFrame([row["test_score"] for row in metrics_row]) # Concatenate the dataframes in a way that one from train next from test and so on. metrics = pd.DataFrame() for col in metrics_train.columns: metrics = pd.concat( - [metrics, metrics_train[col], metrics_test[col]], axis=1 + [ + metrics, + metrics_train[col], + metrics_validation[col], + metrics_test[col], + ], + axis=1, ).copy() metrics.columns = pd.MultiIndex.from_product( - [metrics_train.columns, ["train", "test"]] + [metrics_train.columns, ["train", "validation", "test"]] ) df = pd.concat([df, metrics], axis=1) # Set multilevel index From 6a297c9b21ece2755f3051a2aed92fcf120efa36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?AntonioGonz=C3=A1lez?= Date: Tue, 6 Feb 2024 15:42:51 +0100 Subject: [PATCH 10/14] Make all metrics round to 4 decimals --- mango/models/metrics.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mango/models/metrics.py b/mango/models/metrics.py index 1f61a807..6bc869d4 100644 --- a/mango/models/metrics.py +++ b/mango/models/metrics.py @@ -26,7 +26,7 @@ def r2_score(y_true: pd.Series, y_pred: pd.Series) -> float: mean_y_true = y_true.mean() ss_tot = ((y_true - mean_y_true) ** 2).sum() ss_res = ((y_true - y_pred) ** 2).sum() - return 1 - ss_res / ss_tot + return round((1 - ss_res / ss_tot), 4) def mean_absolute_error(y_true: pd.Series, y_pred: pd.Series) -> float: @@ -117,7 +117,7 @@ def confusion_matrix(y_true: pd.Series, y_pred: pd.Series) -> np.ndarray: >>> y_pred = pd.Series([0, 0, 1, 1]) >>> confusion_matrix(y_true, y_pred) array([[1, 1], - [1, 1]]) + [1, 1]], dtype=int64) """ return pd.crosstab(y_true, y_pred).to_numpy() @@ -298,6 +298,8 @@ def generate_metrics_classification( "precision": round(precision_score(y_true, y_pred), 4), "recall": round(recall_score(y_true, y_pred), 4), "f1_score": round(f1_score(y_true, y_pred), 4), + "tpr": round(tp / (tp + fn), 4), + "fpr": round(fp / (fp + tn), 4), } else: return { From 26b738887c93294735eaf96539077833b5131617 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?AntonioGonz=C3=A1lez?= Date: Wed, 14 Feb 2024 09:51:53 +0100 Subject: [PATCH 11/14] Added predict methods --- mango/models/experiment_tracking.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/mango/models/experiment_tracking.py b/mango/models/experiment_tracking.py index 10197dcb..1f4a6ddf 100644 --- a/mango/models/experiment_tracking.py +++ b/mango/models/experiment_tracking.py @@ -1385,6 +1385,30 @@ def get_probabilities(self, X=None, y=None): } ) + def predict(self, X): + X = self._prepare_dataset_for_prediction(X.copy()) + return self.model.predict(X) + + def 
predict_proba(self, X, threshold=None): + if self.problem_type == ProblemType.REGRESSION: + raise ValueError("predict_proba is only for classification problems.") + if threshold is None: + threshold = ( + self.best_threshold_pr_curve + if self.imbalance + else self.best_threshold_roc_curve + ) + X = self._prepare_dataset_for_prediction(X.copy()) + return self.model.predict_proba(X, threshold=threshold) + + def _prepare_dataset_for_prediction(self, X): + if self.base_model_library == ModelLibrary.CATBOOST: + for col_idx in self.base_model.get_param("cat_features") or []: + X.iloc[:, col_idx] = X.iloc[:, col_idx].astype(str) + + # Select only the columns that were used in the training + return X[self._model_input_cols] + def _get_model_input_cols(self): if self._is_pipeline: # Assume first step is the column transformer and get feature names in From 37932b29d424f1d6cfe43307e7d081cf4623325f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?AntonioGonz=C3=A1lez?= Date: Wed, 14 Feb 2024 10:13:35 +0100 Subject: [PATCH 12/14] Fix small issues in predict --- mango/models/experiment_tracking.py | 39 ++++++++++++++++------------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/mango/models/experiment_tracking.py b/mango/models/experiment_tracking.py index 1f4a6ddf..0fd2af8b 100644 --- a/mango/models/experiment_tracking.py +++ b/mango/models/experiment_tracking.py @@ -496,7 +496,7 @@ def __init__( self._precision_list = None self._recall_list = None self._config = None - self._is_pipeline = isinstance(self.model, self.pipeline_class) + self._is_pipeline = isinstance(self.model, self._pipeline_class) self._model_input_cols = self._get_model_input_cols() # Final Setup @@ -741,15 +741,15 @@ def _search_for_supported_libraries(self): from sklearn.pipeline import Pipeline from sklearn.linear_model import LogisticRegression, LinearRegression - self.pipeline_class = Pipeline - self.sklearn_linear_regression_class = LinearRegression - self.sklearn_logistic_regression_class = LogisticRegression + self._pipeline_class = Pipeline + self._sklearn_linear_regression_class = LinearRegression + self._sklearn_logistic_regression_class = LogisticRegression self._SUPPORTED_LIBRARIES_CLASSES[ModelLibrary.SCIKIT_LEARN] = BaseEstimator except ImportError: - self.pipeline_class = _DummyPipeline - self.sklearn_linear_regression_class = _DummyLinearRegression - self.sklearn_logistic_regression_class = _DummyLogisticRegression + self._pipeline_class = _DummyPipeline + self._sklearn_linear_regression_class = _DummyLinearRegression + self._sklearn_logistic_regression_class = _DummyLogisticRegression try: from catboost import CatBoost @@ -1076,8 +1076,8 @@ def get_feature_importance(self) -> pd.Series: is_linear_model = isinstance( self.base_model, ( - self.sklearn_linear_regression_class, - self.sklearn_logistic_regression_class, + self._sklearn_linear_regression_class, + self._sklearn_logistic_regression_class, ), ) @@ -1385,21 +1385,24 @@ def get_probabilities(self, X=None, y=None): } ) - def predict(self, X): + def predict(self, X, threshold=None): X = self._prepare_dataset_for_prediction(X.copy()) + if self.problem_type == ProblemType.CLASSIFICATION: + if threshold is not None: + return self.model.predict_proba(X)[:, 1] >= threshold + else: + return ( + self.model.predict(X)[:, 1] >= self.best_threshold_pr_curve + if self.imbalance + else self.best_threshold_roc_curve + ) return self.model.predict(X) - def predict_proba(self, X, threshold=None): + def predict_proba(self, X): if self.problem_type == 
ProblemType.REGRESSION: raise ValueError("predict_proba is only for classification problems.") - if threshold is None: - threshold = ( - self.best_threshold_pr_curve - if self.imbalance - else self.best_threshold_roc_curve - ) X = self._prepare_dataset_for_prediction(X.copy()) - return self.model.predict_proba(X, threshold=threshold) + return self.model.predict_proba(X) def _prepare_dataset_for_prediction(self, X): if self.base_model_library == ModelLibrary.CATBOOST: From 8a05e530112991659b02a080f6466994fd8f670e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?AntonioGonz=C3=A1lez?= Date: Wed, 24 Jul 2024 12:12:04 +0200 Subject: [PATCH 13/14] Fixed issues on testing --- mango/models/experiment_tracking.py | 750 ++++++++---------- .../models_module/test_experiment_tracking.py | 454 ++++++----- requirements-dev.txt | 1 + 3 files changed, 593 insertions(+), 612 deletions(-) diff --git a/mango/models/experiment_tracking.py b/mango/models/experiment_tracking.py index 0fd2af8b..03a70179 100644 --- a/mango/models/experiment_tracking.py +++ b/mango/models/experiment_tracking.py @@ -8,15 +8,13 @@ import pandas as pd from matplotlib import pyplot as plt +from pandas.testing import assert_frame_equal, assert_series_equal from .enums import ProblemType, ModelLibrary from .metrics import ( generate_metrics_regression, generate_metrics_classification, ) -from mango.config import BaseConfig - -from pandas.testing import assert_frame_equal, assert_series_equal class _DummyPipeline: @@ -31,6 +29,35 @@ class _DummyLogisticRegression: pass +_SUPPORTED_LIBRARIES_CLASSES = {} +try: + from sklearn.base import BaseEstimator + from sklearn.pipeline import Pipeline + from sklearn.linear_model import LogisticRegression, LinearRegression + + _PIPELINE_CLASS = Pipeline + _SKLEARN_LINEAR_REGRESSION_CLASS = LinearRegression + _SKLEARN_LOGISTIC_REGRESSION_CLASS = LogisticRegression + + _SUPPORTED_LIBRARIES_CLASSES[ModelLibrary.SCIKIT_LEARN] = BaseEstimator +except ImportError: + _PIPELINE_CLASS = _DummyPipeline + _SKLEARN_LINEAR_REGRESSION_CLASS = _DummyLinearRegression + _SKLEARN_LOGISTIC_REGRESSION_CLASS = _DummyLogisticRegression +try: + from catboost import CatBoost + + _SUPPORTED_LIBRARIES_CLASSES[ModelLibrary.CATBOOST] = CatBoost +except ImportError: + pass +try: + from lightgbm import LGBMModel + + _SUPPORTED_LIBRARIES_CLASSES[ModelLibrary.LIGHTGBM] = LGBMModel +except ImportError: + pass + + def _json_serializable(value: Any) -> bool: try: json.dumps(value) @@ -39,333 +66,16 @@ def _json_serializable(value: Any) -> bool: return False -def _clean_hyperparameters(hyperparameters: dict) -> dict: +def _clean_json(hyperparameters: Union[dict, Any]) -> dict: + final_hyperparameters = {} for key, value in hyperparameters.items(): if isinstance(value, dict): - _clean_hyperparameters(value) + final_hyperparameters[key] = _clean_json(value) elif not _json_serializable(value): - hyperparameters[key] = str(value) - return hyperparameters - - -def export_model( - model: Any, - X_train: pd.DataFrame, - y_train: pd.Series, - X_test: pd.DataFrame, - y_test: pd.Series, - X_validation: pd.DataFrame, - y_validation: pd.Series, - base_path: str, - custom_metrics: dict = None, - description: str = None, - base_folder_name: str = None, - save_model: bool = True, - save_datasets: bool = False, - zip_files: bool = True, -) -> str: - """ - Register model and metrics in a json file and save the model and datasets in a folder. - - :param model: A model from one of the supported libraries. 
- :type model: :class:`Any` - :param X_train: Training data as a pandas dataframe. - :type X_train: :class:`pandas.DataFrame` - :param y_train: Training target as a pandas series. - :type y_train: :class:`pandas.Series` - :param X_test: Test data as a pandas dataframe. - :type X_test: :class:`pandas.DataFrame` - :param y_test: Test target as a pandas series. - :type y_test: :class:`pandas.Series` - :param X_validation: Validation data as a pandas dataframe. - :type X_validation: :class:`pandas.DataFrame` - :param y_validation: Validation target as a pandas series. - :type y_validation: :class:`pandas.Series` - :param description: Description of the experiment. - :type description: :class:`str` - :param base_path: Path to the base folder where the model and datasets will be saved in a subfolder structure. - :type base_path: :class:`str` - :param base_folder_name: Custom name for the folder where the model and datasets will be saved. - :type base_folder_name: :class:`str` - :param zip_files: Whether to zip the files or not. - :type zip_files: :class:`bool` - :param save_datasets: Whether to save the datasets or not. - :type save_datasets: :class:`bool` - :param save_model: Whether to save the model or not. - :type save_model: :class:`bool` - :return: The path to the subfolder inside base_path where the model and datasets have been saved. - :rtype: :class:`str` - - Usage - ----- - >>> from sklearn.datasets import fetch_california_housing - >>> from sklearn.linear_model import LogisticRegression - >>> from sklearn.model_selection import train_test_split - >>> X, y = fetch_california_housing(return_X_y=True, as_frame=True) - >>> X_train, X_test, y_train, y_test = train_test_split(X, y) - >>> model = LogisticRegression() - >>> model.fit(X_train, y_train) - >>> output_folder = export_model(model, X_train, y_train, X_test, y_test, "/my_experiments_folder") - >>> print(output_folder) # /my_experiments_folder/experiment_LogisticRegression_YYYYMMDD-HHMMSS - """ - _SUPPORTED_LIBRARIES_CLASSES = {} - try: - from sklearn.base import BaseEstimator - from sklearn.pipeline import Pipeline - - pipeline_class = Pipeline - - _SUPPORTED_LIBRARIES_CLASSES[ModelLibrary.SCIKIT_LEARN] = BaseEstimator - except ImportError: - pipeline_class = _DummyPipeline - try: - from catboost import CatBoost - - _SUPPORTED_LIBRARIES_CLASSES[ModelLibrary.CATBOOST] = CatBoost - except ImportError: - pass - try: - from lightgbm import LGBMModel - - _SUPPORTED_LIBRARIES_CLASSES[ModelLibrary.LIGHTGBM] = LGBMModel - except ImportError: - pass - - if not os.path.exists(base_path): - raise FileNotFoundError(f"Folder {base_path} does not exist.") - - model_name = model.__class__.__name__ - - if isinstance(model, pipeline_class): - pipeline = model - col_transformer = model[0] - model = model[-1] - else: - pipeline = None - col_transformer = None - model_library = None - for library, class_name in _SUPPORTED_LIBRARIES_CLASSES.items(): - if isinstance(model, class_name): - model_library = library - if model_library is None: - raise ValueError(f"Model {model_name} is not supported.") - - # Detect if it is a classification or regression model - if hasattr(model, "predict_proba"): - problem_type = ProblemType.CLASSIFICATION - else: - problem_type = ProblemType.REGRESSION - summary = {} - extra_params = [] - # Fill structure - summary["description"] = description - summary["name"] = base_folder_name or model_name - summary["training_date"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - summary["model"] = {} - summary["model"]["name"] = 
model_name - summary["model"]["problem_type"] = problem_type.value - summary["model"]["target"] = y_train.name - summary["model"]["library"] = model_library.value - if model_library == ModelLibrary.CATBOOST: - if pipeline is not None: - summary["model"]["input"] = list(col_transformer.feature_names_in_) - summary["model"]["hyperparameters"] = pipeline.get_params(deep=True) - else: - summary["model"]["hyperparameters"] = model.get_all_params() - summary["model"]["input"] = list(model.feature_names_) - - elif model_library == ModelLibrary.SCIKIT_LEARN: - if pipeline is not None: - summary["model"]["input"] = list(col_transformer.feature_names_in_) - summary["model"]["hyperparameters"] = pipeline.get_params(deep=True) - else: - summary["model"]["input"] = list(model.feature_names_in_) - summary["model"]["hyperparameters"] = model.get_params(deep=True) - elif model_library == ModelLibrary.LIGHTGBM: - if pipeline is not None: - summary["model"]["input"] = list(col_transformer.feature_names_in_) - summary["model"]["hyperparameters"] = pipeline.get_params(deep=True) - else: - summary["model"]["input"] = list(model.feature_name_) - summary["model"]["hyperparameters"] = model.get_params(deep=True) - - # Clean hyperparameters for the sklearn pipeline or other non-serializable objects - _clean_hyperparameters(summary["model"]["hyperparameters"]) - - # Sort keys in summary["model"] - if problem_type == ProblemType.CLASSIFICATION: - summary["model"]["num_classes"] = len(y_train.unique()) - # Sort keys in summary["model"] to be: name, problem_type, num_classes, input, target, hyperparameters, library - summary["model"] = { - k: summary["model"][k] - for k in [ - "name", - "problem_type", - "num_classes", - "input", - "target", - "hyperparameters", - "library", - ] - } - else: - # Sort keys in summary["model"] to be: name, problem_type, input, target, hyperparameters, library - summary["model"] = { - k: summary["model"][k] - for k in [ - "name", - "problem_type", - "input", - "target", - "hyperparameters", - "library", - ] - } - - # Restore pipeline to model variable - if pipeline: - model = pipeline - - # Generate metrics - if model_library == ModelLibrary.CATBOOST: - y_train_pred = pd.Series(model.predict(X_train).reshape(-1)).reset_index( - drop=True - ) - y_test_pred = pd.Series(model.predict(X_test).reshape(-1)).reset_index( - drop=True - ) - y_validation_pred = pd.Series( - model.predict(X_validation).reshape(-1) - ).reset_index(drop=True) - elif model_library in [ModelLibrary.SCIKIT_LEARN, ModelLibrary.LIGHTGBM]: - y_train_pred = pd.Series(model.predict(X_train)).reset_index(drop=True) - y_test_pred = pd.Series(model.predict(X_test)).reset_index(drop=True) - y_validation_pred = pd.Series(model.predict(X_validation)).reset_index( - drop=True - ) - - if problem_type == ProblemType.CLASSIFICATION: - if not custom_metrics: - summary["results"] = { - "train_score": generate_metrics_classification( - y_train.reset_index(drop=True), y_train_pred - ), - "test_score": generate_metrics_classification( - y_test.reset_index(drop=True), y_test_pred - ), - "validation_score": generate_metrics_classification( - y_validation.reset_index(drop=True), y_validation_pred - ), - } + final_hyperparameters[key] = str(value) else: - summary["results"] = custom_metrics - elif problem_type == ProblemType.REGRESSION: - summary["results"] = { - "train_score": generate_metrics_regression( - y_train.reset_index(drop=True), y_train_pred - ), - "test_score": generate_metrics_regression( - y_test.reset_index(drop=True), 
y_test_pred - ), - "validation_score": generate_metrics_regression( - y_validation.reset_index(drop=True), y_validation_pred - ), - } - - # Prepare environment to save files - folder_name_default = ( - f"{datetime.now().strftime('%Y%m%d-%H%M%S')}_experiment_{model_name}" - ) - folder_name = base_folder_name or folder_name_default - folder_name = os.path.join( - base_path, f"{datetime.now().strftime('%Y%m%d-%H%M%S')}_{folder_name}" - ) - - # Compress model and save - if save_model: - os.makedirs(os.path.join(folder_name, "model")) - if not "files" in summary: - summary["files"] = {} - if not "model" in summary["files"]: - summary["files"]["model"] = {} - # Save hyperparameters - hyperparameters_path = os.path.join( - folder_name, "model", "hyperparameters.json" - ) - summary["files"]["model"]["hyperparameters.json"] = os.path.abspath( - hyperparameters_path - ) - with open(hyperparameters_path, "w") as f: - json.dump(summary["model"]["hyperparameters"], f, indent=4) - # Save the model - model_path = os.path.join(folder_name, "model", "model.pkl") - summary["files"]["model"]["model.pkl"] = os.path.abspath(model_path) - with open(model_path, "wb") as f: - pickle.dump(model, f) - if zip_files: - zip_path = os.path.join(folder_name, "model.zip") - summary["files"]["model"]["zip"] = os.path.abspath(zip_path) - shutil.make_archive( - zip_path.rstrip(".zip"), "zip", os.path.join(folder_name, "model") - ) - shutil.rmtree(os.path.join(folder_name, "model")) - - if save_datasets: - os.makedirs(os.path.join(folder_name, "datasets")) - if not "files" in summary: - summary["files"] = {} - if not "datasets" in summary["files"]: - summary["files"]["datasets"] = {} - X_train_path = os.path.join(folder_name, "datasets", "X_train.csv") - summary["files"]["datasets"]["X_train"] = {} - summary["files"]["datasets"]["X_train"]["path"] = os.path.abspath(X_train_path) - summary["files"]["datasets"]["X_train"]["shape"] = X_train.shape - X_train.to_csv(X_train_path, index=False) - y_train_path = os.path.join(folder_name, "datasets", "y_train.csv") - summary["files"]["datasets"]["y_train"] = {} - summary["files"]["datasets"]["y_train"]["path"] = os.path.abspath(y_train_path) - summary["files"]["datasets"]["y_train"]["shape"] = y_train.shape - y_train.to_csv(y_train_path, index=False) - X_test_path = os.path.join(folder_name, "datasets", "X_test.csv") - summary["files"]["datasets"]["X_test"] = {} - summary["files"]["datasets"]["X_test"]["path"] = os.path.abspath(X_test_path) - summary["files"]["datasets"]["X_test"]["shape"] = X_test.shape - X_test.to_csv(X_test_path, index=False) - y_test_path = os.path.join(folder_name, "datasets", "y_test.csv") - summary["files"]["datasets"]["y_test"] = {} - summary["files"]["datasets"]["y_test"]["path"] = os.path.abspath(y_test_path) - summary["files"]["datasets"]["y_test"]["shape"] = y_test.shape - y_test.to_csv(y_test_path, index=False) - X_validation_path = os.path.join(folder_name, "datasets", "X_validation.csv") - summary["files"]["datasets"]["X_validation"] = {} - summary["files"]["datasets"]["X_validation"]["path"] = os.path.abspath( - X_validation_path - ) - summary["files"]["datasets"]["X_validation"]["shape"] = X_validation.shape - X_validation.to_csv(X_validation_path, index=False) - y_validation_path = os.path.join(folder_name, "datasets", "y_validation.csv") - summary["files"]["datasets"]["y_validation"] = {} - summary["files"]["datasets"]["y_validation"]["path"] = os.path.abspath( - y_validation_path - ) - summary["files"]["datasets"]["y_validation"]["shape"] = 
y_validation.shape - y_validation.to_csv(y_validation_path, index=False) - if zip_files: - # Compress data and save - zip_path = os.path.join(folder_name, "datasets.zip") - summary["files"]["datasets"]["zip"] = {} - summary["files"]["datasets"]["zip"]["path"] = os.path.abspath(zip_path) - shutil.make_archive( - zip_path.rstrip(".zip"), "zip", os.path.join(folder_name, "datasets") - ) - shutil.rmtree(os.path.join(folder_name, "datasets")) - - # Save json - json_path = os.path.join(folder_name, "summary.json") - with open(json_path, "w", encoding="utf-8") as f: - json.dump(summary, f, indent=4, ensure_ascii=False) - - return folder_name + final_hyperparameters[key] = value + return final_hyperparameters class MLExperiment: @@ -417,14 +127,12 @@ class MLExperiment: def __init__( self, - *, - config: BaseConfig = None, - X_train: Optional[pd.DataFrame] = None, - y_train: Optional[pd.Series] = None, - X_test: Optional[pd.DataFrame] = None, - y_test: Optional[pd.Series] = None, - X_validation: Optional[pd.DataFrame] = None, - y_validation: Optional[pd.Series] = None, + X_train: pd.DataFrame, + y_train: pd.Series, + X_test: pd.DataFrame, + y_test: pd.Series, + X_validation: pd.DataFrame, + y_validation: pd.Series, model: Any = None, problem_type: Union[str, ProblemType] = None, name: str = None, @@ -433,16 +141,16 @@ def __init__( """ Initializes an instance of the MLExperiment class. - :param config: Configuration for the experiment. Not implemented yet. - :type config: :class:`BaseConfig`, optional :param X_train: Training data. - :type X_train: :class:`pd.DataFrame`, optional + :type X_train: :class:`pd.DataFrame` :param y_train: Training target. - :type y_train: :class:`pd.Series`, optional + :type y_train: :class:`pd.Series` :param X_test: Test data. - :type X_test: :class:`pd.DataFrame`, optional + :type X_test: :class:`pd.DataFrame` :param y_test: Test target. - :type y_test: :class:`pd.Series`, optional + :type y_test: :class:`pd.Series` + :param X_validation: Validation data. + :type X_validation: :class:`pd.DataFrame` :param model: A model from one of the supported libraries. :type model: Any, optional :param problem_type: Type of the problem (classification or regression). @@ -451,20 +159,13 @@ def __init__( :type name: str, optional :param description: Description of the experiment. :type description: str, optional - - :raises NotImplementedError: If the config parameter is provided, as it's not implemented yet. 
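        A minimal construction sketch (illustrative; the dataframes, series and the
        fitted model are assumed to exist already):

        >>> exp = MLExperiment(
        ...     X_train=X_train, y_train=y_train,
        ...     X_test=X_test, y_test=y_test,
        ...     X_validation=X_validation, y_validation=y_validation,
        ...     model=model,
        ...     problem_type="classification",
        ...     name="baseline",
        ...     description="Baseline experiment",
        ... )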
""" - # For this version not implement config setup of the experiment - if config: - raise NotImplementedError("Config usage is not implemented yet.") - - # Search for supported libraries - self._search_for_supported_libraries() # Public properties (Not defined in the if config block) self.name = name self.description = description self.problem_type = problem_type + self.base_model_name = model.__class__.__name__ self.model = model self.base_model = None self.num_classes = None @@ -496,11 +197,11 @@ def __init__( self._precision_list = None self._recall_list = None self._config = None - self._is_pipeline = isinstance(self.model, self._pipeline_class) - self._model_input_cols = self._get_model_input_cols() + self._is_pipeline = isinstance(self.model, _PIPELINE_CLASS) + self._set_base_model_and_library() # Final Setup - self._set_base_model_and_library() + self._model_input_cols = self.get_model_input_cols() self._set_datasets_dtypes() self._init_metrics() @@ -510,6 +211,8 @@ def __eq__(self, other): assert_series_equal(self.y_train, other.y_train, check_dtype=False) assert_frame_equal(self.X_test, other.X_test, check_dtype=False) assert_series_equal(self.y_test, other.y_test, check_dtype=False) + assert_frame_equal(self.X_validation, other.X_validation, check_dtype=False) + assert_series_equal(self.y_validation, other.y_validation, check_dtype=False) return ( self.name == other.name and self.description == other.description @@ -619,9 +322,9 @@ def y_test(self, value): if value.shape[1] == 1: value = value.iloc[:, 0] else: - raise ValueError("y_train must be a pandas Series.") + raise ValueError("y_test must be a pandas Series.") if not isinstance(value, pd.Series): - raise ValueError("y_train must be a pandas Series.") + raise ValueError("y_test must be a pandas Series.") self._y_test = value @property @@ -634,7 +337,7 @@ def X_validation(self) -> pd.DataFrame: @X_validation.setter def X_validation(self, value): if value is None: - raise ValueError("X_test cannot be None.") + raise ValueError("X_validation cannot be None.") self._X_validation = value @property @@ -647,14 +350,14 @@ def y_validation(self) -> pd.Series: @y_validation.setter def y_validation(self, value): if value is None: - raise ValueError("y_test cannot be None.") + raise ValueError("y_validation cannot be None.") if isinstance(value, pd.DataFrame): if value.shape[1] == 1: value = value.iloc[:, 0] else: - raise ValueError("y_train must be a pandas Series.") + raise ValueError("y_validation must be a pandas Series.") if not isinstance(value, pd.Series): - raise ValueError("y_train must be a pandas Series.") + raise ValueError("y_validation must be a pandas Series.") self._y_validation = value @property @@ -731,37 +434,6 @@ def imbalance(self, value): self._imbalance = value # Utility methods - def _search_for_supported_libraries(self): - """ - Search if libraries are installed and lazy import them. 
- """ - self._SUPPORTED_LIBRARIES_CLASSES = {} - try: - from sklearn.base import BaseEstimator - from sklearn.pipeline import Pipeline - from sklearn.linear_model import LogisticRegression, LinearRegression - - self._pipeline_class = Pipeline - self._sklearn_linear_regression_class = LinearRegression - self._sklearn_logistic_regression_class = LogisticRegression - - self._SUPPORTED_LIBRARIES_CLASSES[ModelLibrary.SCIKIT_LEARN] = BaseEstimator - except ImportError: - self._pipeline_class = _DummyPipeline - self._sklearn_linear_regression_class = _DummyLinearRegression - self._sklearn_logistic_regression_class = _DummyLogisticRegression - try: - from catboost import CatBoost - - self._SUPPORTED_LIBRARIES_CLASSES[ModelLibrary.CATBOOST] = CatBoost - except ImportError: - pass - try: - from lightgbm import LGBMModel - - self._SUPPORTED_LIBRARIES_CLASSES[ModelLibrary.LIGHTGBM] = LGBMModel - except ImportError: - pass def _set_datasets_dtypes(self): """ @@ -810,11 +482,9 @@ def _generate_classification_metrics_with_threshold(self): ) else: self.metrics = {} - y_pred_train = self.model.predict(self.X_train[self._model_input_cols]) - y_pred_test = self.model.predict(self.X_test[self._model_input_cols]) - y_pred_validation = self.model.predict( - self.X_validation[self._model_input_cols] - ) + y_pred_train = self.predict(self.X_train[self._model_input_cols]) + y_pred_test = self.predict(self.X_test[self._model_input_cols]) + y_pred_validation = self.predict(self.X_validation[self._model_input_cols]) self.metrics["train_score"] = generate_metrics_classification( self.y_train, y_pred_train ) @@ -938,7 +608,7 @@ def _set_base_model_and_library(self): # Get the library matching_libraries = [] - for library, class_name in self._SUPPORTED_LIBRARIES_CLASSES.items(): + for library, class_name in _SUPPORTED_LIBRARIES_CLASSES.items(): if isinstance(model, class_name): matching_libraries.append(library) # Some models inherit from sklearn hence if len(matching_libraries) > 1 and sklearn is one of them pop it @@ -1039,17 +709,17 @@ def _init_metrics(self): if self.problem_type == ProblemType.REGRESSION: # Metrics for the training set (optional, depending on your needs) train_metrics = generate_metrics_regression( - self.y_train, self.model.predict(self.X_train[self._model_input_cols]) + self.y_train, self.predict(self.X_train[self._model_input_cols]) ) # Metrics for the test set test_metrics = generate_metrics_regression( - self.y_test, self.model.predict(self.X_test[self._model_input_cols]) + self.y_test, self.predict(self.X_test[self._model_input_cols]) ) # Metrics for the validation set validation_metrics = generate_metrics_regression( self.y_validation, - self.model.predict(self.X_validation[self._model_input_cols]), + self.predict(self.X_validation[self._model_input_cols]), ) # Store metrics in a dictionary @@ -1076,8 +746,8 @@ def get_feature_importance(self) -> pd.Series: is_linear_model = isinstance( self.base_model, ( - self._sklearn_linear_regression_class, - self._sklearn_logistic_regression_class, + _SKLEARN_LINEAR_REGRESSION_CLASS, + _SKLEARN_LOGISTIC_REGRESSION_CLASS, ), ) @@ -1284,16 +954,9 @@ def register_experiment( **custom_metrics, } return export_model( - self.model, - self.X_train, - self.y_train, - self.X_test, - self.y_test, - self.X_validation, - self.y_validation, - description=self.description, - custom_metrics=custom_metrics, + self, base_path=base_path, + custom_metrics=custom_metrics, base_folder_name=self.name, save_model=True, save_datasets=True, @@ -1390,13 +1053,9 @@ def 
predict(self, X, threshold=None): if self.problem_type == ProblemType.CLASSIFICATION: if threshold is not None: return self.model.predict_proba(X)[:, 1] >= threshold - else: - return ( - self.model.predict(X)[:, 1] >= self.best_threshold_pr_curve - if self.imbalance - else self.best_threshold_roc_curve - ) - return self.model.predict(X) + preds = self.model.predict(X) + # Flatten the array + return preds.flatten() def predict_proba(self, X): if self.problem_type == ProblemType.REGRESSION: @@ -1412,12 +1071,30 @@ def _prepare_dataset_for_prediction(self, X): # Select only the columns that were used in the training return X[self._model_input_cols] - def _get_model_input_cols(self): + def get_model_input_cols(self): if self._is_pipeline: # Assume first step is the column transformer and get feature names in return self.model[0].feature_names_in_ - else: + + # For each library, get the feature names + if self.base_model_library == ModelLibrary.SCIKIT_LEARN: return self.model.feature_names_in_ + elif self.base_model_library == ModelLibrary.CATBOOST: + return self.model.feature_names_ + elif self.base_model_library == ModelLibrary.LIGHTGBM: + return self.model.feature_name_ + + def get_model_hyperparameters(self): + if self._is_pipeline: + return self.model.get_params() + + if self.base_model_library == ModelLibrary.CATBOOST: + return self.model.get_all_params() + elif self.base_model_library in [ + ModelLibrary.SCIKIT_LEARN, + ModelLibrary.LIGHTGBM, + ]: + return self.model.get_params(deep=True) class MLTracker: @@ -1486,7 +1163,7 @@ def scan_for_experiments(self): exp = MLExperiment.from_registered_experiment( os.path.join(self.experiment_folder, experiments_folders) ) - if not experiments_folders in self._experiments: + if experiments_folders not in self._experiments: self._experiments[experiments_folders] = exp else: logging.warning( @@ -1744,3 +1421,240 @@ def create_hyperparameters_json(self, save: bool = True) -> dict: ) as f: json.dump(hyperparameters, f, indent=4, ensure_ascii=False) return hyperparameters + + +def export_model( + ml_experiment: MLExperiment, + base_path: str, + custom_metrics: dict = None, + base_folder_name: str = None, + save_model: bool = True, + save_datasets: bool = False, + zip_files: bool = True, +) -> str: + """ + Register model and metrics in a json file and save the model and datasets in a folder. + + :param ml_experiment: An MLExperiment instance. + :type ml_experiment: :class:`MLExperiment` + :param description: Description of the experiment. + :type description: :class:`str` + :param base_path: Path to the base folder where the model and datasets will be saved in a subfolder structure. + :type base_path: :class:`str` + :param base_folder_name: Custom name for the folder where the model and datasets will be saved. + :type base_folder_name: :class:`str` + :param zip_files: Whether to zip the files or not. + :type zip_files: :class:`bool` + :param save_datasets: Whether to save the datasets or not. + :type save_datasets: :class:`bool` + :param save_model: Whether to save the model or not. + :type save_model: :class:`bool` + :return: The path to the subfolder inside base_path where the model and datasets have been saved. 
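        The returned folder typically holds a summary.json together with either
        model/ and datasets/ subfolders or their zipped counterparts model.zip and
        datasets.zip, depending on save_model, save_datasets and zip_files. An
        illustrative way to read the stored metrics back (output_folder being the
        path this function returns):

            with open(os.path.join(output_folder, "summary.json")) as f:
                summary = json.load(f)
            summary["results"]["validation_score"]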
+ :rtype: :class:`str` + + Usage + ----- + >>> from sklearn.datasets import fetch_california_housing + >>> from sklearn.linear_model import LogisticRegression + >>> from sklearn.model_selection import train_test_split + >>> X, y = fetch_california_housing(return_X_y=True, as_frame=True) + >>> X_train, X_test, y_train, y_test = train_test_split(X, y) + >>> model = LogisticRegression() + >>> ml_experiment.fit(X_train, y_train) + >>> output_folder = export_model(model,X_train,y_train,X_test,y_test,"/my_experiments_folder") + >>> print(output_folder) # /my_experiments_folder/experiment_LogisticRegression_YYYYMMDD-HHMMSS + """ + + if not os.path.exists(base_path): + raise FileNotFoundError(f"Folder {base_path} does not exist.") + + if base_folder_name is None: + base_folder_name = ml_experiment.name + + # Detect if it is a classification or regression model + if hasattr(ml_experiment, "predict_proba"): + problem_type = ProblemType.CLASSIFICATION + else: + problem_type = ProblemType.REGRESSION + summary = {} + # Fill structure + summary["description"] = ml_experiment.description + summary["name"] = base_folder_name + summary["training_date"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + summary["model"] = {} + summary["model"]["name"] = ml_experiment.base_model_name + summary["model"]["problem_type"] = problem_type.value + summary["model"]["target"] = ml_experiment.y_train.name + summary["model"]["library"] = ml_experiment.base_model_library.value + summary["model"]["input"] = ml_experiment.get_model_input_cols() + summary["model"]["hyperparameters"] = ml_experiment.get_model_hyperparameters() + + # Clean hyperparameters for the sklearn pipeline or other non-serializable objects + summary["model"]["hyperparameters"] = _clean_json(summary["model"]["hyperparameters"]) + + # Sort keys in summary["model"] + if problem_type == ProblemType.CLASSIFICATION: + summary["model"]["num_classes"] = ml_experiment.num_classes + # Sort keys in summary["model"] to be: name, problem_type, num_classes, input, target, hyperparameters, library + summary["model"] = { + k: summary["model"][k] + for k in [ + "name", + "problem_type", + "num_classes", + "input", + "target", + "hyperparameters", + "library", + ] + } + else: + # Sort keys in summary["model"] to be: name, problem_type, input, target, hyperparameters, library + summary["model"] = { + k: summary["model"][k] + for k in [ + "name", + "problem_type", + "input", + "target", + "hyperparameters", + "library", + ] + } + + # Generate metrics + y_train_pred = pd.Series(ml_experiment.predict(ml_experiment.X_train)).reset_index( + drop=True + ) + y_test_pred = pd.Series(ml_experiment.predict(ml_experiment.X_test)).reset_index( + drop=True + ) + y_validation_pred = pd.Series( + ml_experiment.predict(ml_experiment.X_validation) + ).reset_index(drop=True) + + if problem_type == ProblemType.CLASSIFICATION: + if not custom_metrics: + summary["results"] = { + "train_score": generate_metrics_classification( + ml_experiment.y_train.reset_index(drop=True), y_train_pred + ), + "test_score": generate_metrics_classification( + ml_experiment.y_test.reset_index(drop=True), y_test_pred + ), + "validation_score": generate_metrics_classification( + ml_experiment.y_validation.reset_index(drop=True), y_validation_pred + ), + } + else: + summary["results"] = custom_metrics + elif problem_type == ProblemType.REGRESSION: + summary["results"] = { + "train_score": generate_metrics_regression( + ml_experiment.y_train.reset_index(drop=True), y_train_pred + ), + "test_score": 
generate_metrics_regression( + ml_experiment.y_test.reset_index(drop=True), y_test_pred + ), + "validation_score": generate_metrics_regression( + ml_experiment.y_validation.reset_index(drop=True), y_validation_pred + ), + } + + # Prepare environment to save files + folder_name_default = f"{datetime.now().strftime('%Y%m%d-%H%M%S')}_experiment_{ml_experiment.base_model_name}" + folder_name = base_folder_name or folder_name_default + folder_name = os.path.join( + base_path, f"{datetime.now().strftime('%Y%m%d-%H%M%S')}_{folder_name}" + ) + + # Compress model and save + if save_model: + os.makedirs(os.path.join(folder_name, "model")) + if "files" not in summary: + summary["files"] = {} + if "model" not in summary["files"]: + summary["files"]["model"] = {} + # Save hyperparameters + hyperparameters_path = os.path.join( + folder_name, "model", "hyperparameters.json" + ) + summary["files"]["model"]["hyperparameters.json"] = os.path.abspath( + hyperparameters_path + ) + with open(hyperparameters_path, "w") as f: + json.dump(summary["model"]["hyperparameters"], f, indent=4) + # Save the model + model_path = os.path.join(folder_name, "model", "model.pkl") + summary["files"]["model"]["model.pkl"] = os.path.abspath(model_path) + with open(model_path, "wb") as f: + pickle.dump(ml_experiment.model, f) + if zip_files: + zip_path = os.path.join(folder_name, "model.zip") + summary["files"]["model"]["zip"] = os.path.abspath(zip_path) + shutil.make_archive( + zip_path.rstrip(".zip"), "zip", os.path.join(folder_name, "model") + ) + shutil.rmtree(os.path.join(folder_name, "model")) + + if save_datasets: + os.makedirs(os.path.join(folder_name, "datasets")) + if "files" not in summary: + summary["files"] = {} + if "datasets" not in summary["files"]: + summary["files"]["datasets"] = {} + X_train_path = os.path.join(folder_name, "datasets", "X_train.csv") + summary["files"]["datasets"]["X_train"] = {} + summary["files"]["datasets"]["X_train"]["path"] = os.path.abspath(X_train_path) + summary["files"]["datasets"]["X_train"]["shape"] = ml_experiment.X_train.shape + ml_experiment.X_train.to_csv(X_train_path, index=False) + y_train_path = os.path.join(folder_name, "datasets", "y_train.csv") + summary["files"]["datasets"]["y_train"] = {} + summary["files"]["datasets"]["y_train"]["path"] = os.path.abspath(y_train_path) + summary["files"]["datasets"]["y_train"]["shape"] = ml_experiment.y_train.shape + ml_experiment.y_train.to_csv(y_train_path, index=False) + X_test_path = os.path.join(folder_name, "datasets", "X_test.csv") + summary["files"]["datasets"]["X_test"] = {} + summary["files"]["datasets"]["X_test"]["path"] = os.path.abspath(X_test_path) + summary["files"]["datasets"]["X_test"]["shape"] = ml_experiment.X_test.shape + ml_experiment.X_test.to_csv(X_test_path, index=False) + y_test_path = os.path.join(folder_name, "datasets", "y_test.csv") + summary["files"]["datasets"]["y_test"] = {} + summary["files"]["datasets"]["y_test"]["path"] = os.path.abspath(y_test_path) + summary["files"]["datasets"]["y_test"]["shape"] = ml_experiment.y_test.shape + ml_experiment.y_test.to_csv(y_test_path, index=False) + X_validation_path = os.path.join(folder_name, "datasets", "X_validation.csv") + summary["files"]["datasets"]["X_validation"] = {} + summary["files"]["datasets"]["X_validation"]["path"] = os.path.abspath( + X_validation_path + ) + summary["files"]["datasets"]["X_validation"][ + "shape" + ] = ml_experiment.X_validation.shape + ml_experiment.X_validation.to_csv(X_validation_path, index=False) + y_validation_path = 
os.path.join(folder_name, "datasets", "y_validation.csv") + summary["files"]["datasets"]["y_validation"] = {} + summary["files"]["datasets"]["y_validation"]["path"] = os.path.abspath( + y_validation_path + ) + summary["files"]["datasets"]["y_validation"][ + "shape" + ] = ml_experiment.y_validation.shape + ml_experiment.y_validation.to_csv(y_validation_path, index=False) + if zip_files: + # Compress data and save + zip_path = os.path.join(folder_name, "datasets.zip") + summary["files"]["datasets"]["zip"] = {} + summary["files"]["datasets"]["zip"]["path"] = os.path.abspath(zip_path) + shutil.make_archive( + zip_path.rstrip(".zip"), "zip", os.path.join(folder_name, "datasets") + ) + shutil.rmtree(os.path.join(folder_name, "datasets")) + + # Save json + summary = _clean_json(summary) + json_path = os.path.join(folder_name, "summary.json") + with open(json_path, "w", encoding="utf-8") as f: + json.dump(summary, f, indent=4, ensure_ascii=False) + + return folder_name diff --git a/mango/tests/models_module/test_experiment_tracking.py b/mango/tests/models_module/test_experiment_tracking.py index c0dbfd0a..1ea0822d 100644 --- a/mango/tests/models_module/test_experiment_tracking.py +++ b/mango/tests/models_module/test_experiment_tracking.py @@ -59,6 +59,8 @@ def setUpClass(cls): cls.y_train_clf = y_clf[: int(len(y_clf) * 0.8)].reset_index(drop=True) cls.X_test_clf = X_clf[int(len(X_clf) * 0.8) :].reset_index(drop=True) cls.y_test_clf = y_clf[int(len(y_clf) * 0.8) :].reset_index(drop=True) + cls.X_val_clf = cls.X_test_clf.copy() + cls.y_val_clf = cls.y_test_clf.copy() # Regression X_reg, y_reg = make_regression(n_samples=1000, n_features=10, random_state=42) @@ -74,6 +76,8 @@ def setUpClass(cls): cls.y_train_reg = y_reg[: int(len(y_reg) * 0.8)].reset_index(drop=True) cls.X_test_reg = X_reg[int(len(X_reg) * 0.8) :].reset_index(drop=True) cls.y_test_reg = y_reg[int(len(y_reg) * 0.8) :].reset_index(drop=True) + cls.X_val_reg = cls.X_test_reg.copy() + cls.y_val_reg = cls.y_test_reg.copy() # Binary Classification X_bin_clf, y_bin_clf = make_classification( @@ -99,6 +103,8 @@ def setUpClass(cls): cls.y_test_bin_clf = y_bin_clf[int(len(y_bin_clf) * 0.8) :].reset_index( drop=True ) + cls.X_val_bin_clf = cls.X_test_bin_clf.copy() + cls.y_val_bin_clf = cls.y_test_bin_clf.copy() # Expected values for roc curve cls.expected_tpr_logistic = [ @@ -1401,6 +1407,7 @@ def setUpClass(cls): } def setUp(self): + self.maxDiff = None os.makedirs(self.folder_name, exist_ok=True) def tearDown(self): @@ -1445,7 +1452,7 @@ def _check_model_with_zip(self, output_folder): self.assertFalse(os.path.exists(os.path.join(output_folder, "model"))) self.assertFalse(os.path.exists(os.path.join(output_folder, "datasets"))) - def _check_model_without_zip(self, model, output_folder, problem_type): + def _check_model_without_zip(self, ml_experiment, output_folder, problem_type): """ Helper function to check the model is saved correctly when zip_files is False. 
""" @@ -1510,203 +1517,15 @@ def _check_model_without_zip(self, model, output_folder, problem_type): raise ValueError("Problem type not supported") # Assert model is the same # Assert model is the same - with open(os.path.join(output_folder, "model", "model.pkl"), "rb") as f: - model_load = pickle.load(f) + loaded_ml_experiment = MLExperiment.from_registered_experiment(output_folder) # Generate predictions from both models - original_predictions = model.predict(self.X_test_reg) - loaded_predictions = model_load.predict(self.X_test_reg) + original_predictions = ml_experiment.predict(self.X_test_reg) + loaded_predictions = loaded_ml_experiment.predict(self.X_test_reg) # Check if the predictions are almost the same self.assertTrue(np.allclose(original_predictions, loaded_predictions)) - def test_serialize_sklearn(self): - """ - Test serialization of a sklearn model. - """ - model = LinearRegression() - model.fit(self.X_train_reg, self.y_train_reg) - output_folder = export_model( - model, - self.X_train_reg, - self.y_train_reg, - self.X_test_reg, - self.y_test_reg, - self.folder_name, - save_model=True, - save_datasets=True, - zip_files=False, - ) - self._check_model_without_zip( - output_folder=output_folder, - model=model, - problem_type=ProblemType.REGRESSION, - ) - # Assert works for classification with Zip - model = LogisticRegression() - model.fit(self.X_train_clf, self.y_train_clf) - output_folder = export_model( - model, - self.X_train_clf, - self.y_train_clf, - self.X_test_clf, - self.y_test_clf, - self.folder_name, - save_model=True, - save_datasets=True, - zip_files=True, - ) - self._check_model_with_zip(output_folder=output_folder) - - def test_serialize_catboost(self): - """ - Test serialization of a CatBoost model. - """ - model = CatBoostClassifier(allow_writing_files=False, verbose=5, iterations=10) - model.fit(self.X_train_clf, self.y_train_clf) - output_folder = export_model( - model, - self.X_train_clf, - self.y_train_clf, - self.X_test_clf, - self.y_test_clf, - self.folder_name, - save_model=True, - save_datasets=True, - zip_files=False, - ) - self._check_model_without_zip( - output_folder=output_folder, - model=model, - problem_type=ProblemType.CLASSIFICATION, - ) - - # Assert works for regression with Zip - model = CatBoostRegressor(allow_writing_files=False, verbose=5, iterations=10) - model.fit(self.X_train_reg, self.y_train_reg) - output_folder = export_model( - model, - self.X_train_reg, - self.y_train_reg, - self.X_test_reg, - self.y_test_reg, - self.folder_name, - save_model=True, - save_datasets=True, - zip_files=True, - ) - self._check_model_with_zip(output_folder=output_folder) - - def test_serialize_pipeline_with_catboost(self): - """ - Test serialization of a pipeline with CatBoost model. - """ - col_transformer = ColumnTransformer( - [ - ("num", StandardScaler(), self.X_train_clf.columns), - ] - ) - model = Pipeline( - [ - ("col_transformer", col_transformer), - ( - "model", - CatBoostClassifier( - allow_writing_files=False, verbose=5, iterations=10 - ), - ), - ] - ) - model.fit(self.X_train_clf, self.y_train_clf) - output_folder = export_model( - model, - self.X_train_clf, - self.y_train_clf, - self.X_test_clf, - self.y_test_clf, - self.folder_name, - save_model=True, - save_datasets=True, - zip_files=False, - ) - self._check_model_without_zip( - output_folder=output_folder, - model=model, - problem_type=ProblemType.CLASSIFICATION, - ) - - def test_serialize_lightgbm(self): - """ - Test serialization of a LightGBM model. 
- """ - model = LGBMClassifier() - model.fit(self.X_train_clf, self.y_train_clf) - output_folder = export_model( - model, - self.X_train_clf, - self.y_train_clf, - self.X_test_clf, - self.y_test_clf, - self.folder_name, - save_model=True, - save_datasets=True, - zip_files=False, - ) - self._check_model_without_zip( - output_folder=output_folder, - model=model, - problem_type=ProblemType.CLASSIFICATION, - ) - - # Assert works for regression with Zip - model = LGBMRegressor() - model.fit(self.X_train_reg, self.y_train_reg) - output_folder = export_model( - model, - self.X_train_reg, - self.y_train_reg, - self.X_test_reg, - self.y_test_reg, - self.folder_name, - save_model=True, - save_datasets=True, - zip_files=True, - ) - self._check_model_with_zip(output_folder=output_folder) - - def test_errors(self): - """ - Test errors raised by the function. - """ - # Not supported model - model = InvalidModel() - with self.assertRaises(ValueError): - export_model( - model, - self.X_train_reg, - self.y_train_reg, - self.X_test_reg, - self.y_test_reg, - self.folder_name, - save_model=True, - save_datasets=True, - zip_files=False, - ) - - # Invalid folder - with self.assertRaises(FileNotFoundError): - export_model( - model, - self.X_train_reg, - self.y_train_reg, - self.X_test_reg, - self.y_test_reg, - "invalid_folder", - save_model=True, - save_datasets=True, - zip_files=False, - ) - def assert_ml_experiment_init_correct( self, experiment, @@ -1715,6 +1534,8 @@ def assert_ml_experiment_init_correct( y_train, X_test, y_test, + X_validation, + y_validation, name, description, problem_type, @@ -1726,8 +1547,10 @@ def assert_ml_experiment_init_correct( self.assertEqual(experiment.model, full_model) assert_frame_equal(experiment.X_train, X_train) assert_frame_equal(experiment.X_test, X_test) + assert_frame_equal(experiment.X_validation, X_validation) assert_series_equal(experiment.y_train, y_train) assert_series_equal(experiment.y_test, y_test) + assert_series_equal(experiment.y_validation, y_validation) self.assertEqual(experiment.problem_type, problem_type) self.assertEqual(experiment.name, name) self.assertEqual(experiment.description, description) @@ -1747,6 +1570,8 @@ def test_ml_experiment_errors(self): y_train=self.y_train_clf, X_test=self.X_test_clf, y_test=self.y_test_clf, + X_validation=self.X_val_clf, + y_validation=self.y_val_clf, problem_type="classification", name="Test sklearn experiment", description="Test sklearn experiment", @@ -1759,6 +1584,8 @@ def test_ml_experiment_errors(self): y_train=self.y_train_clf, X_test=self.X_test_clf, y_test=self.y_test_clf, + X_validation=self.X_val_clf, + y_validation=self.y_val_clf, name="Test sklearn experiment", description="Test sklearn experiment", problem_type=ProblemType.CLASSIFICATION, @@ -1797,7 +1624,7 @@ def _check_threshold_calculation( def _check_feature_importance(self, experiment, expected_feature_importance): feature_importance = experiment.get_feature_importance() assert_series_equal( - feature_importance, expected_feature_importance, check_dtype=False + feature_importance, expected_feature_importance, check_dtype=False, atol=1e-4 ) def test_ml_experiment_sklearn(self): @@ -1811,6 +1638,8 @@ def test_ml_experiment_sklearn(self): y_train=self.y_train_bin_clf, X_test=self.X_test_bin_clf, y_test=self.y_test_bin_clf, + X_validation=self.X_val_bin_clf, + y_validation=self.y_val_bin_clf, problem_type="classification", name="Test sklearn experiment", description="Test sklearn experiment", @@ -1823,6 +1652,8 @@ def test_ml_experiment_sklearn(self): 
y_train=self.y_train_bin_clf, X_test=self.X_test_bin_clf, y_test=self.y_test_bin_clf, + X_validation=self.X_val_bin_clf, + y_validation=self.y_val_bin_clf, name="Test sklearn experiment", description="Test sklearn experiment", problem_type=ProblemType.CLASSIFICATION, @@ -1861,6 +1692,8 @@ def test_ml_experiment_catboost(self): y_train=self.y_train_bin_clf, X_test=self.X_test_bin_clf, y_test=self.y_test_bin_clf, + X_validation=self.X_val_bin_clf, + y_validation=self.y_val_bin_clf, problem_type="classification", name="Test catboost experiment", description="Test catboost experiment", @@ -1873,6 +1706,8 @@ def test_ml_experiment_catboost(self): y_train=self.y_train_bin_clf, X_test=self.X_test_bin_clf, y_test=self.y_test_bin_clf, + X_validation=self.X_val_bin_clf, + y_validation=self.y_val_bin_clf, name="Test catboost experiment", description="Test catboost experiment", problem_type=ProblemType.CLASSIFICATION, @@ -1911,6 +1746,8 @@ def test_ml_experiment_catboost(self): y_train=self.y_train_reg, X_test=self.X_test_reg, y_test=self.y_test_reg, + X_validation=self.X_val_reg, + y_validation=self.y_val_reg, problem_type="regression", name="Test catboost experiment", description="Test catboost experiment", @@ -1923,6 +1760,8 @@ def test_ml_experiment_catboost(self): y_train=self.y_train_reg, X_test=self.X_test_reg, y_test=self.y_test_reg, + X_validation=self.X_val_reg, + y_validation=self.y_val_reg, name="Test catboost experiment", description="Test catboost experiment", problem_type=ProblemType.REGRESSION, @@ -1938,7 +1777,7 @@ def test_ml_experiment_catboost(self): # Metrics self.assertDictEqual( - experiment.metrics, + experiment.metrics["test"], self.expected_metrics_catboost_regression, ) @@ -1968,6 +1807,8 @@ def test_ml_experiment_pipeline_with_catboost(self): y_train=self.y_train_bin_clf, X_test=self.X_test_bin_clf, y_test=self.y_test_bin_clf, + X_validation=self.X_val_bin_clf, + y_validation=self.y_val_bin_clf, problem_type="classification", name="Test catboost experiment", description="Test catboost experiment", @@ -1980,6 +1821,8 @@ def test_ml_experiment_pipeline_with_catboost(self): y_train=self.y_train_bin_clf, X_test=self.X_test_bin_clf, y_test=self.y_test_bin_clf, + X_validation=self.X_val_bin_clf, + y_validation=self.y_val_bin_clf, name="Test catboost experiment", description="Test catboost experiment", problem_type=ProblemType.CLASSIFICATION, @@ -2016,6 +1859,8 @@ def test_ml_experiment_lightgbm(self): y_train=self.y_train_bin_clf, X_test=self.X_test_bin_clf, y_test=self.y_test_bin_clf, + X_validation=self.X_val_bin_clf, + y_validation=self.y_val_bin_clf, problem_type="classification", name="Test lightgbm experiment", description="Test lightgbm experiment", @@ -2028,6 +1873,8 @@ def test_ml_experiment_lightgbm(self): y_train=self.y_train_bin_clf, X_test=self.X_test_bin_clf, y_test=self.y_test_bin_clf, + X_validation=self.X_val_bin_clf, + y_validation=self.y_val_bin_clf, name="Test lightgbm experiment", description="Test lightgbm experiment", problem_type=ProblemType.CLASSIFICATION, @@ -2068,6 +1915,8 @@ def test_ml_tracker_add_experiment(self): y_train=self.y_train_bin_clf, X_test=self.X_test_bin_clf, y_test=self.y_test_bin_clf, + X_validation=self.X_val_bin_clf, + y_validation=self.y_val_bin_clf, problem_type="classification", name="Test sklearn experiment", description="Test sklearn experiment", @@ -2087,3 +1936,220 @@ def test_ml_tracker_add_experiment(self): # Assert experiments are equal self.assertEqual(list(ml_tracker_new.experiments.values())[0], experiment) + + def 
test_serialize_sklearn(self): + """ + Test serialization of a sklearn model. + """ + model = LinearRegression() + model.fit(self.X_train_reg, self.y_train_reg) + + experiment = MLExperiment( + model=model, + X_train=self.X_train_reg, + y_train=self.y_train_reg, + X_test=self.X_test_reg, + y_test=self.y_test_reg, + X_validation=self.X_val_reg, + y_validation=self.y_val_reg, + name="Test Experiment regression", + description="Test Description", + problem_type=ProblemType.REGRESSION + ) + output_folder = experiment.register_experiment( + base_path=self.folder_name, + zip_files=False + ) + + self._check_model_without_zip( + output_folder=output_folder, + ml_experiment=experiment, + problem_type=ProblemType.REGRESSION, + ) + # Assert works for classification with Zip + model = LogisticRegression() + model.fit(self.X_train_clf, self.y_train_clf) + experiment = MLExperiment( + model=model, + X_train=self.X_train_clf, + y_train=self.y_train_clf, + X_test=self.X_test_clf, + y_test=self.y_test_clf, + X_validation=self.X_val_clf, + y_validation=self.y_val_clf, + name="Test Experiment classification", + description="Test Description", + problem_type=ProblemType.CLASSIFICATION + ) + output_folder = experiment.register_experiment( + base_path=self.folder_name, + zip_files=True + ) + self._check_model_with_zip(output_folder=output_folder) + + def test_serialize_catboost(self): + """ + Test serialization of a CatBoost model. + """ + model = CatBoostClassifier(allow_writing_files=False, verbose=5, iterations=10) + model.fit(self.X_train_clf, self.y_train_clf) + experiment = MLExperiment( + model=model, + X_train=self.X_train_clf, + y_train=self.y_train_clf, + X_test=self.X_test_clf, + y_test=self.y_test_clf, + X_validation=self.X_val_clf, + y_validation=self.y_val_clf, + name="Test Experiment", + description="Test Description", + problem_type=ProblemType.CLASSIFICATION + ) + output_folder = experiment.register_experiment( + base_path=self.folder_name, + zip_files=False + ) + + self._check_model_without_zip( + output_folder=output_folder, + ml_experiment=experiment, + problem_type=ProblemType.CLASSIFICATION, + ) + + # Assert works for regression with Zip + model = CatBoostRegressor(allow_writing_files=False, verbose=5, iterations=10) + model.fit(self.X_train_reg, self.y_train_reg) + experiment = MLExperiment( + model=model, + X_train=self.X_train_reg, + y_train=self.y_train_reg, + X_test=self.X_test_reg, + y_test=self.y_test_reg, + X_validation=self.X_val_reg, + y_validation=self.y_val_reg, + name="Test Experiment regression", + description="Test Description", + problem_type=ProblemType.REGRESSION + ) + output_folder = experiment.register_experiment( + base_path=output_folder, + zip_files=True + ) + self._check_model_with_zip(output_folder=output_folder) + + def test_serialize_pipeline_with_catboost(self): + """ + Test serialization of a pipeline with CatBoost model. 
+ """ + col_transformer = ColumnTransformer( + [ + ("num", StandardScaler(), self.X_train_clf.columns), + ] + ) + model = Pipeline( + [ + ("col_transformer", col_transformer), + ( + "model", + CatBoostClassifier( + allow_writing_files=False, verbose=5, iterations=10 + ), + ), + ] + ) + model.fit(self.X_train_clf, self.y_train_clf) + experiment = MLExperiment( + model=model, + X_train=self.X_train_clf, + y_train=self.y_train_clf, + X_test=self.X_test_clf, + y_test=self.y_test_clf, + X_validation=self.X_val_clf, + y_validation=self.y_val_clf, + name="Test Experiment classification", + description="Test Description", + problem_type=ProblemType.CLASSIFICATION + ) + output_folder = experiment.register_experiment( + base_path=self.folder_name, + zip_files=False + ) + self._check_model_without_zip( + output_folder=output_folder, + ml_experiment=experiment, + problem_type=ProblemType.CLASSIFICATION, + ) + + def test_serialize_lightgbm(self): + """ + Test serialization of a LightGBM model. + """ + model = LGBMClassifier() + model.fit(self.X_train_clf, self.y_train_clf) + experiment = MLExperiment( + model=model, + X_train=self.X_train_clf, + y_train=self.y_train_clf, + X_test=self.X_test_clf, + y_test=self.y_test_clf, + X_validation=self.X_val_clf, + y_validation=self.y_val_clf, + name="Test Experiment classification", + description="Test Description", + problem_type=ProblemType.CLASSIFICATION + ) + output_folder = experiment.register_experiment( + base_path=self.folder_name, + zip_files=False + ) + self._check_model_without_zip( + output_folder=output_folder, + ml_experiment=experiment, + problem_type=ProblemType.CLASSIFICATION, + ) + + # Assert works for regression with Zip + model = LGBMRegressor() + model.fit(self.X_train_reg, self.y_train_reg) + experiment = MLExperiment( + model=model, + X_train=self.X_train_reg, + y_train=self.y_train_reg, + X_test=self.X_test_reg, + y_test=self.y_test_reg, + X_validation=self.X_val_reg, + y_validation=self.y_val_reg, + name="Test Experiment regression", + description="Test Description", + problem_type=ProblemType.REGRESSION + ) + output_folder = experiment.register_experiment( + base_path=self.folder_name, + zip_files=True + ) + self._check_model_with_zip(output_folder=output_folder) + + + def test_errors_serialize(self): + """ + Test errors raised by the function. 
+        """
+        # Valid experiment; the error below must come from register_experiment
+        experiment = MLExperiment(
+            model=LinearRegression().fit(self.X_train_reg, self.y_train_reg),
+            X_train=self.X_train_reg,
+            y_train=self.y_train_reg,
+            X_test=self.X_test_reg,
+            y_test=self.y_test_reg,
+            X_validation=self.X_val_reg,
+            y_validation=self.y_val_reg,
+            name="Test Experiment",
+            description="Test Description",
+            problem_type=ProblemType.REGRESSION
+        )
+        # Invalid folder
+        with self.assertRaises(FileNotFoundError):
+            experiment.register_experiment(
+                base_path="invalid_folder",
+                zip_files=False
+            )
\ No newline at end of file
diff --git a/requirements-dev.txt b/requirements-dev.txt
index aaeb0734..b4e2701d 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -3,6 +3,7 @@ coverage
 scikit-learn>=1.3.2,<2.0.0
 lightgbm>=4.1.0,<5.0.0
 xgboost>=2.0.2,<3.0.0
+catboost>=1.2.5,<2.0.0
 sphinx
 shibuya
 sphinxcontrib-bibtex
\ No newline at end of file

From 045af73c878821aa42308097f279e0a0a318c662 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?AntonioGonz=C3=A1lez?=
Date: Wed, 24 Jul 2024 12:24:56 +0200
Subject: [PATCH 14/14] Update docs

---
 docs/source/experiment_tracking.rst | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/docs/source/experiment_tracking.rst b/docs/source/experiment_tracking.rst
index 09781dca..2a3ac07b 100644
--- a/docs/source/experiment_tracking.rst
+++ b/docs/source/experiment_tracking.rst
@@ -9,7 +9,8 @@ We will use the california housing dataset from sklearn as an example.
 
     from sklearn.datasets import fetch_california_housing
 
     X, y = fetch_california_housing(return_X_y=True, as_frame=True)
-    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)
+    X_validation, X_test, y_validation, y_test = train_test_split(X_test, y_test, random_state=0, test_size=0.5)
 
 Now we will create a simple pipeline to train a linear regression model and wrap it in an instance of :class:`MLExperiment`
 
@@ -33,14 +34,16 @@ Now we will create a simple pipeline to train a linear regression model and wrap
         X_train=X_train,
         X_test=X_test,
         y_train=y_train,
-        y_test=y_test
+        y_test=y_test,
+        X_validation=X_validation,
+        y_validation=y_validation
     )
 
 Once the model is wrapped several metrics are pre-computed and stored in the experiment object.
 
 .. code-block:: python
 
-    print(experiment.metrics)
+    print(experiment.metrics["test"])
 
     {
       "train_score":{