diff --git a/clinicadl/mlflow_test.py b/clinicadl/mlflow_test.py deleted file mode 100644 index 35a2c0995..000000000 --- a/clinicadl/mlflow_test.py +++ /dev/null @@ -1,83 +0,0 @@ -import logging -import os -import sys -import warnings -from urllib.parse import urlparse - -import mlflow -import mlflow.sklearn -import numpy as np -import pandas as pd -from sklearn.linear_model import ElasticNet -from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score -from sklearn.model_selection import train_test_split - -logging.basicConfig(level=logging.WARN) -logger = logging.getLogger(__name__) - - -def eval_metrics(actual, pred): - rmse = np.sqrt(mean_squared_error(actual, pred)) - mae = mean_absolute_error(actual, pred) - r2 = r2_score(actual, pred) - return rmse, mae, r2 - - -if __name__ == "__main__": - warnings.filterwarnings("ignore") - np.random.seed(40) - - # Read the wine-quality csv file from the URL - csv_url = "https://raw.githubusercontent.com/mlflow/mlflow/master/tests/data/winequality-red.csv" - try: - data = pd.read_csv(csv_url, sep=";") - except Exception as e: - logger.exception( - "Unable to download training & test CSV, check your internet connection. Error: %s", - e, - ) - - # Split the data into training and test sets. (0.75, 0.25) split. 
- train, test = train_test_split(data) - - # The predicted column is "quality" which is a scalar from [3, 9] - train_x = train.drop(["quality"], axis=1) - test_x = test.drop(["quality"], axis=1) - train_y = train[["quality"]] - test_y = test[["quality"]] - - alpha = float(sys.argv[1]) if len(sys.argv) > 1 else 0.5 - l1_ratio = float(sys.argv[2]) if len(sys.argv) > 2 else 0.5 - - with mlflow.start_run(): - lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42) - lr.fit(train_x, train_y) - - predicted_qualities = lr.predict(test_x) - - (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities) - - print("Elasticnet model (alpha={:f}, l1_ratio={:f}):".format(alpha, l1_ratio)) - print(" RMSE: %s" % rmse) - print(" MAE: %s" % mae) - print(" R2: %s" % r2) - - mlflow.log_param("alpha", alpha) - mlflow.log_param("l1_ratio", l1_ratio) - mlflow.log_metric("rmse", rmse) - mlflow.log_metric("r2", r2) - mlflow.log_metric("mae", mae) - - tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme - - # Model registry does not work with file store - if tracking_url_type_store != "file": - # Register the model - # There are other ways to use the Model Registry, which depends on the use case, - # please refer to the doc for more information: - # https://mlflow.org/docs/latest/model-registry.html#api-workflow - mlflow.sklearn.log_model( - lr, "model", registered_model_name="ElasticnetWineModel" - ) - else: - mlflow.sklearn.log_model(lr, "model") diff --git a/clinicadl/resources/config/train_config.toml b/clinicadl/resources/config/train_config.toml index 10035887c..6944325a0 100644 --- a/clinicadl/resources/config/train_config.toml +++ b/clinicadl/resources/config/train_config.toml @@ -45,6 +45,7 @@ amp = false seed = 0 deterministic = false compensation = "memory" # Only used if deterministic = true +track_exp = "" [Transfer_learning] transfer_path = "" diff --git a/clinicadl/train/tasks/classification_cli.py b/clinicadl/train/tasks/classification_cli.py index 
5aab10a65..f94257196 100644 --- a/clinicadl/train/tasks/classification_cli.py +++ b/clinicadl/train/tasks/classification_cli.py @@ -46,6 +46,7 @@ @train_option.tolerance @train_option.accumulation_steps @train_option.profiler +@train_option.track_exp # transfer learning @train_option.transfer_path @train_option.transfer_selection_metric diff --git a/clinicadl/train/tasks/reconstruction_cli.py b/clinicadl/train/tasks/reconstruction_cli.py index f3b9db658..566803b35 100644 --- a/clinicadl/train/tasks/reconstruction_cli.py +++ b/clinicadl/train/tasks/reconstruction_cli.py @@ -46,6 +46,7 @@ @train_option.tolerance @train_option.accumulation_steps @train_option.profiler +@train_option.track_exp # transfer learning @train_option.transfer_path @train_option.transfer_selection_metric diff --git a/clinicadl/train/tasks/regression_cli.py b/clinicadl/train/tasks/regression_cli.py index 5cf595895..ec3de92d6 100644 --- a/clinicadl/train/tasks/regression_cli.py +++ b/clinicadl/train/tasks/regression_cli.py @@ -46,6 +46,7 @@ @train_option.tolerance @train_option.accumulation_steps @train_option.profiler +@train_option.track_exp # transfer learning @train_option.transfer_path @train_option.transfer_selection_metric diff --git a/clinicadl/train/tasks/task_utils.py b/clinicadl/train/tasks/task_utils.py index 7f05cb1d1..0893d085d 100644 --- a/clinicadl/train/tasks/task_utils.py +++ b/clinicadl/train/tasks/task_utils.py @@ -57,6 +57,7 @@ def task_launcher(network_task: str, task_options_list: List[str], **kwargs): "patience", "profiler", "tolerance", + "track_exp", "transfer_path", "transfer_selection_metric", "weight_decay", diff --git a/clinicadl/utils/cli_param/train_option.py b/clinicadl/utils/cli_param/train_option.py index f8fbce857..f7e1407d0 100644 --- a/clinicadl/utils/cli_param/train_option.py +++ b/clinicadl/utils/cli_param/train_option.py @@ -282,6 +282,18 @@ help="Use `--profiler` to enable Pytorch profiler for the first 30 steps after a short warmup. 
" "It will make an execution trace and some statistics about the CPU and GPU usage.", ) +track_exp = cli_param.option_group.optimization_group.option( + "--track_exp", + "-te", + type=click.Choice( + [ + "wandb", + "mlflow", + "", + ] + ), + help="Use `--track_exp` to enable wandb/mlflow to track the metric (loss, accuracy, etc...) during the training.", +) # transfer learning transfer_path = cli_param.option_group.transfer_learning_group.option( "-tp", diff --git a/clinicadl/utils/maps_manager/maps_manager.py b/clinicadl/utils/maps_manager/maps_manager.py index 72c5a234e..d33506bd4 100644 --- a/clinicadl/utils/maps_manager/maps_manager.py +++ b/clinicadl/utils/maps_manager/maps_manager.py @@ -831,7 +831,7 @@ def _train( logger.info(f"Criterion for {self.network_task} is {criterion}") optimizer = self._init_optimizer(model, split=split, resume=resume) - logger.debug(f"Optimizer used for training is optimizer") + logger.debug(f"Optimizer used for training is {optimizer}") model.train() train_loader.dataset.train() @@ -856,6 +856,16 @@ def _train( scaler = GradScaler(enabled=self.amp) profiler = self._init_profiler() + if self.parameters["track_exp"] == "wandb": + from clinicadl.utils.tracking_exp import WandB_handler + + run = WandB_handler(split, self.parameters, self.maps_path.name) + + if self.parameters["track_exp"] == "mlflow": + from clinicadl.utils.tracking_exp import Mlflow_handler + + run = Mlflow_handler(split, self.parameters, self.maps_path.name) + while epoch < self.epochs and not early_stopping.step(metrics_valid["loss"]): logger.info(f"Beginning epoch {epoch}.") @@ -956,6 +966,23 @@ def _train( f"{self.mode} level validation loss is {metrics_valid['loss']} " f"at the end of iteration {i}" ) + if self.track_exp == "wandb": + run.log_metrics( + run._wandb, + self.track_exp, + self.network_task, + metrics_train, + metrics_valid, + ) + + if self.track_exp == "mlflow": + run.log_metrics( + run._mlflow, + self.track_exp, + self.network_task, + 
metrics_train, + metrics_valid, + ) # Save checkpoints and best models best_dict = retain_best.step(metrics_valid) @@ -981,6 +1008,11 @@ def _train( ) epoch += 1 + if self.parameters["track_exp"] == "mlflow": + run._mlflow.end_run() + + if self.parameters["track_exp"] == "wandb": + run._wandb.finish() self._test_loader( train_loader, diff --git a/clinicadl/utils/tracking_exp.py b/clinicadl/utils/tracking_exp.py new file mode 100644 index 000000000..2a18d0b15 --- /dev/null +++ b/clinicadl/utils/tracking_exp.py @@ -0,0 +1,137 @@ +"""Training Callbacks for training monitoring integrated in `pythae` (inspired from +https://github.com/huggingface/transformers/blob/master/src/transformers/trainer_callback.py)""" + +import importlib +import logging +from copy import copy +from pathlib import Path + +logger = logging.getLogger(__name__) + + +def wandb_is_available(): + return importlib.util.find_spec("wandb") + + +def mlflow_is_available(): + return importlib.util.find_spec("mlflow") is not None + + +class Tracker: + """ + Base class to track the metrics during training depending on the network task. 
+ """ + + def __init__(self): + pass + + def log_metrics( + self, + tracker, + track_exp: bool = False, + network_task: str = "classification", + metrics_train: list = [], + metrics_valid: list = [], + ): + metrics_dict = {} + if network_task == "classification": + metrics_dict = { + "loss_train": metrics_train["loss"], + "accuracy_train": metrics_train["accuracy"], + "sensitivity_train": metrics_train["sensitivity"], + "accuracy_train": metrics_train["accuracy"], + "specificity_train": metrics_train["specificity"], + "PPV_train": metrics_train["PPV"], + "NPV_train": metrics_train["NPV"], + "BA_train": metrics_train["BA"], + "loss_valid": metrics_valid["loss"], + "accuracy_valid": metrics_valid["accuracy"], + "sensitivity_valid": metrics_valid["sensitivity"], + "accuracy_valid": metrics_valid["accuracy"], + "specificity_valid": metrics_valid["specificity"], + "PPV_valid": metrics_valid["PPV"], + "NPV_valid": metrics_valid["NPV"], + "BA_valid": metrics_valid["BA"], + } + elif network_task == "reconstruction": + metrics_dict = { + "loss_train": metrics_train["loss"], + "MSE_train": metrics_train["MSE"], + "MAE_train": metrics_train["MAE"], + "PSNR_train": metrics_train["PSNR"], + "SSIM_train": metrics_train["SSIM"], + "loss_valid": metrics_valid["loss"], + "MSE_valid": metrics_valid["MSE"], + "MAE_valid": metrics_valid["MAE"], + "PSNR_valid": metrics_valid["PSNR"], + "SSIM_valid": metrics_valid["SSIM"], + } + elif network_task == "regression": + metrics_dict = { + "loss_train": metrics_train["loss"], + "MSE_train": metrics_train["MSE"], + "MAE_train": metrics_train["MAE"], + "loss_valid": metrics_valid["loss"], + "MSE_valid": metrics_valid["MSE"], + "MAE_valid": metrics_valid["MAE"], + } + + if track_exp == "wandb": + tracker.log(metrics_dict) + return metrics_dict + elif track_exp == "mlflow": + tracker.log_metrics(metrics_dict) + return metrics_dict + + +class WandB_handler(Tracker): + def __init__(self, split: str, config: dict, maps_name: str): + if not 
wandb_is_available(): + raise ModuleNotFoundError( + "`wandb` package must be installed. Run `pip install wandb`" + ) + else: + import wandb + + self._wandb = wandb + + self._wandb.init( + project="ClinicaDL", + entity="clinicadl", + config=config, + save_code=True, + group=maps_name, + mode="online", + name=f"split-{split}", + reinit=True, + ) + + +class Mlflow_handler(Tracker): + def __init__(self, split: str, config: dict, maps_name: str): + if not mlflow_is_available(): + raise ModuleNotFoundError( + "`mlflow` package must be installed. Run `pip install mlflow`" + ) + else: + import mlflow + + self._mlflow = mlflow + + try: + experiment_id = self._mlflow.create_experiment( + f"clinicadl-{maps_name}", + artifact_location=Path.cwd().joinpath("mlruns").as_uri(), + ) + + except mlflow.exceptions.MlflowException: + self._mlflow.set_experiment(maps_name) + + self._mlflow.start_run(experiment_id=experiment_id, run_name=f"split-{split}") + self._mlflow.autolog() + config_bis = copy(config) + for cle, valeur in config.items(): + if cle == "preprocessing_dict": + del config_bis[cle] + config = config_bis + self._mlflow.log_params(config) diff --git a/docs/Train/Introduction.md b/docs/Train/Introduction.md index d77339157..6808e627e 100644 --- a/docs/Train/Introduction.md +++ b/docs/Train/Introduction.md @@ -106,6 +106,9 @@ Options shared for all values of `NETWORK_TASK` are organized in groups: - `--transfer_selection_metric` (str) is the transfer learning selection metric. - `--nb_unfrozen_layer` (int) is the number of layer that will be retrain during training. For example, if it is 2, the last two layers of the model will not be freezed. See [Implementation details](Details.md/#transfer-learning) for more information about transfer learning. +- **Track an experiment** + - `--track_exp` (str) is the name of the experiment tracker you want to use. Must be chosen between `wandb` (Weight & Biases) and `mlflow`. 
As MLflow and W&B are not ClinicaDL dependencies, you must install the chosen tracker yourself (by running `pip install wandb` or `pip install mlflow`). + For more information, check out the documentation of [W&B](https://docs.wandb.ai) or [MLflow](https://mlflow.org/docs/latest/index.html).