From b018bf3c68e99ddae75574fea86b5b6e786bb440 Mon Sep 17 00:00:00 2001 From: Yiheng Wang <68361391+yiheng-wang-nv@users.noreply.github.com> Date: Wed, 6 Nov 2024 22:03:46 +0800 Subject: [PATCH] Add mlflow support for Vista3d (#708) Fixes # . ### Description Adds optional MLflow experiment tracking to the VISTA3D bundle. `configs/train.json` gains a `use_mlflow` switch (default `false`) and an `mlflow_dir` setting (default `<bundle_root>/mlruns`), plus two `MLFlowHandler` entries that stay disabled unless `use_mlflow` is set. `configs/metadata.json` is bumped to version 0.5.6 and lists `mlflow` 2.17.2, and `docs/README.md` documents how to enable tracking and open the MLflow UI. ### Status **Ready/Work in progress/Hold** ### Please ensure all the checkboxes: - [x] Codeformat tests passed locally by running `./runtests.sh --codeformat`. - [ ] In-line docstrings updated. - [ ] Update `version` and `changelog` in `metadata.json` if changing an existing bundle. - [ ] Please ensure the naming rules in config files meet our requirements (please refer to: `CONTRIBUTING.md`). - [ ] Ensure versions of packages such as `monai`, `pytorch` and `numpy` are correct in `metadata.json`. - [ ] Descriptions should be consistent with the content, such as `eval_metrics` of the provided weights and TorchScript modules. - [ ] Files larger than 25MB are excluded and replaced by providing download links in `large_file.yml`. - [ ] Avoid using paths that contain personal information within config files (such as use `/home/your_name/` for `"bundle_root"`). 
Signed-off-by: Yiheng Wang --- models/vista3d/configs/metadata.json | 6 ++++-- models/vista3d/configs/train.json | 14 ++++++++++++++ models/vista3d/docs/README.md | 18 ++++++++++++++++++ 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/models/vista3d/configs/metadata.json b/models/vista3d/configs/metadata.json index 80e974b9..48d812e8 100644 --- a/models/vista3d/configs/metadata.json +++ b/models/vista3d/configs/metadata.json @@ -1,7 +1,8 @@ { "schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_20240725.json", - "version": "0.5.5", + "version": "0.5.6", "changelog": { + "0.5.6": "add mlflow support", "0.5.5": "add arg for trt compiler base path", "0.5.4": "add undefined label prompt check", "0.5.3": "update readme", @@ -27,7 +28,8 @@ "scikit-image": "0.23.2", "nibabel": "5.2.1", "pytorch-ignite": "0.4.11", - "cucim-cu12": "24.6.0" + "cucim-cu12": "24.6.0", + "mlflow": "2.17.2" }, "supported_apps": { "vista3d-nim": "" diff --git a/models/vista3d/configs/train.json b/models/vista3d/configs/train.json index 9af1647c..319604ca 100644 --- a/models/vista3d/configs/train.json +++ b/models/vista3d/configs/train.json @@ -15,6 +15,8 @@ "finetune": false, "finetune_model_path": "$@bundle_root + '/models/model.pt'", "early_stop": false, + "use_mlflow": false, + "mlflow_dir": "$@bundle_root + '/mlruns'", "fold": 0, "device": "$torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')", "epochs": 5, @@ -248,6 +250,12 @@ "tag_name": "train_loss", "name": "StatsHandler", "output_transform": "$monai.handlers.from_engine(['loss'], first=True)" + }, + { + "_target_": "MLFlowHandler", + "_disabled_": "$not @use_mlflow", + "tracking_uri": "$os.path.abspath(@mlflow_dir)", + "output_transform": "$monai.handlers.from_engine(['loss'], first=True)" } ], "key_metric": { @@ -343,6 +351,12 @@ }, "save_key_metric": true, "key_metric_filename": "model.pt" + }, + { + "_target_": "MLFlowHandler", + "_disabled_": "$not @use_mlflow", + "iteration_log": false, + "tracking_uri": "$os.path.abspath(@mlflow_dir)" } ], "key_metric": { diff --git a/models/vista3d/docs/README.md b/models/vista3d/docs/README.md index 18a5971c..049c13b2 100644 --- a/models/vista3d/docs/README.md +++ b/models/vista3d/docs/README.md @@ -133,6 +133,24 @@ torchrun --nnodes=1 --nproc_per_node=8 -m monai.bundle run \ --config_file="['configs/train.json','configs/train_continual.json','configs/multi_gpu_train.json']" --epochs=320 --learning_rate=0.00005 ``` +### MLflow support + +MLflow can be enabled to track and manage your machine learning experiments. To enable MLflow, set the `use_mlflow` parameter to `True`. Below is an example of how to run a single-GPU training command with MLflow enabled: + +```bash +python -m monai.bundle run \ + --config_file="['configs/train.json','configs/train_continual.json']" --epochs=320 --learning_rate=0.00005 --use_mlflow True +``` + +By default, MLflow data is stored in the `mlruns/` folder under the bundle's root directory. To launch the MLflow UI and track your experiment data, follow these steps: + +1. Open a terminal and navigate to the root directory of your bundle where the `mlruns/` folder is located. + +2. Execute the following command to start the MLflow server. This will make the MLflow UI accessible. + +```bash +mlflow ui +``` ## Evaluation Evaluation can be used to calculate dice scores for the model or a finetuned model. Change the `ckpt_path` to the checkpoint you wish to evaluate. The dice score is calculated on the original image spacing using `invertd`, while the dice score during finetuning is calculated on resampled space.