From b018bf3c68e99ddae75574fea86b5b6e786bb440 Mon Sep 17 00:00:00 2001
From: Yiheng Wang <68361391+yiheng-wang-nv@users.noreply.github.com>
Date: Wed, 6 Nov 2024 22:03:46 +0800
Subject: [PATCH] Add mlflow support for Vista3d (#708)

Fixes # .

### Description
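Add optional MLflow experiment tracking to the Vista3d bundle:
`train.json` gains the `use_mlflow` (default `false`) and `mlflow_dir`
options plus two `MLFlowHandler` entries that stay disabled unless
`use_mlflow` is set, `metadata.json` is bumped to 0.5.6 and lists
`mlflow` 2.17.2, and the README documents how to enable tracking and
open the MLflow UI.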

### Status
**Ready/Work in progress/Hold**

### Please ensure all the checkboxes:
<!--- Put an `x` in all the boxes that apply, and remove the not
applicable items -->
- [x] Codeformat tests passed locally by running `./runtests.sh
--codeformat`.
- [ ] In-line docstrings updated.
- [ ] Update `version` and `changelog` in `metadata.json` if changing an
existing bundle.
- [ ] Please ensure the naming rules in config files meet our
requirements (please refer to: `CONTRIBUTING.md`).
- [ ] Ensure versions of packages such as `monai`, `pytorch` and `numpy`
are correct in `metadata.json`.
- [ ] Descriptions should be consistent with the content, such as
`eval_metrics` of the provided weights and TorchScript modules.
- [ ] Files larger than 25MB are excluded and replaced by providing
download links in `large_file.yml`.
- [ ] Avoid using path that contains personal information within config
files (such as use `/home/your_name/` for `"bundle_root"`).

Signed-off-by: Yiheng Wang <vennw@nvidia.com>
---
 models/vista3d/configs/metadata.json |  6 ++++--
 models/vista3d/configs/train.json    | 14 ++++++++++++++
 models/vista3d/docs/README.md        | 18 ++++++++++++++++++
 3 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/models/vista3d/configs/metadata.json b/models/vista3d/configs/metadata.json
index 80e974b9..48d812e8 100644
--- a/models/vista3d/configs/metadata.json
+++ b/models/vista3d/configs/metadata.json
@@ -1,7 +1,8 @@
 {
     "schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_20240725.json",
-    "version": "0.5.5",
+    "version": "0.5.6",
     "changelog": {
+        "0.5.6": "add mlflow support",
         "0.5.5": "add arg for trt compiler base path",
         "0.5.4": "add undefined label prompt check",
         "0.5.3": "update readme",
@@ -27,7 +28,8 @@
         "scikit-image": "0.23.2",
         "nibabel": "5.2.1",
         "pytorch-ignite": "0.4.11",
-        "cucim-cu12": "24.6.0"
+        "cucim-cu12": "24.6.0",
+        "mlflow": "2.17.2"
     },
     "supported_apps": {
         "vista3d-nim": ""
diff --git a/models/vista3d/configs/train.json b/models/vista3d/configs/train.json
index 9af1647c..319604ca 100644
--- a/models/vista3d/configs/train.json
+++ b/models/vista3d/configs/train.json
@@ -15,6 +15,8 @@
     "finetune": false,
     "finetune_model_path": "$@bundle_root + '/models/model.pt'",
     "early_stop": false,
+    "use_mlflow": false,
+    "mlflow_dir": "$@bundle_root + '/mlruns'",
     "fold": 0,
     "device": "$torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')",
     "epochs": 5,
@@ -248,6 +250,12 @@
                 "tag_name": "train_loss",
                 "name": "StatsHandler",
                 "output_transform": "$monai.handlers.from_engine(['loss'], first=True)"
+            },
+            {
+                "_target_": "MLFlowHandler",
+                "_disabled_": "$not @use_mlflow",
+                "tracking_uri": "$os.path.abspath(@mlflow_dir)",
+                "output_transform": "$monai.handlers.from_engine(['loss'], first=True)"
             }
         ],
         "key_metric": {
@@ -343,6 +351,12 @@
                 },
                 "save_key_metric": true,
                 "key_metric_filename": "model.pt"
+            },
+            {
+                "_target_": "MLFlowHandler",
+                "_disabled_": "$not @use_mlflow",
+                "iteration_log": false,
+                "tracking_uri": "$os.path.abspath(@mlflow_dir)"
             }
         ],
         "key_metric": {
diff --git a/models/vista3d/docs/README.md b/models/vista3d/docs/README.md
index 18a5971c..049c13b2 100644
--- a/models/vista3d/docs/README.md
+++ b/models/vista3d/docs/README.md
@@ -133,6 +133,24 @@ torchrun --nnodes=1 --nproc_per_node=8 -m monai.bundle run \
 	--config_file="['configs/train.json','configs/train_continual.json','configs/multi_gpu_train.json']" --epochs=320 --learning_rate=0.00005
 ```
 
+### MLflow support
+
+MLflow can be enabled to track and manage your machine learning experiments. To enable MLflow, set the `use_mlflow` parameter to `True`. Below is an example of how to run a single-GPU training command with MLflow enabled:
+
+```bash
+python -m monai.bundle run \
+	--config_file="['configs/train.json','configs/train_continual.json']" --epochs=320 --learning_rate=0.00005 --use_mlflow True
+```
+
+By default, MLflow data is stored in the `mlruns/` folder under the bundle's root directory. To launch the MLflow UI and browse your experiment data, follow these steps:
+
+1. Open a terminal and navigate to the root directory of your bundle where the `mlruns/` folder is located.
+
+2. Execute the following command to start the MLflow server. This will make the MLflow UI accessible in your browser (by default at `http://127.0.0.1:5000`).
+
+```bash
+mlflow ui
+```
 
 ## Evaluation
 Evaluation can be used to calculate dice scores for the model or a finetuned model. Change the `ckpt_path` to the checkpoint you wish to evaluate. The dice score is calculated on the original image spacing using `invertd`, while the dice score during finetuning is calculated on resampled space.