Upgrade base torchmetrics version #742

Merged: 29 commits, Nov 22, 2023

Changes from 20 commits

Commits (29)
faa2a06
Modified classification metrics
szmazurek Nov 19, 2023
97c395c
Upgrade to torchmetrics 1.1.2
szmazurek Nov 19, 2023
baf5153
Hotfix for test_full.py script
szmazurek Nov 19, 2023
1c962d7
Merge pull request #10 from szmazurek/feature/torchmetrics_upgrade
szmazurek Nov 19, 2023
429b59e
Update GANDLF/metrics/generic.py
szmazurek Nov 20, 2023
705e014
Update GANDLF/metrics/generic.py
szmazurek Nov 20, 2023
b4b993f
Update GANDLF/metrics/generic.py
szmazurek Nov 20, 2023
d005330
Update classification.py
szmazurek Nov 20, 2023
4686f35
Update generic.py
szmazurek Nov 20, 2023
9ca3cf9
Manually trigger this action
szmazurek Nov 21, 2023
8ad6eb5
Merge pull request #11 from szmazurek/feature/torchmetrics_upgrade
szmazurek Nov 21, 2023
0f62f67
Bugfix
szmazurek Nov 21, 2023
b58928c
Bugfixes
szmazurek Nov 21, 2023
04ef297
added `WARNING` string
sarthakpati Nov 21, 2023
ebcb73e
Modifications and hotfixes:
szmazurek Nov 21, 2023
fbb5fd4
Merge branch 'master' into feature/torchmetrics_upgrade
szmazurek Nov 21, 2023
303ce9e
Refactoring and simplification:
szmazurek Nov 22, 2023
1b2ee5a
Merge pull request #12 from szmazurek/feature/torchmetrics_upgrade
szmazurek Nov 22, 2023
2ef57f6
Merge branch 'master' into master
szmazurek Nov 22, 2023
111a03a
Update generic.py
szmazurek Nov 22, 2023
711099d
Update GANDLF/utils/generic.py
szmazurek Nov 22, 2023
8259fc9
Update GANDLF/metrics/generic.py
szmazurek Nov 22, 2023
a34e401
Update GANDLF/metrics/generic.py
szmazurek Nov 22, 2023
68a9926
Update GANDLF/metrics/generic.py
szmazurek Nov 22, 2023
2053681
Update GANDLF/utils/__init__.py
szmazurek Nov 22, 2023
c6da07a
Update GANDLF/metrics/classification.py
szmazurek Nov 22, 2023
18988ad
Update GANDLF/metrics/classification.py
szmazurek Nov 22, 2023
b6af6db
Update GANDLF/utils/generic.py
szmazurek Nov 22, 2023
c0d251f
Update GANDLF/metrics/generic.py
szmazurek Nov 22, 2023
86 changes: 64 additions & 22 deletions GANDLF/metrics/classification.py
@@ -1,5 +1,7 @@
import torchmetrics as tm
from torch.nn.functional import one_hot
from ..utils import get_output_from_calculator
from GANDLF.utils.generic import determine_task


def overall_stats(predictions, ground_truth, params):
@@ -26,42 +28,82 @@ def overall_stats(predictions, ground_truth, params):
"per_class_average": "macro",
"per_class_weighted": "weighted",
}
task = determine_task(params)
# consider adding a "multilabel" field in the future
# metrics that need the "average" parameter
for average_type, average_type_key in average_types_keys.items():

for average_type_key in average_types_keys.values():
# multidim_average is not used when constructing these metrics;
# consider adding it in the future
calculators = {
"accuracy": tm.Accuracy(
num_classes=params["model"]["num_classes"], average=average_type_key
task=task,
num_classes=params["model"]["num_classes"],
average=average_type_key,
),
"precision": tm.Precision(
num_classes=params["model"]["num_classes"], average=average_type_key
task=task,
num_classes=params["model"]["num_classes"],
average=average_type_key,
),
"recall": tm.Recall(
num_classes=params["model"]["num_classes"], average=average_type_key
task=task,
num_classes=params["model"]["num_classes"],
average=average_type_key,
),
"f1": tm.F1Score(
num_classes=params["model"]["num_classes"], average=average_type_key
task=task,
num_classes=params["model"]["num_classes"],
average=average_type_key,
),
"specificity": tm.Specificity(
num_classes=params["model"]["num_classes"], average=average_type_key
task=task,
num_classes=params["model"]["num_classes"],
average=average_type_key,
),
"aucroc": tm.AUROC(
task=task,
num_classes=params["model"]["num_classes"],
average=average_type_key
if average_type_key != "micro"
else "macro",
),
## weird error for multi-class problem, where pos_label is not getting set
# "aucroc": tm.AUROC(
# num_classes=params["model"]["num_classes"], average=average_type_key
# ),
}
for metric_name, calculator in calculators.items():
output_metrics[
f"{metric_name}_{average_type}"
] = get_output_from_calculator(predictions, ground_truth, calculator)
if metric_name == "aucroc":
one_hot_preds = one_hot(
predictions.long(),
num_classes=params["model"]["num_classes"],
)
output_metrics[metric_name] = get_output_from_calculator(
one_hot_preds.float(), ground_truth, calculator
)
else:
output_metrics[metric_name] = get_output_from_calculator(
predictions, ground_truth, calculator
)

#### HERE WE NEED TO MODIFY TESTS - ROC IS RETURNING A TUPLE. WE MAY ALSO DISCARD IT ####
# what is the AUC metric telling us at all? Computing it for predictions and ground truth
# does not make sense
# metrics that do not have any "average" parameter
calculators = {
"auc": tm.AUC(reorder=True),
## weird error for multi-class problem, where pos_label is not getting set
# "roc": tm.ROC(num_classes=params["model"]["num_classes"]),
}
for metric_name, calculator in calculators.items():
output_metrics[metric_name] = get_output_from_calculator(
predictions, ground_truth, calculator
)
# calculators = {
#
# # "auc": tm.AUC(reorder=True),
# ## weird error for multi-class problem, where pos_label is not getting set
# "roc": tm.ROC(task=task, num_classes=params["model"]["num_classes"]),
# }
# for metric_name, calculator in calculators.items():
# if metric_name == "roc":
# one_hot_preds = one_hot(
# predictions.long(), num_classes=params["model"]["num_classes"]
# )
# output_metrics[metric_name] = get_output_from_calculator(
# one_hot_preds.float(), ground_truth, calculator
# )
# else:
# output_metrics[metric_name] = get_output_from_calculator(
# predictions, ground_truth, calculator
# )

return output_metrics
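A minimal usage sketch (not part of this PR) of the updated overall_stats, assuming a toy params dict that carries only the fields the function reads above; real configs contain many more keys, and the exact return values depend on get_output_from_calculator.

import torch
from GANDLF.metrics.classification import overall_stats

params = {"model": {"num_classes": 3}}     # assumed minimal config
predictions = torch.tensor([0, 2, 1, 2])   # predicted class indices
ground_truth = torch.tensor([0, 1, 1, 2])  # reference class indices

metrics = overall_stats(predictions, ground_truth, params)
# Expected keys include "accuracy", "precision", "recall", "f1",
# "specificity" and "aucroc"; note that the keys no longer embed the
# average type, so each pass over average_types_keys overwrites the last.
print(metrics)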
59 changes: 44 additions & 15 deletions GANDLF/metrics/generic.py
@@ -1,38 +1,60 @@
import torch
from torchmetrics import F1Score, Precision, Recall, JaccardIndex, Accuracy, Specificity
from torchmetrics import (
F1Score,
Precision,
Recall,
JaccardIndex,
Accuracy,
Specificity,
)
from GANDLF.utils.tensor import one_hot
from GANDLF.utils.generic import (
determine_task,
define_average_type_key,
define_multidim_average_type_key,
)


def generic_function_output_with_check(predicted_classes, label, metric_function):
def generic_function_output_with_check(
predicted_classes, label, metric_function
):
if torch.min(predicted_classes) < 0:
print(
"WARNING: Negative values detected in prediction, cannot compute torchmetrics calculations."
)
return torch.zeros((1), device=predicted_classes.device)
else:
# I need to do this with try-except, otherwise for binary problems it will
# raise an error, as the binary metrics do not have a .num_classes
# attribute.
# https://tinyurl.com/564rh9yp link to example from BinaryAccuracy.
try:
max_clamp_val = metric_function.num_classes - 1
except AttributeError:
max_clamp_val = 1
predicted_new = torch.clamp(
predicted_classes.cpu().int(), max=metric_function.num_classes - 1
predicted_classes.cpu().int(), max=max_clamp_val
)
predicted_new = predicted_new.reshape(label.shape)
return metric_function(predicted_new, label.cpu().int())


def generic_torchmetrics_score(output, label, metric_class, metric_key, params):
def generic_torchmetrics_score(
output, label, metric_class, metric_key, params
):
task = determine_task(params)
num_classes = params["model"]["num_classes"]
predicted_classes = output
if params["problem_type"] == "classification":
predicted_classes = torch.argmax(output, 1)
elif params["problem_type"] == "segmentation":
label = one_hot(label, params["model"]["class_list"])
else:
params["metrics"][metric_key]["multi_class"] = False
params["metrics"][metric_key]["mdmc_average"] = None
metric_function = metric_class(
average=params["metrics"][metric_key]["average"],
task=task,
num_classes=num_classes,
multiclass=params["metrics"][metric_key]["multi_class"],
mdmc_average=params["metrics"][metric_key]["mdmc_average"],
threshold=params["metrics"][metric_key]["threshold"],
average=define_average_type_key(params, metric_key),
multidim_average=define_multidim_average_type_key(params, metric_key),
)

return generic_function_output_with_check(
@@ -45,19 +67,25 @@ def recall_score(output, label, params):


def precision_score(output, label, params):
return generic_torchmetrics_score(output, label, Precision, "precision", params)
return generic_torchmetrics_score(
output, label, Precision, "precision", params
)


def f1_score(output, label, params):
return generic_torchmetrics_score(output, label, F1Score, "f1", params)


def accuracy(output, label, params):
return generic_torchmetrics_score(output, label, Accuracy, "accuracy", params)
return generic_torchmetrics_score(
output, label, Accuracy, "accuracy", params
)


def specificity_score(output, label, params):
return generic_torchmetrics_score(output, label, Specificity, "specificity", params)
return generic_torchmetrics_score(
output, label, Specificity, "specificity", params
)


def iou_score(output, label, params):
@@ -67,10 +95,11 @@ def iou_score(output, label, params):
predicted_classes = torch.argmax(output, 1)
elif params["problem_type"] == "segmentation":
label = one_hot(label, params["model"]["class_list"])

task = determine_task(params)
recall = JaccardIndex(
reduction=params["metrics"]["iou"]["reduction"],
task=task,
num_classes=num_classes,
average=define_average_type_key(params, "iou"),
threshold=params["metrics"]["iou"]["threshold"],
)

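For context, a small illustrative sketch (an assumption about intent, not code from this PR) of the torchmetrics 0.8 to 1.1 constructor change that generic_torchmetrics_score now accounts for: classification metrics are selected by an explicit task argument, and mdmc_average is replaced by multidim_average.

import torchmetrics as tm

# torchmetrics 0.8.x style (no longer accepted by 1.1.2):
# recall = tm.Recall(num_classes=4, average="macro", mdmc_average="global")

# torchmetrics 1.1.2 style, mirroring generic_torchmetrics_score above:
recall = tm.Recall(
    task="multiclass",          # "binary" when num_classes == 2 (determine_task)
    num_classes=4,
    average="macro",            # default used by define_average_type_key
    multidim_average="global",  # default used by define_multidim_average_type_key
)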
1 change: 1 addition & 0 deletions GANDLF/utils/__init__.py
@@ -50,6 +50,7 @@
suppress_stdout_stderr,
set_determinism,
print_and_format_metrics,
determine_task,
)

from .modelio import (
54 changes: 51 additions & 3 deletions GANDLF/utils/generic.py
@@ -6,6 +6,7 @@
import SimpleITK as sitk
from contextlib import contextmanager, redirect_stderr, redirect_stdout
from os import devnull
from typing import Dict, Any, Union


@contextmanager
@@ -48,6 +49,15 @@ def checkPatchDivisibility(patch_size, number=16):
return True


def determine_task(params: Dict[str, Union[Dict[str, Any], Any]]) -> str:
"""Determine the task (binary or multiclass) from the model config.
Args:
params (dict): The parameter dictionary containing training and data information.
"""
task = "binary" if params["model"]["num_classes"] == 2 else "multiclass"
return task


def get_date_time():
"""
Get a well-parsed date string
@@ -146,7 +156,10 @@ def checkPatchDimensions(patch_size, numlay):
patch_size_to_check = patch_size_to_check[:-1]

if all(
[x >= 2 ** (numlay + 1) and x % 2**numlay == 0 for x in patch_size_to_check]
[
x >= 2 ** (numlay + 1) and x % 2**numlay == 0
for x in patch_size_to_check
]
):
return numlay
else:
@@ -182,7 +195,9 @@ def get_array_from_image_or_tensor(input_tensor_or_image):
elif isinstance(input_tensor_or_image, np.ndarray):
return input_tensor_or_image
else:
raise ValueError("Input must be a torch.Tensor or sitk.Image or np.ndarray")
raise ValueError(
"Input must be a torch.Tensor or sitk.Image or np.ndarray"
)


def set_determinism(seed=42):
@@ -252,7 +267,9 @@ def __update_metric_from_list_to_single_string(input_metrics_dict) -> dict:
output_metrics_dict = deepcopy(cohort_level_metrics)
for metric in metrics_dict_from_parameters:
if isinstance(sample_level_metrics[metric], np.ndarray):
to_print = (sample_level_metrics[metric] / length_of_dataloader).tolist()
to_print = (
sample_level_metrics[metric] / length_of_dataloader
).tolist()
else:
to_print = sample_level_metrics[metric] / length_of_dataloader
output_metrics_dict[metric] = to_print
@@ -266,3 +283,34 @@
)

return output_metrics_dict


def define_average_type_key(
params: Dict[str, Union[Dict[str, Any], Any]], metric_name: str
):
"""Determine if the the 'average' filed is defined in the metric config.
If not, fallback to the default 'macro'
values.
Args:
params (dict): The parameter dictionary containing training and data information.
metric_name (str): The name of the metric.
Returns:
str: The average type key.
"""
average_type_key = params["metrics"][metric_name].get("average", "macro")
return average_type_key


def define_multidim_average_type_key(params, metric_name):
"""Determine if the the 'multidim_average' filed is defined in the metric config.
If not, fallback to the default 'global'.
Args:
params (dict): The parameter dictionary containing training and data information.
metric_name (str): The name of the metric.
Returns:
str: The multidim average type key.
"""
average_type_key = params["metrics"][metric_name].get(
"multidim_average", "global"
)
return average_type_key
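A short hypothetical walkthrough (not in the diff) of the three helpers added to GANDLF/utils/generic.py, using a hand-written params dict of the shape they expect.

from GANDLF.utils.generic import (
    determine_task,
    define_average_type_key,
    define_multidim_average_type_key,
)

params = {
    "model": {"num_classes": 2},
    "metrics": {"recall": {"threshold": 0.5}},  # no "average" key on purpose
}

print(determine_task(params))                              # "binary"
print(define_average_type_key(params, "recall"))           # "macro" (fallback)
print(define_multidim_average_type_key(params, "recall"))  # "global" (fallback)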
10 changes: 5 additions & 5 deletions samples/config_all_options.yaml
@@ -51,14 +51,14 @@ metrics:
# - hausdorff # hausdorff 100 percentile, segmentation
# - hausdorff95 # hausdorff 95 percentile, segmentation
# - mse # regression/classification
# - accuracy # classification
# - accuracy # classification ## more details https://lightning.ai/docs/torchmetrics/v1.1.2/classification/accuracy.html
# - classification_accuracy # classification
# - balanced_accuracy # classification ## more details https://scikit-learn.org/stable/modules/generated/sklearn.metrics.balanced_accuracy_score.html
# - per_label_accuracy # used for classification
# - f1 # classification/segmentation
# - precision # classification/segmentation ## more details https://torchmetrics.readthedocs.io/en/latest/references/modules.html#id3
# - recall # classification/segmentation ## more details https://torchmetrics.readthedocs.io/en/latest/references/modules.html#id4
# - iou # classification/segmentation ## more details https://torchmetrics.readthedocs.io/en/latest/references/modules.html#iou
# - f1 # classification/segmentation ## more details https://lightning.ai/docs/torchmetrics/v1.1.2/classification/f1_score.html
# - precision # classification/segmentation ## more details https://lightning.ai/docs/torchmetrics/v1.1.2/classification/precision.html
# - recall # classification/segmentation ## more details https://lightning.ai/docs/torchmetrics/v1.1.2/classification/recall.html
# - iou # classification/segmentation ## more details https://lightning.ai/docs/torchmetrics/v1.1.2/classification/jaccard_index.html
## this customizes the inference, primarily used for segmentation outputs
inference_mechanism: {
grid_aggregator_overlap: crop, # this option provides the option to strategize the grid aggregation output; should be either 'crop' or 'average' - https://torchio.readthedocs.io/patches/patch_inference.html#grid-aggregator
14 changes: 10 additions & 4 deletions setup.py
@@ -14,7 +14,9 @@
readme = readme_file.read()
except Exception as error:
readme = "No README information found."
sys.stderr.write("Warning: Could not open '%s' due %s\n" % ("README.md", error))
sys.stderr.write(
"Warning: Could not open '%s' due %s\n" % ("README.md", error)
)


class CustomInstallCommand(install):
@@ -39,7 +41,9 @@ def run(self):

except Exception as error:
__version__ = "0.0.1"
sys.stderr.write("Warning: Could not open '%s' due %s\n" % (filepath, error))
sys.stderr.write(
"Warning: Could not open '%s' due %s\n" % (filepath, error)
)

# Handle cases where specific files need to be bundled into the final package as installed via PyPI
dockerfiles = [
@@ -54,7 +58,9 @@ ]
]
setup_files = ["setup.py", ".dockerignore", "pyproject.toml", "MANIFEST.in"]
all_extra_files = dockerfiles + entrypoint_files + setup_files
all_extra_files_pathcorrected = [os.path.join("../", item) for item in all_extra_files]
all_extra_files_pathcorrected = [
os.path.join("../", item) for item in all_extra_files
]
# find_packages should only ever find these as subpackages of gandlf, not as top-level packages
# generate this dynamically?
# GANDLF.GANDLF is needed to prevent recursion madness in deployments
@@ -99,7 +105,7 @@ def run(self):
"psutil",
"medcam",
"opencv-python",
"torchmetrics==0.8.1",
"torchmetrics==1.1.2",
"zarr==2.10.3",
"pydicom",
"onnx",
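An optional post-install sanity check (an assumption, not part of the diff) that the environment matches the new pin and that the 1.x task-based constructors are available:

import torchmetrics as tm

assert tm.__version__ == "1.1.2", tm.__version__
tm.Accuracy(task="multiclass", num_classes=3, average="macro")  # `task` is new in the 1.x API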
2 changes: 1 addition & 1 deletion testing/config_classification.yaml
@@ -21,7 +21,7 @@ metrics:
- recall
- specificity
- iou: {
reduction: sum,
average: micro,
}

modality: rad