From 1ebf921a05fe4e6d70ea01733f4e884786a01335 Mon Sep 17 00:00:00 2001
From: Franklin <41602287+fcogidi@users.noreply.github.com>
Date: Fri, 2 Feb 2024 11:04:43 -0500
Subject: [PATCH 01/14] Add Matthews Correlation Coefficient (MCC) metric

---
 .../evaluate/metrics/experimental/__init__.py |   5 +
 .../experimental/functional/__init__.py       |   5 +
 .../functional/matthews_corr_coef.py          | 350 ++++++++++++++++
 .../experimental/matthews_corr_coef.py        | 187 +++++++++
 .../experimental/test_matthews_corr_coef.py   | 378 ++++++++++++++++++
 5 files changed, 925 insertions(+)
 create mode 100644 cyclops/evaluate/metrics/experimental/functional/matthews_corr_coef.py
 create mode 100644 cyclops/evaluate/metrics/experimental/matthews_corr_coef.py
 create mode 100644 tests/cyclops/evaluate/metrics/experimental/test_matthews_corr_coef.py

diff --git a/cyclops/evaluate/metrics/experimental/__init__.py b/cyclops/evaluate/metrics/experimental/__init__.py
index 3a5b9974a..51ab7a21e 100644
--- a/cyclops/evaluate/metrics/experimental/__init__.py
+++ b/cyclops/evaluate/metrics/experimental/__init__.py
@@ -29,6 +29,11 @@
 )
 from cyclops.evaluate.metrics.experimental.mae import MeanAbsoluteError
 from cyclops.evaluate.metrics.experimental.mape import MeanAbsolutePercentageError
+from cyclops.evaluate.metrics.experimental.matthews_corr_coef import (
+    BinaryMCC,
+    MulticlassMCC,
+    MultilabelMCC,
+)
 from cyclops.evaluate.metrics.experimental.metric_dict import MetricDict
 from cyclops.evaluate.metrics.experimental.mse import MeanSquaredError
 from cyclops.evaluate.metrics.experimental.negative_predictive_value import (
diff --git a/cyclops/evaluate/metrics/experimental/functional/__init__.py b/cyclops/evaluate/metrics/experimental/functional/__init__.py
index 1a2e5902b..56b7e825e 100644
--- a/cyclops/evaluate/metrics/experimental/functional/__init__.py
+++ b/cyclops/evaluate/metrics/experimental/functional/__init__.py
@@ -31,6 +31,11 @@
 from cyclops.evaluate.metrics.experimental.functional.mape import (
     mean_absolute_percentage_error,
 )
+from cyclops.evaluate.metrics.experimental.functional.matthews_corr_coef import (
+    binary_mcc,
+    multiclass_mcc,
+    multilabel_mcc,
+)
 from cyclops.evaluate.metrics.experimental.functional.mse import mean_squared_error
 from cyclops.evaluate.metrics.experimental.functional.negative_predictive_value import (
     binary_npv,
diff --git a/cyclops/evaluate/metrics/experimental/functional/matthews_corr_coef.py b/cyclops/evaluate/metrics/experimental/functional/matthews_corr_coef.py
new file mode 100644
index 000000000..5bb34375f
--- /dev/null
+++ b/cyclops/evaluate/metrics/experimental/functional/matthews_corr_coef.py
@@ -0,0 +1,350 @@
+"""Functional API for the matthews correlation coefficient (MCC) metric."""
+from typing import Optional, Tuple, Union
+
+import array_api_compat as apc
+
+from cyclops.evaluate.metrics.experimental.functional.confusion_matrix import (
+    _binary_confusion_matrix_compute,
+    _binary_confusion_matrix_format_arrays,
+    _binary_confusion_matrix_update_state,
+    _binary_confusion_matrix_validate_args,
+    _binary_confusion_matrix_validate_arrays,
+    _multiclass_confusion_matrix_format_arrays,
+    _multiclass_confusion_matrix_update_state,
+    _multiclass_confusion_matrix_validate_args,
+    _multiclass_confusion_matrix_validate_arrays,
+    _multilabel_confusion_matrix_compute,
+    _multilabel_confusion_matrix_format_arrays,
+    _multilabel_confusion_matrix_update_state,
+    _multilabel_confusion_matrix_validate_args,
+    _multilabel_confusion_matrix_validate_arrays,
+)
+from cyclops.evaluate.metrics.experimental.utils.types import Array
+
+
+def _mcc_reduce(confmat: Array) -> Array:
+    """Reduce an un-normalized confusion matrix into the matthews corrcoef."""
+    xp = apc.array_namespace(confmat)
+    # convert multilabel into binary
+    confmat = xp.sum(confmat, axis=0) if confmat.ndim == 3 else confmat
+
+    if int(apc.size(confmat) or 0) == 4:  # binary case
+        tn, fp, fn, tp = xp.reshape(xp.astype(confmat, xp.float32), (-1,))
+        if tp + tn != 0 and fp + fn == 0:  # all predictions correct
+            return xp.asarray(1.0, dtype=xp.float32, device=apc.device(confmat))  # type: ignore[no-any-return]
+
+        if tp + tn == 0 and fp + fn != 0:  # all predictions wrong
+            return xp.asarray(-1.0, dtype=xp.float32, device=apc.device(confmat))  # type: ignore[no-any-return]
+
+    tk = xp.sum(confmat, axis=-1, dtype=xp.float32)
+    pk = xp.sum(confmat, axis=-2, dtype=xp.float32)
+    c = xp.astype(xp.linalg.trace(confmat), xp.float32)
+    s = xp.sum(confmat, dtype=xp.float32)
+
+    cov_ytyp = c * s - xp.sum(tk * pk, dtype=xp.float32)
+    cov_ypyp = s**2 - xp.sum(pk * pk, dtype=xp.float32)
+    cov_ytyt = s**2 - xp.sum(tk * tk, dtype=xp.float32)
+    numerator = cov_ytyp
+    denom = cov_ypyp * cov_ytyt
+    # for a 2x2 matrix of counts, `denom == 0` means that a row or a column sums to
+    # zero, so both tests below hold and `a` and `b` are always assigned
+    if denom == 0 and int(apc.size(confmat) or 0) == 4:
+        if tp == 0 or tn == 0:
+            a = tp + tn
+
+        if fp == 0 or fn == 0:
+            b = fp + fn
+
+        eps = xp.asarray(
+            xp.finfo(xp.float32).eps,
+            dtype=xp.float32,
+            device=apc.device(confmat),
+        )
+        numerator = xp.sqrt(eps) * (a - b)
+        denom = (tp + fp + eps) * (tp + fn + eps) * (tn + fp + eps) * (tn + fn + eps)
+    elif denom == 0:
+        return xp.asarray(0.0, dtype=xp.float32, device=apc.device(confmat))  # type: ignore[no-any-return]
+    return numerator / xp.sqrt(denom)  # type: ignore[no-any-return]
+
+
+def binary_mcc(
+    target: Array,
+    preds: Array,
+    threshold: float = 0.5,
+    ignore_index: Optional[int] = None,
+) -> Array:
+    """Compute the matthews correlation coefficient for binary classification.
+
+    Parameters
+    ----------
+    target : Array
+        An array object that is compatible with the Python array API standard
+        and contains the ground truth labels. The expected shape of the array
+        is `(N, ...)`, where `N` is the number of samples.
+    preds : Array
+        An array object that is compatible with the Python array API standard and
+        contains the predictions of a binary classifier. The expected shape of the
+        array is `(N, ...)` where `N` is the number of samples. If `preds` contains
+        floating point values that are not in the range `[0, 1]`, a sigmoid function
+        will be applied to each value before thresholding.
+    threshold : float, default=0.5
+        The threshold to use when converting probabilities to binary predictions.
+    ignore_index : int, optional, default=None
+        Specifies a target value that is ignored and does not contribute to the
+        metric. If `None`, ignore nothing.
+
+    Returns
+    -------
+    Array
+        The matthews correlation coefficient, as a 0-D array with data type
+        `float32`. The value is 1 when every prediction is correct and -1 when
+        every prediction is wrong.
+
+    Raises
+    ------
+    ValueError
+        If `target` and `preds` have different shapes.
+    ValueError
+        If `target` and `preds` are not array-API-compatible.
+    ValueError
+        If `target` or `preds` are empty.
+    ValueError
+        If `target` or `preds` are not numeric arrays.
+    ValueError
+        If `threshold` is not a float in the [0,1] range.
+    ValueError
+        If `ignore_index` is not `None` or an integer.
+
+    Examples
+    --------
+    >>> import numpy.array_api as anp
+    >>> from cyclops.evaluate.metrics.experimental.functional import binary_mcc
+    >>> target = anp.asarray([0, 1, 0, 1, 0, 1])
+    >>> preds = anp.asarray([0, 0, 1, 1, 0, 1])
+    >>> binary_mcc(target, preds)
+    Array(0.33333334, dtype=float32)
+    >>> target = anp.asarray([0, 1, 0, 1, 0, 1])
+    >>> preds = anp.asarray([0.11, 0.22, 0.84, 0.73, 0.33, 0.92])
+    >>> binary_mcc(target, preds)
+    Array(0.33333334, dtype=float32)
+
+    """
+    _binary_confusion_matrix_validate_args(
+        threshold=threshold,
+        normalize=None,
+        ignore_index=ignore_index,
+    )
+    xp = _binary_confusion_matrix_validate_arrays(target, preds, ignore_index)
+
+    target, preds = _binary_confusion_matrix_format_arrays(
+        target,
+        preds,
+        threshold,
+        ignore_index,
+        xp=xp,
+    )
+    tn, fp, fn, tp = _binary_confusion_matrix_update_state(target, preds, xp=xp)
+
+    confmat = _binary_confusion_matrix_compute(tn, fp, fn, tp, normalize=None)
+    return _mcc_reduce(confmat)
+
+
+def multiclass_mcc(
+    target: Array,
+    preds: Array,
+    num_classes: int,
+    ignore_index: Optional[Union[int, Tuple[int]]] = None,
+) -> Array:
+    """Compute the matthews correlation coefficient for multiclass classification.
+
+    Parameters
+    ----------
+    target : Array
+        The target array of shape `(N, ...)`, where `N` is the number of samples.
+    preds : Array
+        The prediction array with shape `(N, ...)`, for integer inputs, or
+        `(N, C, ...)`, for float inputs, where `N` is the number of samples and
+        `C` is the number of classes.
+    num_classes : int
+        The number of classes.
+    ignore_index : int, Tuple[int], optional, default=None
+        Specifies a target value(s) that is ignored and does not contribute to the
+        metric. If `None`, ignore nothing.
+
+    Returns
+    -------
+    Array
+        The matthews correlation coefficient, as a 0-D array with data type
+        `float32`. A value of 1 indicates that `preds` agrees with `target` on
+        every sample that is not ignored.
+
+    Raises
+    ------
+    ValueError
+        If `target` and `preds` are not array-API-compatible.
+    ValueError
+        If `target` or `preds` are empty.
+    ValueError
+        If `target` or `preds` are not numeric arrays.
+    ValueError
+        If `num_classes` is not an integer larger than 1.
+    ValueError
+        If `ignore_index` is not `None`, an integer or a tuple of integers.
+    ValueError
+        If `preds` contains floats but `target` does not have one dimension less than
+        `preds`.
+    ValueError
+        If the second dimension of `preds` is not equal to `num_classes`.
+    ValueError
+        If when `target` has one dimension less than `preds`, the shape of `preds` is
+        not `(N, C, ...)` while the shape of `target` is `(N, ...)`.
+    ValueError
+        If when `target` and `preds` have the same number of dimensions, they
+        do not have the same shape.
+    RuntimeError
+        If `target` contains values that are not in the range [0, `num_classes`).
+
+    Examples
+    --------
+    >>> import numpy.array_api as anp
+    >>> from cyclops.evaluate.metrics.experimental.functional import multiclass_mcc
+    >>> target = anp.asarray([2, 1, 0, 0])
+    >>> preds = anp.asarray([2, 1, 0, 1])
+    >>> multiclass_mcc(target, preds, num_classes=3)
+    Array(0.7, dtype=float32)
+    >>> target = anp.asarray([2, 1, 0, 0])
+    >>> preds = anp.asarray([[0.16, 0.26, 0.58],
+    ...                     [0.22, 0.61, 0.17],
+    ...                     [0.71, 0.09, 0.20],
+    ...                     [0.05, 0.82, 0.13]])
+    >>> multiclass_mcc(target, preds, num_classes=3)
+    Array(0.7, dtype=float32)
+
+    """
+    _multiclass_confusion_matrix_validate_args(
+        num_classes,
+        normalize=None,
+        ignore_index=ignore_index,
+    )
+    xp = _multiclass_confusion_matrix_validate_arrays(
+        target,
+        preds,
+        num_classes,
+        ignore_index=ignore_index,
+    )
+
+    target, preds = _multiclass_confusion_matrix_format_arrays(
+        target,
+        preds,
+        ignore_index=ignore_index,
+        xp=xp,
+    )
+    confmat = _multiclass_confusion_matrix_update_state(
+        target,
+        preds,
+        num_classes,
+        xp=xp,
+    )
+    return _mcc_reduce(confmat)
+
+
+def multilabel_mcc(
+    target: Array,
+    preds: Array,
+    num_labels: int,
+    threshold: float = 0.5,
+    ignore_index: Optional[int] = None,
+) -> Array:
+    """Compute the matthews correlation coefficient for multilabel classification.
+
+    Parameters
+    ----------
+    target : Array
+        The target array of shape `(N, L, ...)`, where `N` is the number of samples
+        and `L` is the number of labels.
+    preds : Array
+        The prediction array of shape `(N, L, ...)`, where `N` is the number of
+        samples and `L` is the number of labels. If `preds` contains floats that
+        are not in the range [0,1], they will be converted to probabilities using
+        the sigmoid function.
+    num_labels : int
+        The number of labels.
+    threshold : float, default=0.5
+        The threshold to use for binarizing the predictions.
+    ignore_index : int, optional, default=None
+        Specifies a target value that is ignored and does not contribute to the
+        metric. If `None`, ignore nothing.
+
+    Returns
+    -------
+    Array
+        The matthews correlation coefficient, as a 0-D array with data type
+        `float32`. The per-label confusion matrices are summed into a single
+        binary confusion matrix before the coefficient is computed.
+
+    Raises
+    ------
+    ValueError
+        If `target` and `preds` are not array-API-compatible.
+    ValueError
+        If `target` or `preds` are empty.
+    ValueError
+        If `target` or `preds` are not numeric arrays.
+    ValueError
+        If `threshold` is not a float in the [0,1] range.
+    ValueError
+        If `ignore_index` is not `None` or a non-negative integer.
+    ValueError
+        If `num_labels` is not an integer larger than 1.
+    ValueError
+        If `target` and `preds` do not have the same shape.
+    ValueError
+        If the second dimension of `preds` is not equal to `num_labels`.
+    RuntimeError
+        If `target` contains values that are not in the range [0, 1].
+
+    Examples
+    --------
+    >>> import numpy.array_api as anp
+    >>> from cyclops.evaluate.metrics.experimental.functional import multilabel_mcc
+    >>> target = anp.asarray([[0, 1, 0], [1, 0, 1]])
+    >>> preds = anp.asarray([[0, 0, 1], [1, 0, 1]])
+    >>> multilabel_mcc(target, preds, num_labels=3)
+    Array(0.33333334, dtype=float32)
+    >>> target = anp.asarray([[0, 1, 0], [1, 0, 1]])
+    >>> preds = anp.asarray([[0.11, 0.22, 0.84], [0.73, 0.33, 0.92]])
+    >>> multilabel_mcc(target, preds, num_labels=3)
+    Array(0.33333334, dtype=float32)
+
+    """
+    _multilabel_confusion_matrix_validate_args(
+        num_labels,
+        threshold=threshold,
+        normalize=None,
+        ignore_index=ignore_index,
+    )
+    xp = _multilabel_confusion_matrix_validate_arrays(
+        target,
+        preds,
+        num_labels,
+        ignore_index=ignore_index,
+    )
+
+    target, preds = _multilabel_confusion_matrix_format_arrays(
+        target,
+        preds,
+        threshold=threshold,
+        ignore_index=ignore_index,
+        xp=xp,
+    )
+    tn, fp, fn, tp = _multilabel_confusion_matrix_update_state(target, preds, xp=xp)
+
+    confmat = _multilabel_confusion_matrix_compute(
+        tn,
+        fp,
+        fn,
+        tp,
+        num_labels,
+        normalize=None,
+    )
+    return _mcc_reduce(confmat)
diff --git a/cyclops/evaluate/metrics/experimental/matthews_corr_coef.py b/cyclops/evaluate/metrics/experimental/matthews_corr_coef.py
new file mode 100644
index 000000000..804f3a857
--- /dev/null
+++ b/cyclops/evaluate/metrics/experimental/matthews_corr_coef.py
@@ -0,0 +1,187 @@
+"""Matthews Correlation Coefficient (MCC) metric."""
+from typing import Any, Optional, Tuple, Union
+
+from cyclops.evaluate.metrics.experimental.confusion_matrix import (
+    BinaryConfusionMatrix,
+    MulticlassConfusionMatrix,
+    MultilabelConfusionMatrix,
+)
+from cyclops.evaluate.metrics.experimental.functional.confusion_matrix import (
+    _binary_confusion_matrix_compute,
+    _multilabel_confusion_matrix_compute,
+)
+from cyclops.evaluate.metrics.experimental.functional.matthews_corr_coef import (
+    _mcc_reduce,
+)
+from cyclops.evaluate.metrics.experimental.utils.types import Array
+
+
+class BinaryMCC(BinaryConfusionMatrix, registry_key="binary_mcc"):
+    """Compute the matthews correlation coefficient for binary classification.
+
+    Parameters
+    ----------
+    threshold : float, default=0.5
+        The threshold value to use when binarizing the inputs.
+    ignore_index : int, optional, default=None
+        Specifies a target value that is ignored and does not contribute to the
+        metric. If `None`, all values are used.
+    **kwargs : Any
+        Additional keyword arguments common to all metrics.
+
+    Examples
+    --------
+    >>> import numpy.array_api as anp
+    >>> from cyclops.evaluate.metrics.experimental import BinaryMCC
+    >>> target = anp.asarray([0, 1, 0, 1, 0, 1])
+    >>> preds = anp.asarray([0, 0, 1, 1, 0, 1])
+    >>> metric = BinaryMCC()
+    >>> metric(target, preds)
+    Array(0.33333334, dtype=float32)
+    >>> target = anp.asarray([0, 1, 0, 1, 0, 1])
+    >>> preds = anp.asarray([0.11, 0.22, 0.84, 0.73, 0.33, 0.92])
+    >>> metric = BinaryMCC()
+    >>> metric(target, preds)
+    Array(0.33333334, dtype=float32)
+
+    """
+
+    name: str = "Matthews Correlation Coefficient"
+
+    def __init__(
+        self,
+        threshold: float = 0.5,
+        ignore_index: Optional[int] = None,
+        **kwargs: Any,
+    ) -> None:
+        """Initialize the class."""
+        super().__init__(threshold, normalize=None, ignore_index=ignore_index, **kwargs)
+
+    def _compute_metric(self) -> Array:
+        """Compute the matthews correlation coefficient from the final state."""
+        tn, fp, fn, tp = self._final_state()
+        confmat = _binary_confusion_matrix_compute(
+            tp=tp,
+            fp=fp,
+            tn=tn,
+            fn=fn,
+            normalize=self.normalize,
+        )
+        return _mcc_reduce(confmat)
+
+
+class MulticlassMCC(MulticlassConfusionMatrix, registry_key="multiclass_mcc"):
+    """Compute the matthews correlation coefficient for multiclass classification.
+
+    Parameters
+    ----------
+    num_classes : int
+        The number of classes.
+    ignore_index : int, Tuple[int], optional, default=None
+        Specifies a target value(s) that is ignored and does not contribute to the
+        metric. If `None`, all values are used.
+    **kwargs : Any
+        Additional keyword arguments common to all metrics.
+
+    Examples
+    --------
+    >>> import numpy.array_api as anp
+    >>> from cyclops.evaluate.metrics.experimental import MulticlassMCC
+    >>> target = anp.asarray([2, 1, 0, 0])
+    >>> preds = anp.asarray([2, 1, 0, 1])
+    >>> metric = MulticlassMCC(num_classes=3)
+    >>> metric(target, preds)
+    Array(0.7, dtype=float32)
+    >>> target = anp.asarray([2, 1, 0, 0])
+    >>> preds = anp.asarray([[0.16, 0.26, 0.58],
+    ...                     [0.22, 0.61, 0.17],
+    ...                     [0.71, 0.09, 0.20],
+    ...                     [0.05, 0.82, 0.13]])
+    >>> metric = MulticlassMCC(num_classes=3)
+    >>> metric(target, preds)
+    Array(0.7, dtype=float32)
+    """
+
+    name: str = "Matthews Correlation Coefficient"
+
+    def __init__(
+        self,
+        num_classes: int,
+        ignore_index: Optional[Union[int, Tuple[int]]] = None,
+        **kwargs: Any,
+    ) -> None:
+        """Initialize the class."""
+        super().__init__(
+            num_classes=num_classes,
+            normalize=None,
+            ignore_index=ignore_index,
+            **kwargs,
+        )
+
+    def _compute_metric(self) -> Array:
+        """Compute the matthews correlation coefficient from the confusion matrix."""
+        return _mcc_reduce(self.confmat)  # type: ignore
+
+
+class MultilabelMCC(MultilabelConfusionMatrix, registry_key="multilabel_mcc"):
+    """Compute the matthews correlation coefficient for multilabel classification.
+
+    Parameters
+    ----------
+    num_labels : int
+        The number of labels.
+    threshold : float, default=0.5
+        The threshold value to use when binarizing the inputs.
+    ignore_index : int, optional, default=None
+        Specifies a target value that is ignored and does not contribute to the
+        metric. If `None`, all values are used.
+    **kwargs : Any
+        Additional keyword arguments common to all metrics.
+
+    Examples
+    --------
+    >>> import numpy.array_api as anp
+    >>> from cyclops.evaluate.metrics.experimental import MultilabelMCC
+    >>> target = anp.asarray([[0, 1, 0], [1, 0, 1]])
+    >>> preds = anp.asarray([[0, 0, 1], [1, 0, 1]])
+    >>> metric = MultilabelMCC(num_labels=3)
+    >>> metric(target, preds)
+    Array(0.33333334, dtype=float32)
+    >>> target = anp.asarray([[0, 1, 0], [1, 0, 1]])
+    >>> preds = anp.asarray([[0.11, 0.22, 0.84], [0.73, 0.33, 0.92]])
+    >>> metric = MultilabelMCC(num_labels=3)
+    >>> metric(target, preds)
+    Array(0.33333334, dtype=float32)
+
+    """
+
+    name: str = "Matthews Correlation Coefficient"
+
+    def __init__(
+        self,
+        num_labels: int,
+        threshold: float = 0.5,
+        ignore_index: Optional[int] = None,
+        **kwargs: Any,
+    ) -> None:
+        """Initialize the class."""
+        super().__init__(
+            num_labels=num_labels,
+            threshold=threshold,
+            normalize=None,
+            ignore_index=ignore_index,
+            **kwargs,
+        )
+
+    def _compute_metric(self) -> Array:
+        """Compute the matthews correlation coefficient from the final state."""
+        tn, fp, fn, tp = self._final_state()
+        confmat = _multilabel_confusion_matrix_compute(
+            tp=tp,
+            fp=fp,
+            tn=tn,
+            fn=fn,
+            num_labels=self.num_labels,
+            normalize=self.normalize,
+        )
+        return _mcc_reduce(confmat)
diff --git a/tests/cyclops/evaluate/metrics/experimental/test_matthews_corr_coef.py b/tests/cyclops/evaluate/metrics/experimental/test_matthews_corr_coef.py
new file mode 100644
index 000000000..92c8ca390
--- /dev/null
+++ b/tests/cyclops/evaluate/metrics/experimental/test_matthews_corr_coef.py
@@ -0,0 +1,378 @@
+"""Test matthews correlation coefficient metrics."""
+from functools import partial
+
+import array_api_compat as apc
+import array_api_compat.torch
+import numpy.array_api as anp
+import pytest
+import torch.utils.dlpack
+from torchmetrics.functional.classification import (
+    binary_matthews_corrcoef,
+    multiclass_matthews_corrcoef,
+    multilabel_matthews_corrcoef,
+)
+
+from cyclops.evaluate.metrics.experimental.functional.matthews_corr_coef import (
+    binary_mcc,
+    multiclass_mcc,
+    multilabel_mcc,
+)
+from cyclops.evaluate.metrics.experimental.matthews_corr_coef import (
+    BinaryMCC,
+    MulticlassMCC,
+    MultilabelMCC,
+)
+from cyclops.evaluate.metrics.experimental.utils.ops import to_int
+from cyclops.evaluate.metrics.experimental.utils.validation import is_floating_point
+
+from ..conftest import NUM_CLASSES, NUM_LABELS, THRESHOLD
+from .inputs import _binary_cases, _multiclass_cases, _multilabel_cases
+from .testers import MetricTester, _inject_ignore_index
+
+
+def _binary_mcc_reference(
+    target,
+    preds,
+    threshold,
+    ignore_index,
+) -> torch.Tensor:
+    """Compute the reference binary matthews corrcoef using torchmetrics."""
+    return binary_matthews_corrcoef(
+        torch.utils.dlpack.from_dlpack(preds),
+        torch.utils.dlpack.from_dlpack(target),
+        threshold=threshold,
+        ignore_index=ignore_index,
+    )
+
+
+class TestBinaryMCC(MetricTester):
+    """Test binary matthews correlation coefficient function and class."""
+
+    @pytest.mark.parametrize("inputs", _binary_cases(xp=anp))
+    @pytest.mark.parametrize("ignore_index", [None, 0, -1])
+    def test_binary_mcc_function_with_numpy_array_api_arrays(
+        self,
+        inputs,
+        ignore_index,
+    ) -> None:
+        """Test function for binary matthews corrcoef using numpy.array_api arrays."""
+        target, preds = inputs
+
+        if ignore_index is not None:
+            target = _inject_ignore_index(target, ignore_index)
+
+        self.run_metric_function_implementation_test(
+            target,
+            preds,
+            metric_function=binary_mcc,
+            metric_args={
+                "threshold": THRESHOLD,
+                "ignore_index": ignore_index,
+            },
+            reference_metric=partial(
+                _binary_mcc_reference,
+                threshold=THRESHOLD,
+                ignore_index=ignore_index,
+            ),
+        )
+
+    @pytest.mark.parametrize("inputs", _binary_cases(xp=anp))
+    @pytest.mark.parametrize("ignore_index", [None, 0, -1])
+    def test_binary_mcc_class_with_numpy_array_api_arrays(
+        self,
+        inputs,
+        ignore_index,
+    ) -> None:
+        """Test class for binary matthews corrcoef using numpy.array_api arrays."""
+        target, preds = inputs
+
+        if (
+            preds.ndim == 1
+            and is_floating_point(preds)
+            and not anp.all(to_int((preds >= 0)) * to_int((preds <= 1)))
+        ):
+            pytest.skip(
+                "When using 0-D logits, batch result will be different from local "
+                "result because the `sigmoid` operation may not be applied to each "
+                "batch (some values may be in [0, 1] and some may not).",
+            )
+
+        if ignore_index is not None:
+            target = _inject_ignore_index(target, ignore_index)
+
+        self.run_metric_class_implementation_test(
+            target,
+            preds,
+            metric_class=BinaryMCC,
+            metric_args={
+                "threshold": THRESHOLD,
+                "ignore_index": ignore_index,
+            },
+            reference_metric=partial(
+                _binary_mcc_reference,
+                threshold=THRESHOLD,
+                ignore_index=ignore_index,
+            ),
+        )
+
+    @pytest.mark.integration_test()  # machine for integration tests has GPU
+    @pytest.mark.parametrize("inputs", _binary_cases(xp=array_api_compat.torch))
+    @pytest.mark.parametrize("ignore_index", [None, 0, -1])
+    def test_binary_mcc_class_with_torch_tensors(
+        self,
+        inputs,
+        ignore_index,
+    ) -> None:
+        """Test class for binary matthews corrcoef using torch tensors."""
+        target, preds = inputs
+
+        if (
+            preds.ndim == 1
+            and is_floating_point(preds)
+            and not torch.all(to_int((preds >= 0)) * to_int((preds <= 1)))
+        ):
+            pytest.skip(
+                "When using 0-D logits, batch result will be different from local "
+                "result because the `sigmoid` operation may not be applied to each "
+                "batch (some values may be in [0, 1] and some may not).",
+            )
+
+        if ignore_index is not None:
+            target = _inject_ignore_index(target, ignore_index)
+
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        self.run_metric_class_implementation_test(
+            target,
+            preds,
+            metric_class=BinaryMCC,
+            metric_args={
+                "threshold": THRESHOLD,
+                "ignore_index": ignore_index,
+            },
+            reference_metric=partial(
+                _binary_mcc_reference,
+                threshold=THRESHOLD,
+                ignore_index=ignore_index,
+            ),
+            device=device,
+            use_device_for_ref=True,
+        )
+
+
+def _multiclass_mcc_reference(
+    target,
+    preds,
+    num_classes=NUM_CLASSES,
+    ignore_index=None,
+) -> torch.Tensor:
+    """Compute the reference multiclass matthews corrcoef using torchmetrics."""
+    if preds.ndim == 1 and is_floating_point(preds):
+        xp = apc.array_namespace(preds)
+        preds = xp.argmax(preds, axis=0)  # collapse 1-D float scores to one label
+
+    return multiclass_matthews_corrcoef(
+        torch.utils.dlpack.from_dlpack(preds),
+        torch.utils.dlpack.from_dlpack(target),
+        num_classes,
+        ignore_index=ignore_index,
+    )
+
+
+class TestMulticlassMCC(MetricTester):
+    """Test multiclass matthews correlation coefficient function and class."""
+
+    @pytest.mark.parametrize("inputs", _multiclass_cases(xp=anp))
+    @pytest.mark.parametrize("ignore_index", [None, 0, -1])
+    def test_multiclass_mcc_with_numpy_array_api_arrays(
+        self,
+        inputs,
+        ignore_index,
+    ) -> None:
+        """Test function for multiclass MCC using numpy.array_api arrays."""
+        target, preds = inputs
+
+        if ignore_index is not None:
+            target = _inject_ignore_index(target, ignore_index)
+
+        self.run_metric_function_implementation_test(
+            target,
+            preds,
+            metric_function=multiclass_mcc,
+            metric_args={
+                "num_classes": NUM_CLASSES,
+                "ignore_index": ignore_index,
+            },
+            reference_metric=partial(
+                _multiclass_mcc_reference,
+                ignore_index=ignore_index,
+            ),
+        )
+
+    @pytest.mark.parametrize("inputs", _multiclass_cases(xp=anp))
+    @pytest.mark.parametrize("ignore_index", [None, 1, -1])
+    def test_multiclass_mcc_class_with_numpy_array_api_arrays(
+        self,
+        inputs,
+        ignore_index,
+    ) -> None:
+        """Test class for multiclass MCC using numpy.array_api arrays."""
+        target, preds = inputs
+
+        if ignore_index is not None:
+            target = _inject_ignore_index(target, ignore_index)
+
+        self.run_metric_class_implementation_test(
+            target,
+            preds,
+            metric_class=MulticlassMCC,
+            reference_metric=partial(
+                _multiclass_mcc_reference,
+                ignore_index=ignore_index,
+            ),
+            metric_args={
+                "num_classes": NUM_CLASSES,
+                "ignore_index": ignore_index,
+            },
+        )
+
+    @pytest.mark.integration_test()  # machine for integration tests has GPU
+    @pytest.mark.parametrize("inputs", _multiclass_cases(xp=array_api_compat.torch))
+    @pytest.mark.parametrize("ignore_index", [None, 1, -1])
+    def test_multiclass_mcc_class_with_torch_tensors(
+        self,
+        inputs,
+        ignore_index,
+    ) -> None:
+        """Test class for multiclass MCC using torch tensors."""
+        target, preds = inputs
+
+        if ignore_index is not None:
+            target = _inject_ignore_index(target, ignore_index)
+
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        self.run_metric_class_implementation_test(
+            target,
+            preds,
+            metric_class=MulticlassMCC,
+            reference_metric=partial(
+                _multiclass_mcc_reference,
+                ignore_index=ignore_index,
+            ),
+            metric_args={
+                "num_classes": NUM_CLASSES,
+                "ignore_index": ignore_index,
+            },
+            device=device,
+            use_device_for_ref=True,
+        )
+
+
+def _multilabel_mcc_reference(
+    preds,
+    target,
+    threshold,
+    num_labels=NUM_LABELS,
+    ignore_index=None,
+) -> torch.Tensor:
+    """Compute the reference multilabel matthews corrcoef using torchmetrics."""
+    return multilabel_matthews_corrcoef(
+        torch.utils.dlpack.from_dlpack(preds),
+        torch.utils.dlpack.from_dlpack(target),
+        num_labels,
+        threshold=threshold,
+        ignore_index=ignore_index,
+    )
+
+
+class TestMultilabelMCC(MetricTester):
+    """Test multilabel matthews correlation coefficient function and class."""
+
+    @pytest.mark.parametrize("inputs", _multilabel_cases(xp=anp))
+    @pytest.mark.parametrize("ignore_index", [None, 0, -1])
+    def test_multilabel_mcc_with_numpy_array_api_arrays(
+        self,
+        inputs,
+        ignore_index,
+    ) -> None:
+        """Test function for multilabel MCC using numpy.array_api arrays."""
+        target, preds = inputs
+
+        if ignore_index is not None:
+            target = _inject_ignore_index(target, ignore_index)
+
+        self.run_metric_function_implementation_test(
+            target,
+            preds,
+            metric_function=multilabel_mcc,
+            reference_metric=partial(
+                _multilabel_mcc_reference,
+                threshold=THRESHOLD,
+                ignore_index=ignore_index,
+            ),
+            metric_args={
+                "threshold": THRESHOLD,
+                "num_labels": NUM_LABELS,
+                "ignore_index": ignore_index,
+            },
+        )
+
+    @pytest.mark.parametrize("inputs", _multilabel_cases(xp=anp))
+    @pytest.mark.parametrize("ignore_index", [None, 0, -1])
+    def test_multilabel_mcc_class_with_numpy_array_api_arrays(
+        self,
+        inputs,
+        ignore_index,
+    ) -> None:
+        """Test class for multilabel MCC using numpy.array_api arrays."""
+        target, preds = inputs
+
+        if ignore_index is not None:
+            target = _inject_ignore_index(target, ignore_index)
+
+        self.run_metric_class_implementation_test(
+            target,
+            preds,
+            metric_class=MultilabelMCC,
+            reference_metric=partial(
+                _multilabel_mcc_reference,
+                threshold=THRESHOLD,
+                ignore_index=ignore_index,
+            ),
+            metric_args={
+                "threshold": THRESHOLD,
+                "num_labels": NUM_LABELS,
+                "ignore_index": ignore_index,
+            },
+        )
+
+    @pytest.mark.integration_test()  # machine for integration tests has GPU
+    @pytest.mark.parametrize("inputs", _multilabel_cases(xp=array_api_compat.torch))
+    @pytest.mark.parametrize("ignore_index", [None, 0, -1])
+    def test_multilabel_mcc_class_with_torch_tensors(
+        self,
+        inputs,
+        ignore_index,
+    ) -> None:
+        """Test class for multilabel MCC using torch tensors."""
+        target, preds = inputs
+        if ignore_index is not None:
+            target = _inject_ignore_index(target, ignore_index)
+        self.run_metric_class_implementation_test(
+            target,
+            preds,
+            metric_class=MultilabelMCC,
+            reference_metric=partial(
+                _multilabel_mcc_reference,
+                threshold=THRESHOLD,
+                ignore_index=ignore_index,
+            ),
+            metric_args={
+                "threshold": THRESHOLD,
+                "num_labels": NUM_LABELS,
+                "ignore_index": ignore_index,
+            },
+            device="cuda" if torch.cuda.is_available() else "cpu",
+            use_device_for_ref=True,
+        )

From f6987e134c62dddab79dc998313ebad8d28a5474 Mon Sep 17 00:00:00 2001
From: Franklin <41602287+fcogidi@users.noreply.github.com>
Date: Fri, 2 Feb 2024 12:13:35 -0500
Subject: [PATCH 02/14] Reorder arguments in _multilabel_mcc_reference function

---
 .../evaluate/metrics/experimental/test_matthews_corr_coef.py    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/cyclops/evaluate/metrics/experimental/test_matthews_corr_coef.py b/tests/cyclops/evaluate/metrics/experimental/test_matthews_corr_coef.py
index 92c8ca390..831bababf 100644
--- a/tests/cyclops/evaluate/metrics/experimental/test_matthews_corr_coef.py
+++ b/tests/cyclops/evaluate/metrics/experimental/test_matthews_corr_coef.py
@@ -270,8 +270,8 @@ def test_multiclass_mcc_class_with_torch_tensors(
 
 
 def _multilabel_mcc_reference(
-    preds,
     target,
+    preds,
     threshold,
     num_labels=NUM_LABELS,
     ignore_index=None,

From 5614997efc8f3c976d8e26f24389c759a3ef9da2 Mon Sep 17 00:00:00 2001
From: Franklin <41602287+fcogidi@users.noreply.github.com>
Date: Wed, 7 Feb 2024 13:54:56 -0500
Subject: [PATCH 03/14] update linux version for code check

---
 .github/workflows/code_checks.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/code_checks.yml b/.github/workflows/code_checks.yml
index e26b716ff..8414a5b1f 100644
--- a/.github/workflows/code_checks.yml
+++ b/.github/workflows/code_checks.yml
@@ -24,7 +24,7 @@ on:
 
 jobs:
   run-code-check:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-20.04
     steps:
       - uses: actions/checkout@v3
       - name: Install poetry

From 81c7e7e7eab461ebb657dc97f76d6c2bbb7389b1 Mon Sep 17 00:00:00 2001
From: Franklin <41602287+fcogidi@users.noreply.github.com>
Date: Wed, 7 Feb 2024 14:12:54 -0500
Subject: [PATCH 04/14] revert ubuntu version change & update action versions

---
 .github/workflows/code_checks.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/code_checks.yml b/.github/workflows/code_checks.yml
index 8414a5b1f..242c2ae30 100644
--- a/.github/workflows/code_checks.yml
+++ b/.github/workflows/code_checks.yml
@@ -24,12 +24,12 @@ on:
 
 jobs:
   run-code-check:
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
       - name: Install poetry
         run: python3 -m pip install --upgrade pip && python3 -m pip install poetry
-      - uses: actions/setup-python@v4.7.1
+      - uses: actions/setup-python@v5.0.0
         with:
           python-version: '3.10'
           cache: 'poetry'

From be63d4a046becfb8e036a51bba70521863527758 Mon Sep 17 00:00:00 2001
From: Franklin <41602287+fcogidi@users.noreply.github.com>
Date: Wed, 7 Feb 2024 14:33:15 -0500
Subject: [PATCH 05/14] use torch instead of dlpack to convert tensors to numpy

---
 .../evaluate/metrics/experimental/testers.py        | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/tests/cyclops/evaluate/metrics/experimental/testers.py b/tests/cyclops/evaluate/metrics/experimental/testers.py
index 4ae8775dc..c5bc95291 100644
--- a/tests/cyclops/evaluate/metrics/experimental/testers.py
+++ b/tests/cyclops/evaluate/metrics/experimental/testers.py
@@ -1,9 +1,11 @@
 """Testers for metrics."""
+
 from functools import partial
 from typing import Any, Callable, Dict, Optional, Sequence, Type
 
 import array_api_compat as apc
 import numpy as np
+from array_api_compat.common._helpers import _is_torch_array
 
 from cyclops.evaluate.metrics.experimental.metric import Metric
 from cyclops.evaluate.metrics.experimental.utils.ops import clone, flatten
@@ -19,8 +21,15 @@ def _assert_allclose(
     """Recursively assert that two results are within a certain tolerance."""
     if apc.is_array_api_obj(cyclops_result) and apc.is_array_api_obj(ref_result):
         # move to cpu and convert to numpy
-        cyclops_result = np.from_dlpack(apc.to_device(cyclops_result, "cpu"))
-        ref_result = np.from_dlpack(apc.to_device(ref_result, "cpu"))
+        if _is_torch_array(cyclops_result):
+            cyclops_result = cyclops_result.cpu().numpy()
+        else:
+            cyclops_result = np.from_dlpack(apc.to_device(cyclops_result, "cpu"))
+
+        if _is_torch_array(ref_result):
+            ref_result = ref_result.cpu().numpy()
+        else:
+            ref_result = np.from_dlpack(apc.to_device(ref_result, "cpu"))
 
         np.testing.assert_allclose(
             cyclops_result,

From b6651f46a6b6ff71944142c66b8d76f8413e375a Mon Sep 17 00:00:00 2001
From: Franklin <41602287+fcogidi@users.noreply.github.com>
Date: Wed, 7 Feb 2024 15:08:00 -0500
Subject: [PATCH 06/14] update numpy conversion methods

---
 .../evaluate/metrics/experimental/testers.py  | 24 +++++++++++--------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/tests/cyclops/evaluate/metrics/experimental/testers.py b/tests/cyclops/evaluate/metrics/experimental/testers.py
index c5bc95291..4d58305d3 100644
--- a/tests/cyclops/evaluate/metrics/experimental/testers.py
+++ b/tests/cyclops/evaluate/metrics/experimental/testers.py
@@ -5,7 +5,6 @@
 
 import array_api_compat as apc
 import numpy as np
-from array_api_compat.common._helpers import _is_torch_array
 
 from cyclops.evaluate.metrics.experimental.metric import Metric
 from cyclops.evaluate.metrics.experimental.utils.ops import clone, flatten
@@ -21,15 +20,20 @@ def _assert_allclose(
     """Recursively assert that two results are within a certain tolerance."""
     if apc.is_array_api_obj(cyclops_result) and apc.is_array_api_obj(ref_result):
         # move to cpu and convert to numpy
-        if _is_torch_array(cyclops_result):
-            cyclops_result = cyclops_result.cpu().numpy()
-        else:
-            cyclops_result = np.from_dlpack(apc.to_device(cyclops_result, "cpu"))
-
-        if _is_torch_array(ref_result):
-            ref_result = ref_result.cpu().numpy()
-        else:
-            ref_result = np.from_dlpack(apc.to_device(ref_result, "cpu"))
+        cyclops_result = np.from_dlpack(
+            (
+                apc.to_device(cyclops_result, "cpu")
+                if apc.device(cyclops_result) != "cpu"
+                else cyclops_result
+            ),
+        )
+        ref_result = np.from_dlpack(
+            (
+                apc.to_device(ref_result, "cpu")
+                if apc.device(ref_result) != "cpu"
+                else ref_result
+            ),
+        )
 
         np.testing.assert_allclose(
             cyclops_result,

From 2834deb03815d425eb21c18499ecb830107d55fa Mon Sep 17 00:00:00 2001
From: Franklin <41602287+fcogidi@users.noreply.github.com>
Date: Wed, 7 Feb 2024 16:22:18 -0500
Subject: [PATCH 07/14] set dtype for multilabel test inputs

---
 .../evaluate/metrics/experimental/inputs.py   | 28 +++++++++++--------
 .../test_precision_recall_curve.py            | 27 ++++++++++++------
 .../evaluate/metrics/experimental/test_roc.py | 27 ++++++++++++------
 3 files changed, 52 insertions(+), 30 deletions(-)

diff --git a/tests/cyclops/evaluate/metrics/experimental/inputs.py b/tests/cyclops/evaluate/metrics/experimental/inputs.py
index 92af7b9e6..d38d1d852 100644
--- a/tests/cyclops/evaluate/metrics/experimental/inputs.py
+++ b/tests/cyclops/evaluate/metrics/experimental/inputs.py
@@ -1,4 +1,5 @@
 """Input data for tests of metrics in cyclops/evaluate/metrics/experimental."""
+
 import random
 from collections import namedtuple
 from types import ModuleType
@@ -296,43 +297,46 @@ def _multilabel_cases(*, xp: Any):
     return (
         pytest.param(
             InputSpec(
-                target=xp.asarray(_multilabel_labels),
-                preds=xp.asarray(_multilabel_preds),
+                target=xp.asarray(_multilabel_labels, dtype=xp.int32),
+                preds=xp.asarray(_multilabel_preds, dtype=xp.int32),
             ),
             id="input[2d-labels]",
         ),
         pytest.param(
             InputSpec(
-                target=xp.asarray(_multilabel_labels_multidim),
-                preds=xp.asarray(_multilabel_preds_multidim),
+                target=xp.asarray(_multilabel_labels_multidim, dtype=xp.int32),
+                preds=xp.asarray(_multilabel_preds_multidim, dtype=xp.int32),
             ),
             id="input[multidim-labels]",
         ),
         pytest.param(
             InputSpec(
-                target=xp.asarray(_multilabel_labels),
-                preds=xp.asarray(_multilabel_probs),
+                target=xp.asarray(_multilabel_labels, dtype=xp.int32),
+                preds=xp.asarray(_multilabel_probs, dtype=xp.float32),
             ),
             id="input[2d-probs]",
         ),
         pytest.param(
             InputSpec(
-                target=xp.asarray(_multilabel_labels),
-                preds=xp.asarray(_inv_sigmoid(_multilabel_probs)),
+                target=xp.asarray(_multilabel_labels, dtype=xp.int32),
+                preds=xp.asarray(_inv_sigmoid(_multilabel_probs), dtype=xp.float32),
             ),
             id="input[2d-logits]",
         ),
         pytest.param(
             InputSpec(
-                target=xp.asarray(_multilabel_labels_multidim),
-                preds=xp.asarray(_multilabel_probs_multidim),
+                target=xp.asarray(_multilabel_labels_multidim, dtype=xp.int32),
+                preds=xp.asarray(_multilabel_probs_multidim, dtype=xp.float32),
             ),
             id="input[multidim-probs]",
         ),
         pytest.param(
             InputSpec(
-                target=xp.asarray(_multilabel_labels_multidim),
-                preds=xp.asarray(_inv_sigmoid(_multilabel_probs_multidim)),
+                target=xp.asarray(_multilabel_labels_multidim, dtype=xp.int32),
+                preds=xp.asarray(
+                    _inv_sigmoid(_multilabel_probs_multidim),
+                    dtype=xp.float32,
+                ),
             ),
             id="input[multidim-logits]",
         ),
diff --git a/tests/cyclops/evaluate/metrics/experimental/test_precision_recall_curve.py b/tests/cyclops/evaluate/metrics/experimental/test_precision_recall_curve.py
index 4dc5989fd..081ebd1e9 100644
--- a/tests/cyclops/evaluate/metrics/experimental/test_precision_recall_curve.py
+++ b/tests/cyclops/evaluate/metrics/experimental/test_precision_recall_curve.py
@@ -1,4 +1,5 @@
 """Test precision-recall curve metric."""
+
 from functools import partial
 from typing import List, Tuple, Union
 
@@ -45,9 +46,11 @@ def _binary_precision_recall_curve_reference(
     return tm_binary_precision_recall_curve(
         torch.utils.dlpack.from_dlpack(preds),
         torch.utils.dlpack.from_dlpack(target),
-        thresholds=torch.utils.dlpack.from_dlpack(thresholds)
-        if apc.is_array_api_obj(thresholds)
-        else thresholds,
+        thresholds=(
+            torch.utils.dlpack.from_dlpack(thresholds)
+            if apc.is_array_api_obj(thresholds)
+            else thresholds
+        ),
         ignore_index=ignore_index,
     )
 
@@ -215,9 +218,11 @@ def _multiclass_precision_recall_curve_reference(
         torch.utils.dlpack.from_dlpack(preds),
         torch.utils.dlpack.from_dlpack(target),
         num_classes,
-        thresholds=torch.utils.dlpack.from_dlpack(thresholds)
-        if apc.is_array_api_obj(thresholds)
-        else thresholds,
+        thresholds=(
+            torch.utils.dlpack.from_dlpack(thresholds)
+            if apc.is_array_api_obj(thresholds)
+            else thresholds
+        ),
         ignore_index=ignore_index,
     )
 
@@ -371,9 +376,11 @@ def _multilabel_precision_recall_curve_reference(
         torch.utils.dlpack.from_dlpack(preds),
         torch.utils.dlpack.from_dlpack(target),
         num_labels,
-        thresholds=torch.utils.dlpack.from_dlpack(thresholds)
-        if apc.is_array_api_obj(thresholds)
-        else thresholds,
+        thresholds=(
+            torch.utils.dlpack.from_dlpack(thresholds)
+            if apc.is_array_api_obj(thresholds)
+            else thresholds
+        ),
         ignore_index=ignore_index,
     )
 
@@ -381,6 +388,8 @@ def _multilabel_precision_recall_curve_reference(
 class TestMultilabelPrecisionRecallCurve(MetricTester):
     """Test multilabel precision-recall curve function and class."""
 
+    atol: float = 2e-7
+
     @pytest.mark.parametrize("inputs", _multilabel_cases(xp=anp)[2:])
     @pytest.mark.parametrize("thresholds", _thresholds(xp=anp))
     @pytest.mark.parametrize("ignore_index", [None, 0, -1])
diff --git a/tests/cyclops/evaluate/metrics/experimental/test_roc.py b/tests/cyclops/evaluate/metrics/experimental/test_roc.py
index ddc4f9556..17a4fff5a 100644
--- a/tests/cyclops/evaluate/metrics/experimental/test_roc.py
+++ b/tests/cyclops/evaluate/metrics/experimental/test_roc.py
@@ -1,4 +1,5 @@
 """Test roc curve metric."""
+
 from functools import partial
 from typing import List, Tuple, Union
 
@@ -45,9 +46,11 @@ def _binary_roc_reference(
     return tm_binary_roc(
         torch.utils.dlpack.from_dlpack(preds),
         torch.utils.dlpack.from_dlpack(target),
-        thresholds=torch.utils.dlpack.from_dlpack(thresholds)
-        if apc.is_array_api_obj(thresholds)
-        else thresholds,
+        thresholds=(
+            torch.utils.dlpack.from_dlpack(thresholds)
+            if apc.is_array_api_obj(thresholds)
+            else thresholds
+        ),
         ignore_index=ignore_index,
     )
 
@@ -215,9 +218,11 @@ def _multiclass_roc_reference(
         torch.utils.dlpack.from_dlpack(preds),
         torch.utils.dlpack.from_dlpack(target),
         num_classes,
-        thresholds=torch.utils.dlpack.from_dlpack(thresholds)
-        if apc.is_array_api_obj(thresholds)
-        else thresholds,
+        thresholds=(
+            torch.utils.dlpack.from_dlpack(thresholds)
+            if apc.is_array_api_obj(thresholds)
+            else thresholds
+        ),
         ignore_index=ignore_index,
     )
 
@@ -371,9 +376,11 @@ def _multilabel_roc_reference(
         torch.utils.dlpack.from_dlpack(preds),
         torch.utils.dlpack.from_dlpack(target),
         num_labels,
-        thresholds=torch.utils.dlpack.from_dlpack(thresholds)
-        if apc.is_array_api_obj(thresholds)
-        else thresholds,
+        thresholds=(
+            torch.utils.dlpack.from_dlpack(thresholds)
+            if apc.is_array_api_obj(thresholds)
+            else thresholds
+        ),
         ignore_index=ignore_index,
     )
 
@@ -381,6 +388,8 @@ def _multilabel_roc_reference(
 class TestMultilabelROC(MetricTester):
     """Test multilabel roc curve function and class."""
 
+    atol: float = 9e-8
+
     @pytest.mark.parametrize("inputs", _multilabel_cases(xp=anp)[2:])
     @pytest.mark.parametrize("thresholds", _thresholds(xp=anp))
     @pytest.mark.parametrize("ignore_index", [None, 0, -1])

From 8c380c8105a1d0c82d8c679c56091ed097b9c9a9 Mon Sep 17 00:00:00 2001
From: Franklin <41602287+fcogidi@users.noreply.github.com>
Date: Mon, 12 Feb 2024 11:14:42 -0500
Subject: [PATCH 08/14] fix doctest error

---
 cyclops/utils/index.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/cyclops/utils/index.py b/cyclops/utils/index.py
index 2e3941a02..3b281ec7a 100644
--- a/cyclops/utils/index.py
+++ b/cyclops/utils/index.py
@@ -3,6 +3,7 @@
 from typing import Any, List, Optional, Sequence, Tuple, Union
 
 import numpy as np
+import numpy.typing as npt
 
 
 def index_axis(ind: int, axis: int, shape: Tuple[int, ...]) -> Tuple[Any, ...]:
@@ -33,9 +34,9 @@ def index_axis(ind: int, axis: int, shape: Tuple[int, ...]) -> Tuple[Any, ...]:
 
 
 def take_indices(
-    data: np.typing.NDArray[Any],
-    indexes: Sequence[Optional[Union[Sequence[int], np.typing.NDArray[Any]]]],
-) -> np.typing.NDArray[Any]:
+    data: npt.NDArray[Any],
+    indexes: Sequence[Optional[Union[Sequence[int], npt.NDArray[Any]]]],
+) -> npt.NDArray[Any]:
     """Index array by specifying the indices to take on each axis.
 
     Parameters
@@ -69,10 +70,10 @@ def take_indices(
 
 
 def take_indices_over_axis(
-    data: np.typing.NDArray[Any],
+    data: npt.NDArray[Any],
     axis: int,
-    index: Union[np.typing.NDArray[Any], Sequence[int]],
-) -> np.typing.NDArray[Any]:
+    index: Union[npt.NDArray[Any], Sequence[int]],
+) -> npt.NDArray[Any]:
     """Take indices along an axis.
 
     Parameters

From dfa89cf7bed9ee1c02b3b98cc288f6e586b281df Mon Sep 17 00:00:00 2001
From: Franklin <41602287+fcogidi@users.noreply.github.com>
Date: Fri, 16 Feb 2024 10:35:44 -0500
Subject: [PATCH 09/14] update implementation of multilabel confusion matrix

---
 .../metrics/experimental/confusion_matrix.py  | 31 +++++-----
 .../functional/confusion_matrix.py            | 56 +++++++++----------
 .../functional/matthews_corr_coef.py          | 25 ++++-----
 .../experimental/matthews_corr_coef.py        | 13 +----
 4 files changed, 59 insertions(+), 66 deletions(-)

diff --git a/cyclops/evaluate/metrics/experimental/confusion_matrix.py b/cyclops/evaluate/metrics/experimental/confusion_matrix.py
index 744ea1720..b1e19d27f 100644
--- a/cyclops/evaluate/metrics/experimental/confusion_matrix.py
+++ b/cyclops/evaluate/metrics/experimental/confusion_matrix.py
@@ -1,4 +1,5 @@
 """Confusion matrix."""
+
 from types import ModuleType
 from typing import Any, Optional, Tuple, Union
 
@@ -276,10 +277,7 @@ def _compute_metric(self) -> Array:
         )
 
 
-class MultilabelConfusionMatrix(
-    _AbstractConfusionMatrix,
-    registry_key="multilabel_confusion_matrix",
-):
+class MultilabelConfusionMatrix(Metric, registry_key="multilabel_confusion_matrix"):
     """Confusion matrix for multilabel classification tasks.
 
     Parameters
@@ -329,6 +327,8 @@ class MultilabelConfusionMatrix(
 
     """
 
+    name: str = "Confusion Matrix"
+
     def __init__(
         self,
         num_labels: int,
@@ -352,7 +352,11 @@ def __init__(
         self.normalize = normalize
         self.ignore_index = ignore_index
 
-        self._create_state(size=num_labels)
+        self.add_state_default_factory(
+            "confmat",
+            lambda xp: xp.zeros((num_labels, 2, 2), dtype=xp.int64, device=self.device),  # type: ignore
+            dist_reduce_fn="sum",
+        )
 
     def _update_state(self, target: Array, preds: Array) -> None:
         """Update the state variables."""
@@ -365,21 +369,22 @@ def _update_state(self, target: Array, preds: Array) -> None:
         target, preds = _multilabel_confusion_matrix_format_arrays(
             target,
             preds,
+            self.num_labels,
             threshold=self.threshold,
             ignore_index=self.ignore_index,
             xp=xp,
         )
-        tn, fp, fn, tp = _multilabel_confusion_matrix_update_state(target, preds, xp=xp)
-        self._update_stat_scores(tn=tn, fp=fp, fn=fn, tp=tp)
+        confmat = _multilabel_confusion_matrix_update_state(
+            target,
+            preds,
+            self.num_labels,
+            xp=xp,
+        )
+        self.confmat += confmat  # type: ignore
 
     def _compute_metric(self) -> Array:
         """Compute the confusion matrix."""
-        tn, fp, fn, tp = self._final_state()
         return _multilabel_confusion_matrix_compute(
-            tp=tp,
-            fp=fp,
-            tn=tn,
-            fn=fn,
-            num_labels=self.num_labels,
+            self.confmat,  # type: ignore
             normalize=self.normalize,
         )
diff --git a/cyclops/evaluate/metrics/experimental/functional/confusion_matrix.py b/cyclops/evaluate/metrics/experimental/functional/confusion_matrix.py
index cfc462269..fa2766f5a 100644
--- a/cyclops/evaluate/metrics/experimental/functional/confusion_matrix.py
+++ b/cyclops/evaluate/metrics/experimental/functional/confusion_matrix.py
@@ -1,4 +1,5 @@
 """Functions for computing the confusion matrix for classification tasks."""
+
 # mypy: disable-error-code="no-any-return"
 from types import ModuleType
 from typing import Literal, Optional, Tuple, Union
@@ -9,6 +10,7 @@
     bincount,
     clone,
     flatten,
+    moveaxis,
     remove_ignore_index,
     safe_divide,
     sigmoid,
@@ -599,6 +601,7 @@ def _multilabel_confusion_matrix_validate_arrays(
 def _multilabel_confusion_matrix_format_arrays(
     target: Array,
     preds: Array,
+    num_labels: int,
     threshold: float = 0.5,
     ignore_index: Optional[int] = None,
     *,
@@ -613,13 +616,15 @@ def _multilabel_confusion_matrix_format_arrays(
             preds = sigmoid(preds)  # convert logits to probabilities
         preds = to_int(preds > threshold)
 
-    preds = xp.reshape(preds, shape=(*preds.shape[:2], -1))
-    target = xp.reshape(target, shape=(*target.shape[:2], -1))
+    preds = xp.reshape(moveaxis(preds, 1, -1), shape=(-1, num_labels))
+    target = xp.reshape(moveaxis(target, 1, -1), shape=(-1, num_labels))
 
     if ignore_index is not None:
-        idx = target == ignore_index
         target = clone(target)
-        target[idx] = -1
+        preds = clone(preds)
+        idx = target == ignore_index
+        target[idx] = -4 * num_labels
+        preds[idx] = -4 * num_labels
 
     return target, preds
 
@@ -627,34 +632,25 @@ def _multilabel_confusion_matrix_format_arrays(
 def _multilabel_confusion_matrix_update_state(
     target: Array,
     preds: Array,
+    num_labels: int,
     *,
     xp: ModuleType,
-) -> Tuple[Array, Array, Array, Array]:
+) -> Array:
     """Compute the statistics for the given `target` and `preds` arrays."""
-    sum_axis = (0, -1)
-    tp = squeeze_all(xp.sum(to_int((target == preds) & (target == 1)), axis=sum_axis))
-    fn = squeeze_all(xp.sum(to_int((target != preds) & (target == 1)), axis=sum_axis))
-    fp = squeeze_all(xp.sum(to_int((target != preds) & (target == 0)), axis=sum_axis))
-    tn = squeeze_all(xp.sum(to_int((target == preds) & (target == 0)), axis=sum_axis))
-
-    return tn, fp, fn, tp
+    unique_mapping = (2 * target + preds) + 4 * flatten(
+        xp.arange(num_labels, device=apc.device(preds)),
+    )
+    unique_mapping = unique_mapping[unique_mapping >= 0]
+    bins = bincount(unique_mapping, minlength=4 * num_labels)
+    return xp.reshape(bins, shape=(num_labels, 2, 2))
 
 
 def _multilabel_confusion_matrix_compute(
-    tn: Array,
-    fp: Array,
-    fn: Array,
-    tp: Array,
-    num_labels: int,
+    confmat: Array,
     normalize: Optional[str] = None,
 ) -> Array:
     """Compute the confusion matrix from the given stat scores."""
-    xp = apc.array_namespace(tn, fp, fn, tp)
-
-    confmat = squeeze_all(
-        xp.reshape(xp.stack([tn, fp, fn, tp], axis=-1), shape=(-1, num_labels, 2, 2)),
-    )
-
+    xp = apc.array_namespace(confmat)
     return _normalize_confusion_matrix(confmat, normalize=normalize, xp=xp)
 
 
@@ -768,17 +764,19 @@ class over the number of true samples for each class.
     target, preds = _multilabel_confusion_matrix_format_arrays(
         target,
         preds,
+        num_labels,
         threshold=threshold,
         ignore_index=ignore_index,
         xp=xp,
     )
-    tn, fp, fn, tp = _multilabel_confusion_matrix_update_state(target, preds, xp=xp)
+    confmat = _multilabel_confusion_matrix_update_state(
+        target,
+        preds,
+        num_labels,
+        xp=xp,
+    )
 
     return _multilabel_confusion_matrix_compute(
-        tn,
-        fp,
-        fn,
-        tp,
-        num_labels,
+        confmat,
         normalize=normalize,
     )
diff --git a/cyclops/evaluate/metrics/experimental/functional/matthews_corr_coef.py b/cyclops/evaluate/metrics/experimental/functional/matthews_corr_coef.py
index 5bb34375f..4bb9cb927 100644
--- a/cyclops/evaluate/metrics/experimental/functional/matthews_corr_coef.py
+++ b/cyclops/evaluate/metrics/experimental/functional/matthews_corr_coef.py
@@ -1,4 +1,5 @@
 """Functional API for the matthews correlation coefficient (MCC) metric."""
+
 from typing import Optional, Tuple, Union
 
 import array_api_compat as apc
@@ -13,7 +14,6 @@
     _multiclass_confusion_matrix_update_state,
     _multiclass_confusion_matrix_validate_args,
     _multiclass_confusion_matrix_validate_arrays,
-    _multilabel_confusion_matrix_compute,
     _multilabel_confusion_matrix_format_arrays,
     _multilabel_confusion_matrix_update_state,
     _multilabel_confusion_matrix_validate_args,
@@ -25,6 +25,7 @@
 def _mcc_reduce(confmat: Array) -> Array:
     """Reduce an un-normalized confusion matrix into the matthews corrcoef."""
     xp = apc.array_namespace(confmat)
+
     # convert multilabel into binary
     confmat = xp.sum(confmat, axis=0) if confmat.ndim == 3 else confmat
 
@@ -36,10 +37,10 @@ def _mcc_reduce(confmat: Array) -> Array:
         if tp + tn == 0 and fp + fn != 0:
             return xp.asarray(-1.0, dtype=xp.float32, device=apc.device(confmat))  # type: ignore[no-any-return]
 
-    tk = xp.sum(confmat, axis=-1, dtype=xp.float32)
-    pk = xp.sum(confmat, axis=-2, dtype=xp.float32)
-    c = xp.astype(xp.linalg.trace(confmat), xp.float32)
-    s = xp.sum(confmat, dtype=xp.float32)
+    tk = xp.sum(confmat, axis=-1, dtype=xp.float32)  # tn + fp and tp + fn
+    pk = xp.sum(confmat, axis=-2, dtype=xp.float32)  # tn + fn and tp + fp
+    c = xp.astype(xp.linalg.trace(confmat), xp.float32)  # tn and tp
+    s = xp.sum(confmat, dtype=xp.float32)  # tn + tp + fn + fp
 
     cov_ytyp = c * s - sum(tk * pk)
     cov_ypyp = s**2 - sum(pk * pk)
@@ -333,18 +334,16 @@ def multilabel_mcc(
     target, preds = _multilabel_confusion_matrix_format_arrays(
         target,
         preds,
+        num_labels,
         threshold=threshold,
         ignore_index=ignore_index,
         xp=xp,
     )
-    tn, fp, fn, tp = _multilabel_confusion_matrix_update_state(target, preds, xp=xp)
-
-    confmat = _multilabel_confusion_matrix_compute(
-        tn,
-        fp,
-        fn,
-        tp,
+    confmat = _multilabel_confusion_matrix_update_state(
+        target,
+        preds,
         num_labels,
-        normalize=None,
+        xp=xp,
     )
+
     return _mcc_reduce(confmat)
diff --git a/cyclops/evaluate/metrics/experimental/matthews_corr_coef.py b/cyclops/evaluate/metrics/experimental/matthews_corr_coef.py
index 804f3a857..bbfc4856e 100644
--- a/cyclops/evaluate/metrics/experimental/matthews_corr_coef.py
+++ b/cyclops/evaluate/metrics/experimental/matthews_corr_coef.py
@@ -1,4 +1,5 @@
 """Matthews Correlation Coefficient (MCC) metric."""
+
 from typing import Any, Optional, Tuple, Union
 
 from cyclops.evaluate.metrics.experimental.confusion_matrix import (
@@ -8,7 +9,6 @@
 )
 from cyclops.evaluate.metrics.experimental.functional.confusion_matrix import (
     _binary_confusion_matrix_compute,
-    _multilabel_confusion_matrix_compute,
 )
 from cyclops.evaluate.metrics.experimental.functional.matthews_corr_coef import (
     _mcc_reduce,
@@ -175,13 +175,4 @@ def __init__(
 
     def _compute_metric(self) -> Array:
         """Compute the confusion matrix."""
-        tn, fp, fn, tp = self._final_state()
-        confmat = _multilabel_confusion_matrix_compute(
-            tp=tp,
-            fp=fp,
-            tn=tn,
-            fn=fn,
-            num_labels=self.num_labels,
-            normalize=self.normalize,
-        )
-        return _mcc_reduce(confmat)
+        return _mcc_reduce(self.confmat)  # type: ignore

From 2612934d59705d3bd27f23652c6248db405173f7 Mon Sep 17 00:00:00 2001
From: Franklin <41602287+fcogidi@users.noreply.github.com>
Date: Fri, 16 Feb 2024 12:10:13 -0500
Subject: [PATCH 10/14] revert implementation update & add print statements for
 debugging

---
 .../metrics/experimental/confusion_matrix.py  | 30 +++++-----
 .../functional/confusion_matrix.py            | 55 ++++++++++---------
 .../functional/matthews_corr_coef.py          | 30 +++++++---
 .../experimental/matthews_corr_coef.py        | 12 +++-
 4 files changed, 76 insertions(+), 51 deletions(-)

diff --git a/cyclops/evaluate/metrics/experimental/confusion_matrix.py b/cyclops/evaluate/metrics/experimental/confusion_matrix.py
index b1e19d27f..d2623e5e4 100644
--- a/cyclops/evaluate/metrics/experimental/confusion_matrix.py
+++ b/cyclops/evaluate/metrics/experimental/confusion_matrix.py
@@ -277,7 +277,10 @@ def _compute_metric(self) -> Array:
         )
 
 
-class MultilabelConfusionMatrix(Metric, registry_key="multilabel_confusion_matrix"):
+class MultilabelConfusionMatrix(
+    _AbstractConfusionMatrix,
+    registry_key="multilabel_confusion_matrix",
+):
     """Confusion matrix for multilabel classification tasks.
 
     Parameters
@@ -327,8 +330,6 @@ class MultilabelConfusionMatrix(Metric, registry_key="multilabel_confusion_matri
 
     """
 
-    name: str = "Confusion Matrix"
-
     def __init__(
         self,
         num_labels: int,
@@ -352,11 +353,7 @@ def __init__(
         self.normalize = normalize
         self.ignore_index = ignore_index
 
-        self.add_state_default_factory(
-            "confmat",
-            lambda xp: xp.zeros((num_labels, 2, 2), dtype=xp.int64, device=self.device),  # type: ignore
-            dist_reduce_fn="sum",
-        )
+        self._create_state(size=num_labels)
 
     def _update_state(self, target: Array, preds: Array) -> None:
         """Update the state variables."""
@@ -369,22 +366,21 @@ def _update_state(self, target: Array, preds: Array) -> None:
         target, preds = _multilabel_confusion_matrix_format_arrays(
             target,
             preds,
-            self.num_labels,
             threshold=self.threshold,
             ignore_index=self.ignore_index,
             xp=xp,
         )
-        confmat = _multilabel_confusion_matrix_update_state(
-            target,
-            preds,
-            self.num_labels,
-            xp=xp,
-        )
-        self.confmat += confmat  # type: ignore
+        tn, fp, fn, tp = _multilabel_confusion_matrix_update_state(target, preds, xp=xp)
+        self._update_stat_scores(tn=tn, fp=fp, fn=fn, tp=tp)
 
     def _compute_metric(self) -> Array:
         """Compute the confusion matrix."""
+        tn, fp, fn, tp = self._final_state()
         return _multilabel_confusion_matrix_compute(
-            self.confmat,  # type: ignore
+            tp=tp,
+            fp=fp,
+            tn=tn,
+            fn=fn,
+            num_labels=self.num_labels,
             normalize=self.normalize,
         )
diff --git a/cyclops/evaluate/metrics/experimental/functional/confusion_matrix.py b/cyclops/evaluate/metrics/experimental/functional/confusion_matrix.py
index fa2766f5a..14d26d6a0 100644
--- a/cyclops/evaluate/metrics/experimental/functional/confusion_matrix.py
+++ b/cyclops/evaluate/metrics/experimental/functional/confusion_matrix.py
@@ -10,7 +10,6 @@
     bincount,
     clone,
     flatten,
-    moveaxis,
     remove_ignore_index,
     safe_divide,
     sigmoid,
@@ -601,7 +600,6 @@ def _multilabel_confusion_matrix_validate_arrays(
 def _multilabel_confusion_matrix_format_arrays(
     target: Array,
     preds: Array,
-    num_labels: int,
     threshold: float = 0.5,
     ignore_index: Optional[int] = None,
     *,
@@ -616,15 +614,13 @@ def _multilabel_confusion_matrix_format_arrays(
             preds = sigmoid(preds)  # convert logits to probabilities
         preds = to_int(preds > threshold)
 
-    preds = xp.reshape(moveaxis(preds, 1, -1), shape=(-1, num_labels))
-    target = xp.reshape(moveaxis(target, 1, -1), shape=(-1, num_labels))
+    preds = xp.reshape(preds, shape=(*preds.shape[:2], -1))
+    target = xp.reshape(target, shape=(*target.shape[:2], -1))
 
     if ignore_index is not None:
-        target = clone(target)
-        preds = clone(preds)
         idx = target == ignore_index
-        target[idx] = -4 * num_labels
-        preds[idx] = -4 * num_labels
+        target = clone(target)
+        target[idx] = -1
 
     return target, preds
 
@@ -632,25 +628,34 @@ def _multilabel_confusion_matrix_format_arrays(
 def _multilabel_confusion_matrix_update_state(
     target: Array,
     preds: Array,
-    num_labels: int,
     *,
     xp: ModuleType,
-) -> Array:
+) -> Tuple[Array, Array, Array, Array]:
     """Compute the statistics for the given `target` and `preds` arrays."""
-    unique_mapping = (2 * target + preds) + 4 * flatten(
-        xp.arange(num_labels, device=apc.device(preds)),
-    )
-    unique_mapping = unique_mapping[unique_mapping >= 0]
-    bins = bincount(unique_mapping, minlength=4 * num_labels)
-    return xp.reshape(bins, shape=(num_labels, 2, 2))
+    sum_axis = (0, -1)
+    tp = squeeze_all(xp.sum(to_int((target == preds) & (target == 1)), axis=sum_axis))
+    fn = squeeze_all(xp.sum(to_int((target != preds) & (target == 1)), axis=sum_axis))
+    fp = squeeze_all(xp.sum(to_int((target != preds) & (target == 0)), axis=sum_axis))
+    tn = squeeze_all(xp.sum(to_int((target == preds) & (target == 0)), axis=sum_axis))
+
+    return tn, fp, fn, tp
 
 
 def _multilabel_confusion_matrix_compute(
-    confmat: Array,
+    tn: Array,
+    fp: Array,
+    fn: Array,
+    tp: Array,
+    num_labels: int,
     normalize: Optional[str] = None,
 ) -> Array:
     """Compute the confusion matrix from the given stat scores."""
-    xp = apc.array_namespace(confmat)
+    xp = apc.array_namespace(tn, fp, fn, tp)
+
+    confmat = squeeze_all(
+        xp.reshape(xp.stack([tn, fp, fn, tp], axis=-1), shape=(-1, num_labels, 2, 2)),
+    )
+
     return _normalize_confusion_matrix(confmat, normalize=normalize, xp=xp)
 
 
@@ -764,19 +769,17 @@ class over the number of true samples for each class.
     target, preds = _multilabel_confusion_matrix_format_arrays(
         target,
         preds,
-        num_labels,
         threshold=threshold,
         ignore_index=ignore_index,
         xp=xp,
     )
-    confmat = _multilabel_confusion_matrix_update_state(
-        target,
-        preds,
-        num_labels,
-        xp=xp,
-    )
+    tn, fp, fn, tp = _multilabel_confusion_matrix_update_state(target, preds, xp=xp)
 
     return _multilabel_confusion_matrix_compute(
-        confmat,
+        tn,
+        fp,
+        fn,
+        tp,
+        num_labels,
         normalize=normalize,
     )
diff --git a/cyclops/evaluate/metrics/experimental/functional/matthews_corr_coef.py b/cyclops/evaluate/metrics/experimental/functional/matthews_corr_coef.py
index 4bb9cb927..80e358d02 100644
--- a/cyclops/evaluate/metrics/experimental/functional/matthews_corr_coef.py
+++ b/cyclops/evaluate/metrics/experimental/functional/matthews_corr_coef.py
@@ -14,6 +14,7 @@
     _multiclass_confusion_matrix_update_state,
     _multiclass_confusion_matrix_validate_args,
     _multiclass_confusion_matrix_validate_arrays,
+    _multilabel_confusion_matrix_compute,
     _multilabel_confusion_matrix_format_arrays,
     _multilabel_confusion_matrix_update_state,
     _multilabel_confusion_matrix_validate_args,
@@ -25,9 +26,10 @@
 def _mcc_reduce(confmat: Array) -> Array:
     """Reduce an un-normalized confusion matrix into the matthews corrcoef."""
     xp = apc.array_namespace(confmat)
-
     # convert multilabel into binary
     confmat = xp.sum(confmat, axis=0) if confmat.ndim == 3 else confmat
+    print("confmat: ", confmat)
+    print("numel: ", apc.size(confmat))
 
     if int(apc.size(confmat) or 0) == 4:  # binary case
         tn, fp, fn, tp = xp.reshape(xp.astype(confmat, xp.float32), (-1,))
@@ -38,16 +40,25 @@ def _mcc_reduce(confmat: Array) -> Array:
             return xp.asarray(-1.0, dtype=xp.float32, device=apc.device(confmat))  # type: ignore[no-any-return]
 
     tk = xp.sum(confmat, axis=-1, dtype=xp.float32)  # tn + fp and tp + fn
+    print("tk: ", tk)
     pk = xp.sum(confmat, axis=-2, dtype=xp.float32)  # tn + fn and tp + fp
+    print("pk: ", pk)
     c = xp.astype(xp.linalg.trace(confmat), xp.float32)  # tn and tp
+    print("c: ", c)
     s = xp.sum(confmat, dtype=xp.float32)  # tn + tp + fn + fp
+    print("s: ", s)
 
     cov_ytyp = c * s - sum(tk * pk)
+    print("cov_ytyp: ", cov_ytyp)
     cov_ypyp = s**2 - sum(pk * pk)
+    print("cov_ypyp: ", cov_ypyp)
     cov_ytyt = s**2 - sum(tk * tk)
+    print("cov_ytyt: ", cov_ytyt)
 
     numerator = cov_ytyp
+    print("numerator: ", numerator)
     denom = cov_ypyp * cov_ytyt
+    print("denom: ", denom)
 
     if denom == 0 and int(apc.size(confmat) or 0) == 4:
         if tp == 0 or tn == 0:
@@ -61,8 +72,11 @@ def _mcc_reduce(confmat: Array) -> Array:
             dtype=xp.float32,
             device=apc.device(confmat),
         )
+        print("eps: ", eps)
         numerator = xp.sqrt(eps) * (a - b)
+        print("numerator: ", numerator)
         denom = (tp + fp + eps) * (tp + fn + eps) * (tn + fp + eps) * (tn + fn + eps)
+        print("denom: ", denom)
     elif denom == 0:
         return xp.asarray(0.0, dtype=xp.float32, device=apc.device(confmat))  # type: ignore[no-any-return]
     return numerator / xp.sqrt(denom)  # type: ignore[no-any-return]
@@ -334,16 +348,18 @@ def multilabel_mcc(
     target, preds = _multilabel_confusion_matrix_format_arrays(
         target,
         preds,
-        num_labels,
         threshold=threshold,
         ignore_index=ignore_index,
         xp=xp,
     )
-    confmat = _multilabel_confusion_matrix_update_state(
-        target,
-        preds,
+    tn, fp, fn, tp = _multilabel_confusion_matrix_update_state(target, preds, xp=xp)
+
+    confmat = _multilabel_confusion_matrix_compute(
+        tn,
+        fp,
+        fn,
+        tp,
         num_labels,
-        xp=xp,
+        normalize=None,
     )
-
     return _mcc_reduce(confmat)
diff --git a/cyclops/evaluate/metrics/experimental/matthews_corr_coef.py b/cyclops/evaluate/metrics/experimental/matthews_corr_coef.py
index bbfc4856e..a05980dcf 100644
--- a/cyclops/evaluate/metrics/experimental/matthews_corr_coef.py
+++ b/cyclops/evaluate/metrics/experimental/matthews_corr_coef.py
@@ -9,6 +9,7 @@
 )
 from cyclops.evaluate.metrics.experimental.functional.confusion_matrix import (
     _binary_confusion_matrix_compute,
+    _multilabel_confusion_matrix_compute,
 )
 from cyclops.evaluate.metrics.experimental.functional.matthews_corr_coef import (
     _mcc_reduce,
@@ -175,4 +176,13 @@ def __init__(
 
     def _compute_metric(self) -> Array:
         """Compute the confusion matrix."""
-        return _mcc_reduce(self.confmat)  # type: ignore
+        tn, fp, fn, tp = self._final_state()
+        confmat = _multilabel_confusion_matrix_compute(
+            tp=tp,
+            fp=fp,
+            tn=tn,
+            fn=fn,
+            num_labels=self.num_labels,
+            normalize=self.normalize,
+        )
+        return _mcc_reduce(confmat)

From 4e17d12fae61fe6ebbad00f551819c434bddadfb Mon Sep 17 00:00:00 2001
From: Franklin <41602287+fcogidi@users.noreply.github.com>
Date: Fri, 16 Feb 2024 13:00:38 -0500
Subject: [PATCH 11/14] Fix multilabel torch tests and remove MCC debug prints

---
 .../functional/matthews_corr_coef.py          | 14 ----
 .../experimental/test_confusion_matrix.py     |  7 +-
 .../metrics/experimental/test_f_score.py      | 78 ++++++++++++-------
 .../experimental/test_matthews_corr_coef.py   |  7 +-
 .../test_negative_predicitve_value.py         | 10 ++-
 .../experimental/test_precision_recall.py     | 10 ++-
 .../metrics/experimental/test_specificity.py  | 10 ++-
 .../evaluate/metrics/experimental/testers.py  |  9 +++
 8 files changed, 98 insertions(+), 47 deletions(-)

diff --git a/cyclops/evaluate/metrics/experimental/functional/matthews_corr_coef.py b/cyclops/evaluate/metrics/experimental/functional/matthews_corr_coef.py
index 80e358d02..07f542eab 100644
--- a/cyclops/evaluate/metrics/experimental/functional/matthews_corr_coef.py
+++ b/cyclops/evaluate/metrics/experimental/functional/matthews_corr_coef.py
@@ -28,8 +28,6 @@ def _mcc_reduce(confmat: Array) -> Array:
     xp = apc.array_namespace(confmat)
     # convert multilabel into binary
     confmat = xp.sum(confmat, axis=0) if confmat.ndim == 3 else confmat
-    print("confmat: ", confmat)
-    print("numel: ", apc.size(confmat))
 
     if int(apc.size(confmat) or 0) == 4:  # binary case
         tn, fp, fn, tp = xp.reshape(xp.astype(confmat, xp.float32), (-1,))
@@ -40,25 +38,16 @@ def _mcc_reduce(confmat: Array) -> Array:
             return xp.asarray(-1.0, dtype=xp.float32, device=apc.device(confmat))  # type: ignore[no-any-return]
 
     tk = xp.sum(confmat, axis=-1, dtype=xp.float32)  # tn + fp and tp + fn
-    print("tk: ", tk)
     pk = xp.sum(confmat, axis=-2, dtype=xp.float32)  # tn + fn and tp + fp
-    print("pk: ", pk)
     c = xp.astype(xp.linalg.trace(confmat), xp.float32)  # tn and tp
-    print("c: ", c)
     s = xp.sum(confmat, dtype=xp.float32)  # tn + tp + fn + fp
-    print("s: ", s)
 
     cov_ytyp = c * s - sum(tk * pk)
-    print("cov_ytyp: ", cov_ytyp)
     cov_ypyp = s**2 - sum(pk * pk)
-    print("cov_ypyp: ", cov_ypyp)
     cov_ytyt = s**2 - sum(tk * tk)
-    print("cov_ytyt: ", cov_ytyt)
 
     numerator = cov_ytyp
-    print("numerator: ", numerator)
     denom = cov_ypyp * cov_ytyt
-    print("denom: ", denom)
 
     if denom == 0 and int(apc.size(confmat) or 0) == 4:
         if tp == 0 or tn == 0:
@@ -72,11 +61,8 @@ def _mcc_reduce(confmat: Array) -> Array:
             dtype=xp.float32,
             device=apc.device(confmat),
         )
-        print("eps: ", eps)
         numerator = xp.sqrt(eps) * (a - b)
-        print("numerator: ", numerator)
         denom = (tp + fp + eps) * (tp + fn + eps) * (tn + fp + eps) * (tn + fn + eps)
-        print("denom: ", denom)
     elif denom == 0:
         return xp.asarray(0.0, dtype=xp.float32, device=apc.device(confmat))  # type: ignore[no-any-return]
     return numerator / xp.sqrt(denom)  # type: ignore[no-any-return]
diff --git a/tests/cyclops/evaluate/metrics/experimental/test_confusion_matrix.py b/tests/cyclops/evaluate/metrics/experimental/test_confusion_matrix.py
index 2d94c03fa..860bf4ba3 100644
--- a/tests/cyclops/evaluate/metrics/experimental/test_confusion_matrix.py
+++ b/tests/cyclops/evaluate/metrics/experimental/test_confusion_matrix.py
@@ -1,4 +1,5 @@
 """Test confusion matrix metrics."""
+
 from functools import partial
 
 import array_api_compat as apc
@@ -390,7 +391,7 @@ def test_multilabel_confusion_matrix_class_with_numpy_array_api_arrays(
         )
 
     @pytest.mark.integration_test()  # machine for integration tests has GPU
-    @pytest.mark.parametrize("inputs", _multilabel_cases(xp=anp))
+    @pytest.mark.parametrize("inputs", _multilabel_cases(xp=array_api_compat.torch))
     @pytest.mark.parametrize("normalize", [None, "true", "pred", "all"])
     @pytest.mark.parametrize("ignore_index", [None, 0, -1])
     def test_multilabel_confusion_matrix_class_with_torch_tensors(
@@ -405,6 +406,8 @@ def test_multilabel_confusion_matrix_class_with_torch_tensors(
         if ignore_index is not None:
             target = _inject_ignore_index(target, ignore_index)
 
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
         self.run_metric_class_implementation_test(
             target,
             preds,
@@ -421,4 +424,6 @@ def test_multilabel_confusion_matrix_class_with_torch_tensors(
                 "normalize": normalize,
                 "ignore_index": ignore_index,
             },
+            device=device,
+            use_device_for_ref=True,
         )
diff --git a/tests/cyclops/evaluate/metrics/experimental/test_f_score.py b/tests/cyclops/evaluate/metrics/experimental/test_f_score.py
index 5c9afe687..73f83dda5 100644
--- a/tests/cyclops/evaluate/metrics/experimental/test_f_score.py
+++ b/tests/cyclops/evaluate/metrics/experimental/test_f_score.py
@@ -1,4 +1,5 @@
 """Tests for the F-score metric."""
+
 from functools import partial
 from typing import Literal, Optional
 
@@ -79,9 +80,11 @@ def test_binary_fbeta_score_function_with_numpy_array_api_arrays(
         self.run_metric_function_implementation_test(
             target,
             preds,
-            metric_function=binary_f1_score
-            if beta == 1.0
-            else partial(binary_fbeta_score, beta=beta),
+            metric_function=(
+                binary_f1_score
+                if beta == 1.0
+                else partial(binary_fbeta_score, beta=beta)
+            ),
             metric_args={"threshold": THRESHOLD, "ignore_index": ignore_index},
             reference_metric=partial(
                 _binary_fbeta_score_reference,
@@ -119,9 +122,9 @@ def test_binary_fbeta_score_class_with_numpy_array_api_arrays(
         self.run_metric_class_implementation_test(
             target,
             preds,
-            metric_class=BinaryF1Score
-            if beta == 1.0
-            else partial(BinaryFBetaScore, beta=beta),
+            metric_class=(
+                BinaryF1Score if beta == 1.0 else partial(BinaryFBetaScore, beta=beta)
+            ),
             metric_args={"threshold": THRESHOLD, "ignore_index": ignore_index},
             reference_metric=partial(
                 _binary_fbeta_score_reference,
@@ -162,9 +165,9 @@ def test_binary_fbeta_class_with_torch_tensors(
         self.run_metric_class_implementation_test(
             target,
             preds,
-            metric_class=BinaryF1Score
-            if beta == 1.0
-            else partial(BinaryFBetaScore, beta=beta),
+            metric_class=(
+                BinaryF1Score if beta == 1.0 else partial(BinaryFBetaScore, beta=beta)
+            ),
             metric_args={"threshold": THRESHOLD, "ignore_index": ignore_index},
             reference_metric=partial(
                 _binary_fbeta_score_reference,
@@ -241,9 +244,11 @@ def test_multiclass_fbeta_score_with_numpy_array_api_arrays(
             self.run_metric_function_implementation_test(
                 target,
                 preds,
-                metric_function=multiclass_f1_score
-                if beta == 1.0
-                else partial(multiclass_fbeta_score, beta=beta),
+                metric_function=(
+                    multiclass_f1_score
+                    if beta == 1.0
+                    else partial(multiclass_fbeta_score, beta=beta)
+                ),
                 metric_args={
                     "num_classes": NUM_CLASSES,
                     "top_k": top_k,
@@ -292,9 +297,11 @@ def test_multiclass_fbeta_score_class_with_numpy_array_api_arrays(
             self.run_metric_class_implementation_test(
                 target,
                 preds,
-                metric_class=MulticlassF1Score
-                if beta == 1.0
-                else partial(MulticlassFBetaScore, beta=beta),
+                metric_class=(
+                    MulticlassF1Score
+                    if beta == 1.0
+                    else partial(MulticlassFBetaScore, beta=beta)
+                ),
                 reference_metric=partial(
                     _multiclass_fbeta_score_reference,
                     beta=beta,
@@ -346,9 +353,11 @@ def test_multiclass_fbeta_score_class_with_torch_tensors(
             self.run_metric_class_implementation_test(
                 target,
                 preds,
-                metric_class=MulticlassF1Score
-                if beta == 1.0
-                else partial(MulticlassFBetaScore, beta=beta),
+                metric_class=(
+                    MulticlassF1Score
+                    if beta == 1.0
+                    else partial(MulticlassFBetaScore, beta=beta)
+                ),
                 reference_metric=partial(
                     _multiclass_fbeta_score_reference,
                     beta=beta,
@@ -411,9 +420,11 @@ def test_multilabel_fbeta_score_with_numpy_array_api_arrays(
         self.run_metric_function_implementation_test(
             target,
             preds,
-            metric_function=multilabel_f1_score
-            if beta == 1.0
-            else partial(multilabel_fbeta_score, beta=beta),
+            metric_function=(
+                multilabel_f1_score
+                if beta == 1.0
+                else partial(multilabel_fbeta_score, beta=beta)
+            ),
             reference_metric=partial(
                 _multilabel_fbeta_score_reference,
                 beta=beta,
@@ -446,9 +457,11 @@ def test_multilabel_fbeta_score_class_with_numpy_array_api_arrays(
         self.run_metric_class_implementation_test(
             target,
             preds,
-            metric_class=MultilabelF1Score
-            if beta == 1.0
-            else partial(MultilabelFBetaScore, beta=beta),
+            metric_class=(
+                MultilabelF1Score
+                if beta == 1.0
+                else partial(MultilabelFBetaScore, beta=beta)
+            ),
             reference_metric=partial(
                 _multilabel_fbeta_score_reference,
                 beta=beta,
@@ -466,7 +479,7 @@ def test_multilabel_fbeta_score_class_with_numpy_array_api_arrays(
         )
 
     @pytest.mark.integration_test()  # machine for integration tests has GPU
-    @pytest.mark.parametrize("inputs", _multilabel_cases(xp=anp))
+    @pytest.mark.parametrize("inputs", _multilabel_cases(xp=array_api_compat.torch))
     @pytest.mark.parametrize("average", [None, "micro", "macro", "weighted"])
     @pytest.mark.parametrize("ignore_index", [None, 0, -1])
     def test_multilabel_fbeta_score_class_with_torch_tensors(
@@ -479,12 +492,19 @@ def test_multilabel_fbeta_score_class_with_torch_tensors(
         """Test class for multilabel fbeta score with torch tensors."""
         target, preds = inputs
 
+        if ignore_index is not None:
+            target = _inject_ignore_index(target, ignore_index)
+
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
         self.run_metric_class_implementation_test(
             target,
             preds,
-            metric_class=MultilabelF1Score
-            if beta == 1.0
-            else partial(MultilabelFBetaScore, beta=beta),
+            metric_class=(
+                MultilabelF1Score
+                if beta == 1.0
+                else partial(MultilabelFBetaScore, beta=beta)
+            ),
             reference_metric=partial(
                 _multilabel_fbeta_score_reference,
                 beta=beta,
@@ -499,6 +519,8 @@ def test_multilabel_fbeta_score_class_with_torch_tensors(
                 "average": average,
                 "ignore_index": ignore_index,
             },
+            device=device,
+            use_device_for_ref=True,
         )
 
 
diff --git a/tests/cyclops/evaluate/metrics/experimental/test_matthews_corr_coef.py b/tests/cyclops/evaluate/metrics/experimental/test_matthews_corr_coef.py
index 831bababf..8d43d66cb 100644
--- a/tests/cyclops/evaluate/metrics/experimental/test_matthews_corr_coef.py
+++ b/tests/cyclops/evaluate/metrics/experimental/test_matthews_corr_coef.py
@@ -1,4 +1,5 @@
 """Test matthews correlation coefficient metrics."""
+
 from functools import partial
 
 import array_api_compat as apc
@@ -348,7 +349,7 @@ def test_multilabel_mcc_class_with_numpy_array_api_arrays(
         )
 
     @pytest.mark.integration_test()  # machine for integration tests has GPU
-    @pytest.mark.parametrize("inputs", _multilabel_cases(xp=anp))
+    @pytest.mark.parametrize("inputs", _multilabel_cases(xp=array_api_compat.torch))
     @pytest.mark.parametrize("ignore_index", [None, 0, -1])
     def test_multilabel_mcc_class_with_torch_tensors(
         self,
@@ -361,6 +362,8 @@ def test_multilabel_mcc_class_with_torch_tensors(
         if ignore_index is not None:
             target = _inject_ignore_index(target, ignore_index)
 
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
         self.run_metric_class_implementation_test(
             target,
             preds,
@@ -375,4 +378,6 @@ def test_multilabel_mcc_class_with_torch_tensors(
                 "num_labels": NUM_LABELS,
                 "ignore_index": ignore_index,
             },
+            device=device,
+            use_device_for_ref=True,
         )
diff --git a/tests/cyclops/evaluate/metrics/experimental/test_negative_predicitve_value.py b/tests/cyclops/evaluate/metrics/experimental/test_negative_predicitve_value.py
index b07f2e7ea..6d0d057be 100644
--- a/tests/cyclops/evaluate/metrics/experimental/test_negative_predicitve_value.py
+++ b/tests/cyclops/evaluate/metrics/experimental/test_negative_predicitve_value.py
@@ -1,4 +1,5 @@
 """Test negative predictive value."""
+
 from functools import partial
 from typing import Literal, Optional
 
@@ -476,7 +477,7 @@ def test_multilabel_npv_class_with_numpy_array_api_arrays(
         )
 
     @pytest.mark.integration_test()  # machine for integration tests has GPU
-    @pytest.mark.parametrize("inputs", _multilabel_cases(xp=anp))
+    @pytest.mark.parametrize("inputs", _multilabel_cases(xp=array_api_compat.torch))
     @pytest.mark.parametrize("average", [None, "micro", "macro", "weighted"])
     @pytest.mark.parametrize("ignore_index", [None, 0, -1])
     def test_multilabel_npv_class_with_torch_tensors(
@@ -488,6 +489,11 @@ def test_multilabel_npv_class_with_torch_tensors(
         """Test class for multilabel negative predictive value with torch tensors."""
         target, preds = inputs
 
+        if ignore_index is not None:
+            target = _inject_ignore_index(target, ignore_index)
+
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
         self.run_metric_class_implementation_test(
             target,
             preds,
@@ -505,6 +511,8 @@ def test_multilabel_npv_class_with_torch_tensors(
                 "average": average,
                 "ignore_index": ignore_index,
             },
+            device=device,
+            use_device_for_ref=True,
         )
 
 
diff --git a/tests/cyclops/evaluate/metrics/experimental/test_precision_recall.py b/tests/cyclops/evaluate/metrics/experimental/test_precision_recall.py
index 14c3c3a96..8b24b2d75 100644
--- a/tests/cyclops/evaluate/metrics/experimental/test_precision_recall.py
+++ b/tests/cyclops/evaluate/metrics/experimental/test_precision_recall.py
@@ -1,4 +1,5 @@
 """Test precision recall metrics."""
+
 from functools import partial
 from typing import Literal, Optional
 
@@ -730,7 +731,7 @@ def test_multilabel_precision_class_with_numpy_array_api_arrays(
         )
 
     @pytest.mark.integration_test()  # machine for integration tests has GPU
-    @pytest.mark.parametrize("inputs", _multilabel_cases(xp=anp))
+    @pytest.mark.parametrize("inputs", _multilabel_cases(xp=array_api_compat.torch))
     @pytest.mark.parametrize("average", [None, "micro", "macro", "weighted"])
     @pytest.mark.parametrize("ignore_index", [None, 0, -1])
     def test_multilabel_precision_class_with_torch_tensors(
@@ -742,6 +743,11 @@ def test_multilabel_precision_class_with_torch_tensors(
         """Test class for multilabel precision with torch tensors."""
         target, preds = inputs
 
+        if ignore_index is not None:
+            target = _inject_ignore_index(target, ignore_index)
+
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
         self.run_metric_class_implementation_test(
             target,
             preds,
@@ -760,6 +766,8 @@ def test_multilabel_precision_class_with_torch_tensors(
                 "average": average,
                 "ignore_index": ignore_index,
             },
+            device=device,
+            use_device_for_ref=True,
         )
 
 
diff --git a/tests/cyclops/evaluate/metrics/experimental/test_specificity.py b/tests/cyclops/evaluate/metrics/experimental/test_specificity.py
index 035edbada..b4f40b12c 100644
--- a/tests/cyclops/evaluate/metrics/experimental/test_specificity.py
+++ b/tests/cyclops/evaluate/metrics/experimental/test_specificity.py
@@ -1,4 +1,5 @@
 """Test specificity."""
+
 from functools import partial
 from typing import Literal, Optional
 
@@ -417,7 +418,7 @@ def test_multilabel_specificity_class_with_numpy_array_api_arrays(
         )
 
     @pytest.mark.integration_test()  # machine for integration tests has GPU
-    @pytest.mark.parametrize("inputs", _multilabel_cases(xp=anp))
+    @pytest.mark.parametrize("inputs", _multilabel_cases(xp=array_api_compat.torch))
     @pytest.mark.parametrize("average", [None, "micro", "macro", "weighted"])
     @pytest.mark.parametrize("ignore_index", [None, 0, -1])
     def test_multilabel_specificity_class_with_torch_tensors(
@@ -429,6 +430,11 @@ def test_multilabel_specificity_class_with_torch_tensors(
         """Test class for multilabel specificity with torch tensors."""
         target, preds = inputs
 
+        if ignore_index is not None:
+            target = _inject_ignore_index(target, ignore_index)
+
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
         self.run_metric_class_implementation_test(
             target,
             preds,
@@ -446,6 +452,8 @@ def test_multilabel_specificity_class_with_torch_tensors(
                 "average": average,
                 "ignore_index": ignore_index,
             },
+            device=device,
+            use_device_for_ref=True,
         )
 
 
diff --git a/tests/cyclops/evaluate/metrics/experimental/testers.py b/tests/cyclops/evaluate/metrics/experimental/testers.py
index 4d58305d3..77d87436e 100644
--- a/tests/cyclops/evaluate/metrics/experimental/testers.py
+++ b/tests/cyclops/evaluate/metrics/experimental/testers.py
@@ -150,6 +150,15 @@ def _class_impl_test(  # noqa: PLR0912
         preds=apc.to_device(total_preds, device if use_device_for_ref else "cpu"),
     )
 
+    # DEBUG
+    metric.reset()
+    print(
+        metric(
+            target=apc.to_device(total_target, device if use_device_for_ref else "cpu"),
+            preds=apc.to_device(total_preds, device if use_device_for_ref else "cpu"),
+        ),
+    )
+
     # assert after aggregation
     if isinstance(ref_result, dict):
         for key in ref_result:

From 668ac1fbda3564d092b6fb5b584d2019e8cb2d77 Mon Sep 17 00:00:00 2001
From: fcogidi <41602287+fcogidi@users.noreply.github.com>
Date: Tue, 20 Feb 2024 17:44:42 -0500
Subject: [PATCH 12/14] use float64 for internal computations

---
 .../functional/matthews_corr_coef.py          | 24 +++++++++++--------
 .../experimental/matthews_corr_coef.py        | 12 ++++++----
 2 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/cyclops/evaluate/metrics/experimental/functional/matthews_corr_coef.py b/cyclops/evaluate/metrics/experimental/functional/matthews_corr_coef.py
index 07f542eab..794645e60 100644
--- a/cyclops/evaluate/metrics/experimental/functional/matthews_corr_coef.py
+++ b/cyclops/evaluate/metrics/experimental/functional/matthews_corr_coef.py
@@ -30,17 +30,17 @@ def _mcc_reduce(confmat: Array) -> Array:
     confmat = xp.sum(confmat, axis=0) if confmat.ndim == 3 else confmat
 
     if int(apc.size(confmat) or 0) == 4:  # binary case
-        tn, fp, fn, tp = xp.reshape(xp.astype(confmat, xp.float32), (-1,))
+        tn, fp, fn, tp = xp.reshape(xp.astype(confmat, xp.float64), (-1,))
         if tp + tn != 0 and fp + fn == 0:
             return xp.asarray(1.0, dtype=xp.float32, device=apc.device(confmat))  # type: ignore[no-any-return]
 
         if tp + tn == 0 and fp + fn != 0:
             return xp.asarray(-1.0, dtype=xp.float32, device=apc.device(confmat))  # type: ignore[no-any-return]
 
-    tk = xp.sum(confmat, axis=-1, dtype=xp.float32)  # tn + fp and tp + fn
-    pk = xp.sum(confmat, axis=-2, dtype=xp.float32)  # tn + fn and tp + fp
-    c = xp.astype(xp.linalg.trace(confmat), xp.float32)  # tn and tp
-    s = xp.sum(confmat, dtype=xp.float32)  # tn + tp + fn + fp
+    tk = xp.sum(confmat, axis=-1, dtype=xp.float64)  # tn + fp and tp + fn
+    pk = xp.sum(confmat, axis=-2, dtype=xp.float64)  # tn + fn and tp + fp
+    c = xp.astype(xp.linalg.trace(confmat), xp.float64)  # tn and tp
+    s = xp.sum(confmat, dtype=xp.float64)  # tn + tp + fn + fp
 
     cov_ytyp = c * s - sum(tk * pk)
     cov_ypyp = s**2 - sum(pk * pk)
@@ -65,7 +65,7 @@ def _mcc_reduce(confmat: Array) -> Array:
         denom = (tp + fp + eps) * (tp + fn + eps) * (tn + fp + eps) * (tn + fn + eps)
     elif denom == 0:
         return xp.asarray(0.0, dtype=xp.float32, device=apc.device(confmat))  # type: ignore[no-any-return]
-    return numerator / xp.sqrt(denom)  # type: ignore[no-any-return]
+    return xp.astype(numerator / xp.sqrt(denom), xp.float64)  # type: ignore[no-any-return]
 
 
 def binary_mcc(
@@ -214,10 +214,14 @@ def multiclass_mcc(
     >>> multiclass_mcc(target, preds, num_classes=3)
     Array(0.7, dtype=float32)
     >>> target = anp.asarray([2, 1, 0, 0])
-    >>> preds = anp.asarray([[0.16, 0.26, 0.58],
-    ...                     [0.22, 0.61, 0.17],
-    ...                     [0.71, 0.09, 0.20],
-    ...                     [0.05, 0.82, 0.13]])
+    >>> preds = anp.asarray(
+    ...     [
+    ...         [0.16, 0.26, 0.58],
+    ...         [0.22, 0.61, 0.17],
+    ...         [0.71, 0.09, 0.20],
+    ...         [0.05, 0.82, 0.13],
+    ...     ]
+    ... )
     >>> multiclass_mcc(target, preds, num_classes=3)
     Array(0.7, dtype=float32)
 
diff --git a/cyclops/evaluate/metrics/experimental/matthews_corr_coef.py b/cyclops/evaluate/metrics/experimental/matthews_corr_coef.py
index a05980dcf..ce9c75c20 100644
--- a/cyclops/evaluate/metrics/experimental/matthews_corr_coef.py
+++ b/cyclops/evaluate/metrics/experimental/matthews_corr_coef.py
@@ -94,10 +94,14 @@ class MulticlassMCC(MulticlassConfusionMatrix, registry_key="multiclass_mcc"):
     >>> metric(target, preds)
     Array(0.7, dtype=float32)
     >>> target = anp.asarray([2, 1, 0, 0])
-    >>> preds = anp.asarray([[0.16, 0.26, 0.58],
-    ...                     [0.22, 0.61, 0.17],
-    ...                     [0.71, 0.09, 0.20],
-    ...                     [0.05, 0.82, 0.13]])
+    >>> preds = anp.asarray(
+    ...     [
+    ...         [0.16, 0.26, 0.58],
+    ...         [0.22, 0.61, 0.17],
+    ...         [0.71, 0.09, 0.20],
+    ...         [0.05, 0.82, 0.13],
+    ...     ]
+    ... )
     >>> metric = MulticlassMCC(num_classes=3)
     >>> metric(target, preds)
     Array(0.7, dtype=float32)

From 18b06864bdf37670fc53a7476638f96da1ed8839 Mon Sep 17 00:00:00 2001
From: fcogidi <41602287+fcogidi@users.noreply.github.com>
Date: Tue, 20 Feb 2024 17:54:45 -0500
Subject: [PATCH 13/14] Refactor test files and remove debug print statement

---
 .../metrics/experimental/test_matthews_corr_coef.py      | 2 ++
 tests/cyclops/evaluate/metrics/experimental/testers.py   | 9 ---------
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/tests/cyclops/evaluate/metrics/experimental/test_matthews_corr_coef.py b/tests/cyclops/evaluate/metrics/experimental/test_matthews_corr_coef.py
index 8d43d66cb..44aecbdbf 100644
--- a/tests/cyclops/evaluate/metrics/experimental/test_matthews_corr_coef.py
+++ b/tests/cyclops/evaluate/metrics/experimental/test_matthews_corr_coef.py
@@ -290,6 +290,8 @@ def _multilabel_mcc_reference(
 class TestMultilabelMCC(MetricTester):
     """Test multilabel matthews correlation coefficient function and class."""
 
+    atol: float = 4e-8
+
     @pytest.mark.parametrize("inputs", _multilabel_cases(xp=anp))
     @pytest.mark.parametrize("ignore_index", [None, 0, -1])
     def test_multilabel_mcc_with_numpy_array_api_arrays(
diff --git a/tests/cyclops/evaluate/metrics/experimental/testers.py b/tests/cyclops/evaluate/metrics/experimental/testers.py
index 77d87436e..4d58305d3 100644
--- a/tests/cyclops/evaluate/metrics/experimental/testers.py
+++ b/tests/cyclops/evaluate/metrics/experimental/testers.py
@@ -150,15 +150,6 @@ def _class_impl_test(  # noqa: PLR0912
         preds=apc.to_device(total_preds, device if use_device_for_ref else "cpu"),
     )
 
-    # DEBUG
-    metric.reset()
-    print(
-        metric(
-            target=apc.to_device(total_target, device if use_device_for_ref else "cpu"),
-            preds=apc.to_device(total_preds, device if use_device_for_ref else "cpu"),
-        ),
-    )
-
     # assert after aggregation
     if isinstance(ref_result, dict):
         for key in ref_result:

From 7c33227a329ed596568a63dee43a1bb80b069ad1 Mon Sep 17 00:00:00 2001
From: fcogidi <41602287+fcogidi@users.noreply.github.com>
Date: Tue, 20 Feb 2024 18:10:52 -0500
Subject: [PATCH 14/14] Fix data type in Matthews correlation coefficient
 calculation

---
 .../metrics/experimental/functional/matthews_corr_coef.py       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cyclops/evaluate/metrics/experimental/functional/matthews_corr_coef.py b/cyclops/evaluate/metrics/experimental/functional/matthews_corr_coef.py
index 794645e60..89ed7c07e 100644
--- a/cyclops/evaluate/metrics/experimental/functional/matthews_corr_coef.py
+++ b/cyclops/evaluate/metrics/experimental/functional/matthews_corr_coef.py
@@ -65,7 +65,7 @@ def _mcc_reduce(confmat: Array) -> Array:
         denom = (tp + fp + eps) * (tp + fn + eps) * (tn + fp + eps) * (tn + fn + eps)
     elif denom == 0:
         return xp.asarray(0.0, dtype=xp.float32, device=apc.device(confmat))  # type: ignore[no-any-return]
-    return xp.astype(numerator / xp.sqrt(denom), xp.float64)  # type: ignore[no-any-return]
+    return xp.astype(numerator / xp.sqrt(denom), xp.float32)  # type: ignore[no-any-return]
 
 
 def binary_mcc(