diff --git a/cyclops/evaluate/metrics/experimental/__init__.py b/cyclops/evaluate/metrics/experimental/__init__.py
index 3a5b9974a..51ab7a21e 100644
--- a/cyclops/evaluate/metrics/experimental/__init__.py
+++ b/cyclops/evaluate/metrics/experimental/__init__.py
@@ -29,6 +29,11 @@
 )
 from cyclops.evaluate.metrics.experimental.mae import MeanAbsoluteError
 from cyclops.evaluate.metrics.experimental.mape import MeanAbsolutePercentageError
+from cyclops.evaluate.metrics.experimental.matthews_corr_coef import (
+    BinaryMCC,
+    MulticlassMCC,
+    MultilabelMCC,
+)
 from cyclops.evaluate.metrics.experimental.metric_dict import MetricDict
 from cyclops.evaluate.metrics.experimental.mse import MeanSquaredError
 from cyclops.evaluate.metrics.experimental.negative_predictive_value import (
diff --git a/cyclops/evaluate/metrics/experimental/confusion_matrix.py b/cyclops/evaluate/metrics/experimental/confusion_matrix.py
index 9a14488f4..3140c89d9 100644
--- a/cyclops/evaluate/metrics/experimental/confusion_matrix.py
+++ b/cyclops/evaluate/metrics/experimental/confusion_matrix.py
@@ -1,4 +1,5 @@
 """Confusion matrix."""
+
 from types import ModuleType
 from typing import Any, Optional, Tuple, Union
 
diff --git a/cyclops/evaluate/metrics/experimental/functional/__init__.py b/cyclops/evaluate/metrics/experimental/functional/__init__.py
index 1a2e5902b..56b7e825e 100644
--- a/cyclops/evaluate/metrics/experimental/functional/__init__.py
+++ b/cyclops/evaluate/metrics/experimental/functional/__init__.py
@@ -31,6 +31,11 @@
 from cyclops.evaluate.metrics.experimental.functional.mape import (
     mean_absolute_percentage_error,
 )
+from cyclops.evaluate.metrics.experimental.functional.matthews_corr_coef import (
+    binary_mcc,
+    multiclass_mcc,
+    multilabel_mcc,
+)
 from cyclops.evaluate.metrics.experimental.functional.mse import mean_squared_error
 from cyclops.evaluate.metrics.experimental.functional.negative_predictive_value import (
     binary_npv,
diff --git a/cyclops/evaluate/metrics/experimental/functional/confusion_matrix.py b/cyclops/evaluate/metrics/experimental/functional/confusion_matrix.py
index 23faa208d..19b53b48c 100644
--- a/cyclops/evaluate/metrics/experimental/functional/confusion_matrix.py
+++ b/cyclops/evaluate/metrics/experimental/functional/confusion_matrix.py
@@ -1,4 +1,5 @@
 """Functions for computing the confusion matrix for classification tasks."""
+
 # mypy: disable-error-code="no-any-return"
 from types import ModuleType
 from typing import Literal, Optional, Tuple, Union
diff --git a/cyclops/evaluate/metrics/experimental/functional/matthews_corr_coef.py b/cyclops/evaluate/metrics/experimental/functional/matthews_corr_coef.py
new file mode 100644
index 000000000..89ed7c07e
--- /dev/null
+++ b/cyclops/evaluate/metrics/experimental/functional/matthews_corr_coef.py
@@ -0,0 +1,355 @@
+"""Functional API for the matthews correlation coefficient (MCC) metric."""
+
+from typing import Optional, Tuple, Union
+
+import array_api_compat as apc
+
+from cyclops.evaluate.metrics.experimental.functional.confusion_matrix import (
+    _binary_confusion_matrix_compute,
+    _binary_confusion_matrix_format_arrays,
+    _binary_confusion_matrix_update_state,
+    _binary_confusion_matrix_validate_args,
+    _binary_confusion_matrix_validate_arrays,
+    _multiclass_confusion_matrix_format_arrays,
+    _multiclass_confusion_matrix_update_state,
+    _multiclass_confusion_matrix_validate_args,
+    _multiclass_confusion_matrix_validate_arrays,
+    _multilabel_confusion_matrix_compute,
+    _multilabel_confusion_matrix_format_arrays,
+    _multilabel_confusion_matrix_update_state,
+    _multilabel_confusion_matrix_validate_args,
+    _multilabel_confusion_matrix_validate_arrays,
+)
+from cyclops.evaluate.metrics.experimental.utils.types import Array
+
+
+def _mcc_reduce(confmat: Array) -> Array:
+    """Reduce an un-normalized confusion matrix to a scalar float32 MCC array."""
+    xp = apc.array_namespace(confmat)
+    # convert multilabel into binary by summing the stacked matrices over axis 0
+    confmat = xp.sum(confmat, axis=0) if confmat.ndim == 3 else confmat
+
+    if int(apc.size(confmat) or 0) == 4:  # binary case
+        tn, fp, fn, tp = xp.reshape(xp.astype(confmat, xp.float64), (-1,))
+        if tp + tn != 0 and fp + fn == 0:  # every prediction is correct
+            return xp.asarray(1.0, dtype=xp.float32, device=apc.device(confmat))  # type: ignore[no-any-return]
+
+        if tp + tn == 0 and fp + fn != 0:  # every prediction is wrong
+            return xp.asarray(-1.0, dtype=xp.float32, device=apc.device(confmat))  # type: ignore[no-any-return]
+
+    tk = xp.sum(confmat, axis=-1, dtype=xp.float64)  # tn + fp and tp + fn
+    pk = xp.sum(confmat, axis=-2, dtype=xp.float64)  # tn + fn and tp + fp
+    c = xp.astype(xp.linalg.trace(confmat), xp.float64)  # tn + tp
+    s = xp.sum(confmat, dtype=xp.float64)  # tn + tp + fn + fp
+
+    cov_ytyp = c * s - xp.sum(tk * pk)
+    cov_ypyp = s**2 - xp.sum(pk * pk)
+    cov_ytyt = s**2 - xp.sum(tk * tk)
+
+    numerator = cov_ytyp
+    denom = cov_ypyp * cov_ytyt
+
+    if denom == 0 and int(apc.size(confmat) or 0) == 4:  # degenerate binary case
+        if tp == 0 or tn == 0:
+            a = tp + tn
+
+        if fp == 0 or fn == 0:
+            b = fp + fn
+        # denom == 0 implies an empty row or column, so `a` and `b` are both bound
+        eps = xp.asarray(
+            xp.finfo(xp.float32).eps,
+            dtype=xp.float32,
+            device=apc.device(confmat),
+        )
+        numerator = xp.sqrt(eps) * (a - b)
+        denom = (tp + fp + eps) * (tp + fn + eps) * (tn + fp + eps) * (tn + fn + eps)
+    elif denom == 0:
+        return xp.asarray(0.0, dtype=xp.float32, device=apc.device(confmat))  # type: ignore[no-any-return]
+    return xp.astype(numerator / xp.sqrt(denom), xp.float32)  # type: ignore[no-any-return]
+
+
+def binary_mcc(
+    target: Array,
+    preds: Array,
+    threshold: float = 0.5,
+    ignore_index: Optional[int] = None,
+) -> Array:
+    """Compute the matthews correlation coefficient for binary classification.
+
+    Parameters
+    ----------
+    target : Array
+        An array object that is compatible with the Python array API standard
+        and contains the ground truth labels. The expected shape of the array
+        is `(N, ...)`, where `N` is the number of samples.
+    preds : Array
+        An array object that is compatible with the Python array API standard and
+        contains the predictions of a binary classifier. The expected shape of the
+        array is `(N, ...)` where `N` is the number of samples. If `preds` contains
+        floating point values that are not in the range `[0, 1]`, a sigmoid function
+        will be applied to each value before thresholding.
+    threshold : float, default=0.5
+        The threshold to use when converting probabilities to binary predictions.
+    ignore_index : int, optional, default=None
+        Specifies a target value that is ignored and does not contribute to the
+        metric. If `None`, ignore nothing.
+
+    Returns
+    -------
+    Array
+        The matthews correlation coefficient, as a 0-dimensional `float32`
+        array. It is `1.0` when every prediction is correct and `-1.0` when
+        every prediction is wrong.
+
+    Raises
+    ------
+    ValueError
+        If `target` and `preds` have different shapes.
+    ValueError
+        If `target` and `preds` are not array-API-compatible.
+    ValueError
+        If `target` or `preds` are empty.
+    ValueError
+        If `target` or `preds` are not numeric arrays.
+    ValueError
+        If `threshold` is not a float in the [0,1] range.
+    ValueError
+        If `ignore_index` is not `None` or an integer.
+
+    Examples
+    --------
+    >>> import numpy.array_api as anp
+    >>> from cyclops.evaluate.metrics.experimental.functional import binary_mcc
+    >>> target = anp.asarray([0, 1, 0, 1, 0, 1])
+    >>> preds = anp.asarray([0, 0, 1, 1, 0, 1])
+    >>> binary_mcc(target, preds)
+    Array(0.33333334, dtype=float32)
+    >>> target = anp.asarray([0, 1, 0, 1, 0, 1])
+    >>> preds = anp.asarray([0.11, 0.22, 0.84, 0.73, 0.33, 0.92])
+    >>> binary_mcc(target, preds)
+    Array(0.33333334, dtype=float32)
+
+    """
+    _binary_confusion_matrix_validate_args(
+        threshold=threshold,
+        normalize=None,  # the MCC reduction works on raw (un-normalized) counts
+        ignore_index=ignore_index,
+    )
+    xp = _binary_confusion_matrix_validate_arrays(target, preds, ignore_index)
+
+    target, preds = _binary_confusion_matrix_format_arrays(
+        target,
+        preds,
+        threshold,
+        ignore_index,
+        xp=xp,
+    )
+    tn, fp, fn, tp = _binary_confusion_matrix_update_state(target, preds, xp=xp)
+
+    confmat = _binary_confusion_matrix_compute(tn, fp, fn, tp, normalize=None)
+    return _mcc_reduce(confmat)
+
+
+def multiclass_mcc(
+    target: Array,
+    preds: Array,
+    num_classes: int,
+    ignore_index: Optional[Union[int, Tuple[int]]] = None,
+) -> Array:
+    """Compute the matthews correlation coefficient for multiclass classification.
+
+    Parameters
+    ----------
+    target : Array
+        The target array of shape `(N, ...)`, where `N` is the number of samples.
+    preds : Array
+        The prediction array with shape `(N, ...)`, for integer inputs, or
+        `(N, C, ...)`, for float inputs, where `N` is the number of samples and
+        `C` is the number of classes.
+    num_classes : int
+        The number of classes.
+    ignore_index : int, Tuple[int], optional, default=None
+        Specifies a target value(s) that is ignored and does not contribute to the
+        metric. If `None`, ignore nothing.
+
+    Returns
+    -------
+    Array
+        The matthews correlation coefficient, as a 0-dimensional `float32`
+        array. It is computed from raw (un-normalized) confusion matrix
+        counts.
+
+    Raises
+    ------
+    ValueError
+        If `target` and `preds` are not array-API-compatible.
+    ValueError
+        If `target` or `preds` are empty.
+    ValueError
+        If `target` or `preds` are not numeric arrays.
+    ValueError
+        If `num_classes` is not an integer larger than 1.
+    ValueError
+        If `ignore_index` is not `None`, an integer or a tuple of integers.
+    ValueError
+        If `preds` contains floats but `target` does not have one dimension less than
+        `preds`.
+    ValueError
+        If the second dimension of `preds` is not equal to `num_classes`.
+    ValueError
+        If when `target` has one dimension less than `preds`, the shape of `preds` is
+        not `(N, C, ...)` while the shape of `target` is `(N, ...)`.
+    ValueError
+        If when `target` and `preds` have the same number of dimensions, they
+        do not have the same shape.
+    RuntimeError
+        If `target` contains values that are not in the range [0, `num_classes`).
+
+    Examples
+    --------
+    >>> import numpy.array_api as anp
+    >>> from cyclops.evaluate.metrics.experimental.functional import multiclass_mcc
+    >>> target = anp.asarray([2, 1, 0, 0])
+    >>> preds = anp.asarray([2, 1, 0, 1])
+    >>> multiclass_mcc(target, preds, num_classes=3)
+    Array(0.7, dtype=float32)
+    >>> target = anp.asarray([2, 1, 0, 0])
+    >>> preds = anp.asarray(
+    ...     [
+    ...         [0.16, 0.26, 0.58],
+    ...         [0.22, 0.61, 0.17],
+    ...         [0.71, 0.09, 0.20],
+    ...         [0.05, 0.82, 0.13],
+    ...     ]
+    ... )
+    >>> multiclass_mcc(target, preds, num_classes=3)
+    Array(0.7, dtype=float32)
+
+    """
+    _multiclass_confusion_matrix_validate_args(
+        num_classes,
+        normalize=None,  # the MCC reduction works on raw (un-normalized) counts
+        ignore_index=ignore_index,
+    )
+    xp = _multiclass_confusion_matrix_validate_arrays(
+        target,
+        preds,
+        num_classes,
+        ignore_index=ignore_index,
+    )
+
+    target, preds = _multiclass_confusion_matrix_format_arrays(
+        target,
+        preds,
+        ignore_index=ignore_index,
+        xp=xp,
+    )
+    confmat = _multiclass_confusion_matrix_update_state(
+        target,
+        preds,
+        num_classes,
+        xp=xp,
+    )
+    return _mcc_reduce(confmat)
+
+
+def multilabel_mcc(
+    target: Array,
+    preds: Array,
+    num_labels: int,
+    threshold: float = 0.5,
+    ignore_index: Optional[int] = None,
+) -> Array:
+    """Compute the matthews correlation coefficient for multilabel classification.
+
+    Parameters
+    ----------
+    target : Array
+        The target array of shape `(N, L, ...)`, where `N` is the number of samples
+        and `L` is the number of labels.
+    preds : Array
+        The prediction array of shape `(N, L, ...)`, where `N` is the number of
+        samples and `L` is the number of labels. If `preds` contains floats that
+        are not in the range [0,1], they will be converted to probabilities using
+        the sigmoid function.
+    num_labels : int
+        The number of labels.
+    threshold : float, default=0.5
+        The threshold to use for binarizing the predictions.
+    ignore_index : int, optional, default=None
+        Specifies a target value that is ignored and does not contribute to the
+        metric. If `None`, ignore nothing.
+
+    Returns
+    -------
+    Array
+        The matthews correlation coefficient, as a 0-dimensional `float32`
+        array. A single coefficient is returned: the per-label confusion
+        matrices are summed into one before the reduction.
+
+    Raises
+    ------
+    ValueError
+        If `target` and `preds` are not array-API-compatible.
+    ValueError
+        If `target` or `preds` are empty.
+    ValueError
+        If `target` or `preds` are not numeric arrays.
+    ValueError
+        If `threshold` is not a float in the [0,1] range.
+    ValueError
+        If `ignore_index` is not `None` or a non-negative integer.
+    ValueError
+        If `num_labels` is not an integer larger than 1.
+    ValueError
+        If `target` and `preds` do not have the same shape.
+    ValueError
+        If the second dimension of `preds` is not equal to `num_labels`.
+    RuntimeError
+        If `target` contains values that are not in the range [0, 1].
+
+    Examples
+    --------
+    >>> import numpy.array_api as anp
+    >>> from cyclops.evaluate.metrics.experimental.functional import multilabel_mcc
+    >>> target = anp.asarray([[0, 1, 0], [1, 0, 1]])
+    >>> preds = anp.asarray([[0, 0, 1], [1, 0, 1]])
+    >>> multilabel_mcc(target, preds, num_labels=3)
+    Array(0.33333334, dtype=float32)
+    >>> target = anp.asarray([[0, 1, 0], [1, 0, 1]])
+    >>> preds = anp.asarray([[0.11, 0.22, 0.84], [0.73, 0.33, 0.92]])
+    >>> multilabel_mcc(target, preds, num_labels=3)
+    Array(0.33333334, dtype=float32)
+
+    """
+    _multilabel_confusion_matrix_validate_args(
+        num_labels,
+        threshold=threshold,
+        normalize=None,  # the MCC reduction works on raw (un-normalized) counts
+        ignore_index=ignore_index,
+    )
+    xp = _multilabel_confusion_matrix_validate_arrays(
+        target,
+        preds,
+        num_labels,
+        ignore_index=ignore_index,
+    )
+
+    target, preds = _multilabel_confusion_matrix_format_arrays(
+        target,
+        preds,
+        threshold=threshold,
+        ignore_index=ignore_index,
+        xp=xp,
+    )
+    tn, fp, fn, tp = _multilabel_confusion_matrix_update_state(target, preds, xp=xp)
+
+    confmat = _multilabel_confusion_matrix_compute(
+        tn,
+        fp,
+        fn,
+        tp,
+        num_labels,
+        normalize=None,
+    )
+    return _mcc_reduce(confmat)
diff --git a/cyclops/evaluate/metrics/experimental/matthews_corr_coef.py b/cyclops/evaluate/metrics/experimental/matthews_corr_coef.py
new file mode 100644
index 000000000..ce9c75c20
--- /dev/null
+++ b/cyclops/evaluate/metrics/experimental/matthews_corr_coef.py
@@ -0,0 +1,192 @@
+"""Matthews Correlation Coefficient (MCC) metric."""
+
+from typing import Any, Optional, Tuple, Union
+
+from cyclops.evaluate.metrics.experimental.confusion_matrix import (
+    BinaryConfusionMatrix,
+    MulticlassConfusionMatrix,
+    MultilabelConfusionMatrix,
+)
+from cyclops.evaluate.metrics.experimental.functional.confusion_matrix import (
+    _binary_confusion_matrix_compute,
+    _multilabel_confusion_matrix_compute,
+)
+from cyclops.evaluate.metrics.experimental.functional.matthews_corr_coef import (
+    _mcc_reduce,
+)
+from cyclops.evaluate.metrics.experimental.utils.types import Array
+
+
+class BinaryMCC(BinaryConfusionMatrix, registry_key="binary_mcc"):
+    """Matthews correlation coefficient (MCC) for binary classification tasks.
+
+    Parameters
+    ----------
+    threshold : float, default=0.5
+        The threshold value to use when binarizing the inputs.
+    ignore_index : int, optional, default=None
+        Specifies a target value that is ignored and does not contribute to the
+        metric. If `None`, all values are used.
+    **kwargs : Any
+        Additional keyword arguments common to all metrics.
+
+    Examples
+    --------
+    >>> import numpy.array_api as anp
+    >>> from cyclops.evaluate.metrics.experimental import BinaryMCC
+    >>> target = anp.asarray([0, 1, 0, 1, 0, 1])
+    >>> preds = anp.asarray([0, 0, 1, 1, 0, 1])
+    >>> metric = BinaryMCC()
+    >>> metric(target, preds)
+    Array(0.33333334, dtype=float32)
+    >>> target = anp.asarray([0, 1, 0, 1, 0, 1])
+    >>> preds = anp.asarray([0.11, 0.22, 0.84, 0.73, 0.33, 0.92])
+    >>> metric = BinaryMCC()
+    >>> metric(target, preds)
+    Array(0.33333334, dtype=float32)
+
+    """
+
+    name: str = "Matthews Correlation Coefficient"
+
+    def __init__(
+        self,
+        threshold: float = 0.5,
+        ignore_index: Optional[int] = None,
+        **kwargs: Any,
+    ) -> None:
+        """Initialize the class; `normalize` is always passed on as `None`."""
+        super().__init__(threshold, normalize=None, ignore_index=ignore_index, **kwargs)
+
+    def _compute_metric(self) -> Array:
+        """Compute the Matthews correlation coefficient from the final state."""
+        tn, fp, fn, tp = self._final_state()
+        confmat = _binary_confusion_matrix_compute(
+            tp=tp,
+            fp=fp,
+            tn=tn,
+            fn=fn,
+            normalize=self.normalize,  # expected to be `None`, see `__init__`
+        )
+        return _mcc_reduce(confmat)
+
+
+class MulticlassMCC(MulticlassConfusionMatrix, registry_key="multiclass_mcc"):
+    """Matthews correlation coefficient (MCC) for multiclass classification tasks.
+
+    Parameters
+    ----------
+    num_classes : int
+        The number of classes.
+    ignore_index : int, Tuple[int], optional, default=None
+        Specifies a target value(s) that is ignored and does not contribute to
+        the metric. If `None`, all values are used.
+    **kwargs : Any
+        Additional keyword arguments common to all metrics.
+
+    Examples
+    --------
+    >>> import numpy.array_api as anp
+    >>> from cyclops.evaluate.metrics.experimental import MulticlassMCC
+    >>> target = anp.asarray([2, 1, 0, 0])
+    >>> preds = anp.asarray([2, 1, 0, 1])
+    >>> metric = MulticlassMCC(num_classes=3)
+    >>> metric(target, preds)
+    Array(0.7, dtype=float32)
+    >>> target = anp.asarray([2, 1, 0, 0])
+    >>> preds = anp.asarray(
+    ...     [
+    ...         [0.16, 0.26, 0.58],
+    ...         [0.22, 0.61, 0.17],
+    ...         [0.71, 0.09, 0.20],
+    ...         [0.05, 0.82, 0.13],
+    ...     ]
+    ... )
+    >>> metric = MulticlassMCC(num_classes=3)
+    >>> metric(target, preds)
+    Array(0.7, dtype=float32)
+    """
+
+    name: str = "Matthews Correlation Coefficient"
+
+    def __init__(
+        self,
+        num_classes: int,
+        ignore_index: Optional[Union[int, Tuple[int]]] = None,
+        **kwargs: Any,
+    ) -> None:
+        """Initialize the class; `normalize` is always passed on as `None`."""
+        super().__init__(
+            num_classes=num_classes,
+            normalize=None,
+            ignore_index=ignore_index,
+            **kwargs,
+        )
+
+    def _compute_metric(self) -> Array:
+        """Compute the Matthews correlation coefficient from `self.confmat`."""
+        return _mcc_reduce(self.confmat)  # type: ignore
+
+
+class MultilabelMCC(MultilabelConfusionMatrix, registry_key="multilabel_mcc"):
+    """Matthews correlation coefficient (MCC) for multilabel classification tasks.
+
+    Parameters
+    ----------
+    num_labels : int
+        The number of labels.
+    threshold : float, default=0.5
+        The threshold value to use when binarizing the inputs.
+    ignore_index : int, optional, default=None
+        Specifies a target value that is ignored and does not contribute to the
+        metric. If `None`, all values are used.
+    **kwargs : Any
+        Additional keyword arguments common to all metrics.
+
+    Examples
+    --------
+    >>> import numpy.array_api as anp
+    >>> from cyclops.evaluate.metrics.experimental import MultilabelMCC
+    >>> target = anp.asarray([[0, 1, 0], [1, 0, 1]])
+    >>> preds = anp.asarray([[0, 0, 1], [1, 0, 1]])
+    >>> metric = MultilabelMCC(num_labels=3)
+    >>> metric(target, preds)
+    Array(0.33333334, dtype=float32)
+    >>> target = anp.asarray([[0, 1, 0], [1, 0, 1]])
+    >>> preds = anp.asarray([[0.11, 0.22, 0.84], [0.73, 0.33, 0.92]])
+    >>> metric = MultilabelMCC(num_labels=3)
+    >>> metric(target, preds)
+    Array(0.33333334, dtype=float32)
+
+    """
+
+    name: str = "Matthews Correlation Coefficient"
+
+    def __init__(
+        self,
+        num_labels: int,
+        threshold: float = 0.5,
+        ignore_index: Optional[int] = None,
+        **kwargs: Any,
+    ) -> None:
+        """Initialize the class; `normalize` is always passed on as `None`."""
+        super().__init__(
+            num_labels=num_labels,
+            threshold=threshold,
+            normalize=None,
+            ignore_index=ignore_index,
+            **kwargs,
+        )
+
+    def _compute_metric(self) -> Array:
+        """Compute the Matthews correlation coefficient from the final state."""
+        tn, fp, fn, tp = self._final_state()
+        confmat = _multilabel_confusion_matrix_compute(
+            tp=tp,
+            fp=fp,
+            tn=tn,
+            fn=fn,
+            num_labels=self.num_labels,
+            normalize=self.normalize,  # expected to be `None`, see `__init__`
+        )
+        return _mcc_reduce(confmat)
diff --git a/cyclops/utils/index.py b/cyclops/utils/index.py
index 2e3941a02..3b281ec7a 100644
--- a/cyclops/utils/index.py
+++ b/cyclops/utils/index.py
@@ -3,6 +3,7 @@
 from typing import Any, List, Optional, Sequence, Tuple, Union
 
 import numpy as np
+import numpy.typing as npt
 
 
 def index_axis(ind: int, axis: int, shape: Tuple[int, ...]) -> Tuple[Any, ...]:
@@ -33,9 +34,9 @@ def index_axis(ind: int, axis: int, shape: Tuple[int, ...]) -> Tuple[Any, ...]:
 
 
 def take_indices(
-    data: np.typing.NDArray[Any],
-    indexes: Sequence[Optional[Union[Sequence[int], np.typing.NDArray[Any]]]],
-) -> np.typing.NDArray[Any]:
+    data: npt.NDArray[Any],
+    indexes: Sequence[Optional[Union[Sequence[int], npt.NDArray[Any]]]],
+) -> npt.NDArray[Any]:
     """Index array by specifying the indices to take on each axis.
 
     Parameters
@@ -69,10 +70,10 @@ def take_indices(
 
 
 def take_indices_over_axis(
-    data: np.typing.NDArray[Any],
+    data: npt.NDArray[Any],
     axis: int,
-    index: Union[np.typing.NDArray[Any], Sequence[int]],
-) -> np.typing.NDArray[Any]:
+    index: Union[npt.NDArray[Any], Sequence[int]],
+) -> npt.NDArray[Any]:
     """Take indices along an axis.
 
     Parameters
diff --git a/tests/cyclops/evaluate/metrics/experimental/inputs.py b/tests/cyclops/evaluate/metrics/experimental/inputs.py
index 92af7b9e6..d38d1d852 100644
--- a/tests/cyclops/evaluate/metrics/experimental/inputs.py
+++ b/tests/cyclops/evaluate/metrics/experimental/inputs.py
@@ -1,4 +1,5 @@
 """Input data for tests of metrics in cyclops/evaluate/metrics/experimental."""
+
 import random
 from collections import namedtuple
 from types import ModuleType
@@ -296,43 +297,46 @@ def _multilabel_cases(*, xp: Any):
     return (
         pytest.param(
             InputSpec(
-                target=xp.asarray(_multilabel_labels),
-                preds=xp.asarray(_multilabel_preds),
+                target=xp.asarray(_multilabel_labels, dtype=xp.int32),
+                preds=xp.asarray(_multilabel_preds, dtype=xp.int32),
             ),
             id="input[2d-labels]",
         ),
         pytest.param(
             InputSpec(
-                target=xp.asarray(_multilabel_labels_multidim),
-                preds=xp.asarray(_multilabel_preds_multidim),
+                target=xp.asarray(_multilabel_labels_multidim, dtype=xp.int32),
+                preds=xp.asarray(_multilabel_preds_multidim, dtype=xp.int32),
             ),
             id="input[multidim-labels]",
         ),
         pytest.param(
             InputSpec(
-                target=xp.asarray(_multilabel_labels),
-                preds=xp.asarray(_multilabel_probs),
+                target=xp.asarray(_multilabel_labels, dtype=xp.int32),
+                preds=xp.asarray(_multilabel_probs, dtype=xp.float32),
             ),
             id="input[2d-probs]",
         ),
         pytest.param(
             InputSpec(
-                target=xp.asarray(_multilabel_labels),
-                preds=xp.asarray(_inv_sigmoid(_multilabel_probs)),
+                target=xp.asarray(_multilabel_labels, dtype=xp.int32),
+                preds=xp.asarray(_inv_sigmoid(_multilabel_probs), dtype=xp.float32),
             ),
             id="input[2d-logits]",
         ),
         pytest.param(
             InputSpec(
-                target=xp.asarray(_multilabel_labels_multidim),
-                preds=xp.asarray(_multilabel_probs_multidim),
+                target=xp.asarray(_multilabel_labels_multidim, dtype=xp.int32),
+                preds=xp.asarray(_multilabel_probs_multidim, dtype=xp.float32),
             ),
             id="input[multidim-probs]",
         ),
         pytest.param(
             InputSpec(
-                target=xp.asarray(_multilabel_labels_multidim),
-                preds=xp.asarray(_inv_sigmoid(_multilabel_probs_multidim)),
+                target=xp.asarray(_multilabel_labels_multidim, dtype=xp.int32),
+                preds=xp.asarray(
+                    _inv_sigmoid(_multilabel_probs_multidim),
+                    dtype=xp.float32,
+                ),
             ),
             id="input[multidim-logits]",
         ),
diff --git a/tests/cyclops/evaluate/metrics/experimental/test_confusion_matrix.py b/tests/cyclops/evaluate/metrics/experimental/test_confusion_matrix.py
index 2d94c03fa..860bf4ba3 100644
--- a/tests/cyclops/evaluate/metrics/experimental/test_confusion_matrix.py
+++ b/tests/cyclops/evaluate/metrics/experimental/test_confusion_matrix.py
@@ -1,4 +1,5 @@
 """Test confusion matrix metrics."""
+
 from functools import partial
 
 import array_api_compat as apc
@@ -390,7 +391,7 @@ def test_multilabel_confusion_matrix_class_with_numpy_array_api_arrays(
         )
 
     @pytest.mark.integration_test()  # machine for integration tests has GPU
-    @pytest.mark.parametrize("inputs", _multilabel_cases(xp=anp))
+    @pytest.mark.parametrize("inputs", _multilabel_cases(xp=array_api_compat.torch))
     @pytest.mark.parametrize("normalize", [None, "true", "pred", "all"])
     @pytest.mark.parametrize("ignore_index", [None, 0, -1])
     def test_multilabel_confusion_matrix_class_with_torch_tensors(
@@ -405,6 +406,8 @@ def test_multilabel_confusion_matrix_class_with_torch_tensors(
         if ignore_index is not None:
             target = _inject_ignore_index(target, ignore_index)
 
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
         self.run_metric_class_implementation_test(
             target,
             preds,
@@ -421,4 +424,6 @@ def test_multilabel_confusion_matrix_class_with_torch_tensors(
                 "normalize": normalize,
                 "ignore_index": ignore_index,
             },
+            device=device,
+            use_device_for_ref=True,
         )
diff --git a/tests/cyclops/evaluate/metrics/experimental/test_f_score.py b/tests/cyclops/evaluate/metrics/experimental/test_f_score.py
index 5c9afe687..73f83dda5 100644
--- a/tests/cyclops/evaluate/metrics/experimental/test_f_score.py
+++ b/tests/cyclops/evaluate/metrics/experimental/test_f_score.py
@@ -1,4 +1,5 @@
 """Tests for the F-score metric."""
+
 from functools import partial
 from typing import Literal, Optional
 
@@ -79,9 +80,11 @@ def test_binary_fbeta_score_function_with_numpy_array_api_arrays(
         self.run_metric_function_implementation_test(
             target,
             preds,
-            metric_function=binary_f1_score
-            if beta == 1.0
-            else partial(binary_fbeta_score, beta=beta),
+            metric_function=(
+                binary_f1_score
+                if beta == 1.0
+                else partial(binary_fbeta_score, beta=beta)
+            ),
             metric_args={"threshold": THRESHOLD, "ignore_index": ignore_index},
             reference_metric=partial(
                 _binary_fbeta_score_reference,
@@ -119,9 +122,9 @@ def test_binary_fbeta_score_class_with_numpy_array_api_arrays(
         self.run_metric_class_implementation_test(
             target,
             preds,
-            metric_class=BinaryF1Score
-            if beta == 1.0
-            else partial(BinaryFBetaScore, beta=beta),
+            metric_class=(
+                BinaryF1Score if beta == 1.0 else partial(BinaryFBetaScore, beta=beta)
+            ),
             metric_args={"threshold": THRESHOLD, "ignore_index": ignore_index},
             reference_metric=partial(
                 _binary_fbeta_score_reference,
@@ -162,9 +165,9 @@ def test_binary_fbeta_class_with_torch_tensors(
         self.run_metric_class_implementation_test(
             target,
             preds,
-            metric_class=BinaryF1Score
-            if beta == 1.0
-            else partial(BinaryFBetaScore, beta=beta),
+            metric_class=(
+                BinaryF1Score if beta == 1.0 else partial(BinaryFBetaScore, beta=beta)
+            ),
             metric_args={"threshold": THRESHOLD, "ignore_index": ignore_index},
             reference_metric=partial(
                 _binary_fbeta_score_reference,
@@ -241,9 +244,11 @@ def test_multiclass_fbeta_score_with_numpy_array_api_arrays(
             self.run_metric_function_implementation_test(
                 target,
                 preds,
-                metric_function=multiclass_f1_score
-                if beta == 1.0
-                else partial(multiclass_fbeta_score, beta=beta),
+                metric_function=(
+                    multiclass_f1_score
+                    if beta == 1.0
+                    else partial(multiclass_fbeta_score, beta=beta)
+                ),
                 metric_args={
                     "num_classes": NUM_CLASSES,
                     "top_k": top_k,
@@ -292,9 +297,11 @@ def test_multiclass_fbeta_score_class_with_numpy_array_api_arrays(
             self.run_metric_class_implementation_test(
                 target,
                 preds,
-                metric_class=MulticlassF1Score
-                if beta == 1.0
-                else partial(MulticlassFBetaScore, beta=beta),
+                metric_class=(
+                    MulticlassF1Score
+                    if beta == 1.0
+                    else partial(MulticlassFBetaScore, beta=beta)
+                ),
                 reference_metric=partial(
                     _multiclass_fbeta_score_reference,
                     beta=beta,
@@ -346,9 +353,11 @@ def test_multiclass_fbeta_score_class_with_torch_tensors(
             self.run_metric_class_implementation_test(
                 target,
                 preds,
-                metric_class=MulticlassF1Score
-                if beta == 1.0
-                else partial(MulticlassFBetaScore, beta=beta),
+                metric_class=(
+                    MulticlassF1Score
+                    if beta == 1.0
+                    else partial(MulticlassFBetaScore, beta=beta)
+                ),
                 reference_metric=partial(
                     _multiclass_fbeta_score_reference,
                     beta=beta,
@@ -411,9 +420,11 @@ def test_multilabel_fbeta_score_with_numpy_array_api_arrays(
         self.run_metric_function_implementation_test(
             target,
             preds,
-            metric_function=multilabel_f1_score
-            if beta == 1.0
-            else partial(multilabel_fbeta_score, beta=beta),
+            metric_function=(
+                multilabel_f1_score
+                if beta == 1.0
+                else partial(multilabel_fbeta_score, beta=beta)
+            ),
             reference_metric=partial(
                 _multilabel_fbeta_score_reference,
                 beta=beta,
@@ -446,9 +457,11 @@ def test_multilabel_fbeta_score_class_with_numpy_array_api_arrays(
         self.run_metric_class_implementation_test(
             target,
             preds,
-            metric_class=MultilabelF1Score
-            if beta == 1.0
-            else partial(MultilabelFBetaScore, beta=beta),
+            metric_class=(
+                MultilabelF1Score
+                if beta == 1.0
+                else partial(MultilabelFBetaScore, beta=beta)
+            ),
             reference_metric=partial(
                 _multilabel_fbeta_score_reference,
                 beta=beta,
@@ -466,7 +479,7 @@ def test_multilabel_fbeta_score_class_with_numpy_array_api_arrays(
         )
 
     @pytest.mark.integration_test()  # machine for integration tests has GPU
-    @pytest.mark.parametrize("inputs", _multilabel_cases(xp=anp))
+    @pytest.mark.parametrize("inputs", _multilabel_cases(xp=array_api_compat.torch))
     @pytest.mark.parametrize("average", [None, "micro", "macro", "weighted"])
     @pytest.mark.parametrize("ignore_index", [None, 0, -1])
     def test_multilabel_fbeta_score_class_with_torch_tensors(
@@ -479,12 +492,19 @@ def test_multilabel_fbeta_score_class_with_torch_tensors(
         """Test class for multilabel fbeta score with torch tensors."""
         target, preds = inputs
 
+        if ignore_index is not None:
+            target = _inject_ignore_index(target, ignore_index)
+
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
         self.run_metric_class_implementation_test(
             target,
             preds,
-            metric_class=MultilabelF1Score
-            if beta == 1.0
-            else partial(MultilabelFBetaScore, beta=beta),
+            metric_class=(
+                MultilabelF1Score
+                if beta == 1.0
+                else partial(MultilabelFBetaScore, beta=beta)
+            ),
             reference_metric=partial(
                 _multilabel_fbeta_score_reference,
                 beta=beta,
@@ -499,6 +519,8 @@ def test_multilabel_fbeta_score_class_with_torch_tensors(
                 "average": average,
                 "ignore_index": ignore_index,
             },
+            device=device,
+            use_device_for_ref=True,
         )
 
 
diff --git a/tests/cyclops/evaluate/metrics/experimental/test_matthews_corr_coef.py b/tests/cyclops/evaluate/metrics/experimental/test_matthews_corr_coef.py
new file mode 100644
index 000000000..44aecbdbf
--- /dev/null
+++ b/tests/cyclops/evaluate/metrics/experimental/test_matthews_corr_coef.py
@@ -0,0 +1,385 @@
+"""Test matthews correlation coefficient metrics."""
+
+from functools import partial
+
+import array_api_compat as apc
+import array_api_compat.torch
+import numpy.array_api as anp
+import pytest
+import torch.utils.dlpack
+from torchmetrics.functional.classification import (
+    binary_matthews_corrcoef,
+    multiclass_matthews_corrcoef,
+    multilabel_matthews_corrcoef,
+)
+
+from cyclops.evaluate.metrics.experimental.functional.matthews_corr_coef import (
+    binary_mcc,
+    multiclass_mcc,
+    multilabel_mcc,
+)
+from cyclops.evaluate.metrics.experimental.matthews_corr_coef import (
+    BinaryMCC,
+    MulticlassMCC,
+    MultilabelMCC,
+)
+from cyclops.evaluate.metrics.experimental.utils.ops import to_int
+from cyclops.evaluate.metrics.experimental.utils.validation import is_floating_point
+
+from ..conftest import NUM_CLASSES, NUM_LABELS, THRESHOLD
+from .inputs import _binary_cases, _multiclass_cases, _multilabel_cases
+from .testers import MetricTester, _inject_ignore_index
+
+
+def _binary_mcc_reference(
+    target,
+    preds,
+    threshold,
+    ignore_index,
+) -> torch.Tensor:
+    """Return the reference binary matthews correlation coefficient."""
+    return binary_matthews_corrcoef(
+        torch.utils.dlpack.from_dlpack(preds),
+        torch.utils.dlpack.from_dlpack(target),
+        threshold=threshold,
+        ignore_index=ignore_index,
+    )
+
+
+class TestBinaryMCC(MetricTester):
+    """Test binary matthews correlation coefficient function and class."""
+
+    @pytest.mark.parametrize("inputs", _binary_cases(xp=anp))
+    @pytest.mark.parametrize("ignore_index", [None, 0, -1])
+    def test_binary_mcc_function_with_numpy_array_api_arrays(
+        self,
+        inputs,
+        ignore_index,
+    ) -> None:
+        """Test function for binary matthews corrcoef using numpy.array_api arrays."""
+        target, preds = inputs
+
+        if ignore_index is not None:
+            target = _inject_ignore_index(target, ignore_index)
+
+        self.run_metric_function_implementation_test(
+            target,
+            preds,
+            metric_function=binary_mcc,
+            metric_args={
+                "threshold": THRESHOLD,
+                "ignore_index": ignore_index,
+            },
+            reference_metric=partial(
+                _binary_mcc_reference,
+                threshold=THRESHOLD,
+                ignore_index=ignore_index,
+            ),
+        )
+
+    @pytest.mark.parametrize("inputs", _binary_cases(xp=anp))
+    @pytest.mark.parametrize("ignore_index", [None, 0, -1])
+    def test_binary_mcc_class_with_numpy_array_api_arrays(
+        self,
+        inputs,
+        ignore_index,
+    ) -> None:
+        """Test class for binary matthews corrcoef using numpy.array_api arrays."""
+        target, preds = inputs
+
+        if (
+            preds.ndim == 1
+            and is_floating_point(preds)
+            and not anp.all(to_int((preds >= 0)) * to_int((preds <= 1)))
+        ):
+            pytest.skip(
+                "When using 0-D logits, batch result will be different from local "
+                "result because the `sigmoid` operation may not be applied to each "
+                "batch (some values may be in [0, 1] and some may not).",
+            )
+
+        if ignore_index is not None:
+            target = _inject_ignore_index(target, ignore_index)
+
+        self.run_metric_class_implementation_test(
+            target,
+            preds,
+            metric_class=BinaryMCC,
+            metric_args={
+                "threshold": THRESHOLD,
+                "ignore_index": ignore_index,
+            },
+            reference_metric=partial(
+                _binary_mcc_reference,
+                threshold=THRESHOLD,
+                ignore_index=ignore_index,
+            ),
+        )
+
+    @pytest.mark.integration_test()  # machine for integration tests has GPU
+    @pytest.mark.parametrize("inputs", _binary_cases(xp=array_api_compat.torch))
+    @pytest.mark.parametrize("ignore_index", [None, 0, -1])
+    def test_binary_mcc_class_with_torch_tensors(
+        self,
+        inputs,
+        ignore_index,
+    ) -> None:
+        """Test binary matthews correlation coefficient class with torch tensors."""
+        target, preds = inputs
+
+        if (
+            preds.ndim == 1
+            and is_floating_point(preds)
+            and not torch.all(to_int((preds >= 0)) * to_int((preds <= 1)))
+        ):
+            pytest.skip(
+                "When using 0-D logits, batch result will be different from local "
+                "result because the `sigmoid` operation may not be applied to each "
+                "batch (some values may be in [0, 1] and some may not).",
+            )
+
+        if ignore_index is not None:
+            target = _inject_ignore_index(target, ignore_index)
+
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        self.run_metric_class_implementation_test(
+            target,
+            preds,
+            metric_class=BinaryMCC,
+            metric_args={
+                "threshold": THRESHOLD,
+                "ignore_index": ignore_index,
+            },
+            reference_metric=partial(
+                _binary_mcc_reference,
+                threshold=THRESHOLD,
+                ignore_index=ignore_index,
+            ),
+            device=device,
+            use_device_for_ref=True,
+        )
+
+
+def _multiclass_mcc_reference(
+    target,
+    preds,
+    num_classes=NUM_CLASSES,
+    ignore_index=None,
+) -> torch.Tensor:
+    """Return the reference multiclass matthews correlation coefficient."""
+    if preds.ndim == 1 and is_floating_point(preds):
+        xp = apc.array_namespace(preds)
+        preds = xp.argmax(preds, axis=0)
+
+    return multiclass_matthews_corrcoef(
+        torch.utils.dlpack.from_dlpack(preds),
+        torch.utils.dlpack.from_dlpack(target),
+        num_classes,
+        ignore_index=ignore_index,
+    )
+
+
+class TestMulticlassMCC(MetricTester):
+    """Test multiclass matthews correlation coefficient function and class."""
+
+    @pytest.mark.parametrize("inputs", _multiclass_cases(xp=anp))
+    @pytest.mark.parametrize("ignore_index", [None, 0, -1])
+    def test_multiclass_mcc_with_numpy_array_api_arrays(
+        self,
+        inputs,
+        ignore_index,
+    ) -> None:
+        """Test function for multiclass matthews correlation coefficient."""
+        target, preds = inputs
+
+        if ignore_index is not None:
+            target = _inject_ignore_index(target, ignore_index)
+
+        self.run_metric_function_implementation_test(
+            target,
+            preds,
+            metric_function=multiclass_mcc,
+            metric_args={
+                "num_classes": NUM_CLASSES,
+                "ignore_index": ignore_index,
+            },
+            reference_metric=partial(
+                _multiclass_mcc_reference,
+                ignore_index=ignore_index,
+            ),
+        )
+
+    @pytest.mark.parametrize("inputs", _multiclass_cases(xp=anp))
+    @pytest.mark.parametrize("ignore_index", [None, 1, -1])
+    def test_multiclass_mcc_class_with_numpy_array_api_arrays(
+        self,
+        inputs,
+        ignore_index,
+    ) -> None:
+        """Test class for multiclass matthews corrcoef using numpy.array_api arrays."""
+        target, preds = inputs
+
+        if ignore_index is not None:
+            target = _inject_ignore_index(target, ignore_index)
+
+        self.run_metric_class_implementation_test(
+            target,
+            preds,
+            metric_class=MulticlassMCC,
+            reference_metric=partial(
+                _multiclass_mcc_reference,
+                ignore_index=ignore_index,
+            ),
+            metric_args={
+                "num_classes": NUM_CLASSES,
+                "ignore_index": ignore_index,
+            },
+        )
+
+    @pytest.mark.integration_test()  # machine for integration tests has GPU
+    @pytest.mark.parametrize("inputs", _multiclass_cases(xp=array_api_compat.torch))
+    @pytest.mark.parametrize("ignore_index", [None, 1, -1])
+    def test_multiclass_mcc_class_with_torch_tensors(
+        self,
+        inputs,
+        ignore_index,
+    ) -> None:
+        """Test multiclass matthews correlation coefficient class with torch tensors."""
+        target, preds = inputs
+
+        if ignore_index is not None:
+            target = _inject_ignore_index(target, ignore_index)
+
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        self.run_metric_class_implementation_test(
+            target,
+            preds,
+            metric_class=MulticlassMCC,
+            reference_metric=partial(
+                _multiclass_mcc_reference,
+                ignore_index=ignore_index,
+            ),
+            metric_args={
+                "num_classes": NUM_CLASSES,
+                "ignore_index": ignore_index,
+            },
+            device=device,
+            use_device_for_ref=True,
+        )
+
+
+def _multilabel_mcc_reference(
+    target,
+    preds,
+    threshold,
+    num_labels=NUM_LABELS,
+    ignore_index=None,
+) -> torch.Tensor:
+    """Return the reference multilabel matthews correlation coefficient."""
+    return multilabel_matthews_corrcoef(
+        torch.utils.dlpack.from_dlpack(preds),
+        torch.utils.dlpack.from_dlpack(target),
+        num_labels,
+        threshold=threshold,
+        ignore_index=ignore_index,
+    )
+
+
+class TestMultilabelMCC(MetricTester):
+    """Test multilabel matthews correlation coefficient function and class."""
+
+    atol: float = 4e-8
+
+    @pytest.mark.parametrize("inputs", _multilabel_cases(xp=anp))
+    @pytest.mark.parametrize("ignore_index", [None, 0, -1])
+    def test_multilabel_mcc_with_numpy_array_api_arrays(
+        self,
+        inputs,
+        ignore_index,
+    ) -> None:
+        """Test function for multilabel matthews correlation coefficient."""
+        target, preds = inputs
+
+        if ignore_index is not None:
+            target = _inject_ignore_index(target, ignore_index)
+
+        self.run_metric_function_implementation_test(
+            target,
+            preds,
+            metric_function=multilabel_mcc,
+            reference_metric=partial(
+                _multilabel_mcc_reference,
+                threshold=THRESHOLD,
+                ignore_index=ignore_index,
+            ),
+            metric_args={
+                "threshold": THRESHOLD,
+                "num_labels": NUM_LABELS,
+                "ignore_index": ignore_index,
+            },
+        )
+
+    @pytest.mark.parametrize("inputs", _multilabel_cases(xp=anp))
+    @pytest.mark.parametrize("ignore_index", [None, 0, -1])
+    def test_multilabel_mcc_class_with_numpy_array_api_arrays(
+        self,
+        inputs,
+        ignore_index,
+    ) -> None:
+        """Test class for multilabel matthews corrcoef using numpy.array_api arrays."""
+        target, preds = inputs
+
+        if ignore_index is not None:
+            target = _inject_ignore_index(target, ignore_index)
+
+        self.run_metric_class_implementation_test(
+            target,
+            preds,
+            metric_class=MultilabelMCC,
+            reference_metric=partial(
+                _multilabel_mcc_reference,
+                threshold=THRESHOLD,
+                ignore_index=ignore_index,
+            ),
+            metric_args={
+                "threshold": THRESHOLD,
+                "num_labels": NUM_LABELS,
+                "ignore_index": ignore_index,
+            },
+        )
+
+    @pytest.mark.integration_test()  # machine for integration tests has GPU
+    @pytest.mark.parametrize("inputs", _multilabel_cases(xp=array_api_compat.torch))
+    @pytest.mark.parametrize("ignore_index", [None, 0, -1])
+    def test_multilabel_mcc_class_with_torch_tensors(
+        self,
+        inputs,
+        ignore_index,
+    ) -> None:
+        """Test multilabel matthews correlation coefficient class with torch tensors."""
+        target, preds = inputs
+
+        if ignore_index is not None:
+            target = _inject_ignore_index(target, ignore_index)
+
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        self.run_metric_class_implementation_test(
+            target,
+            preds,
+            metric_class=MultilabelMCC,
+            reference_metric=partial(
+                _multilabel_mcc_reference,
+                threshold=THRESHOLD,
+                ignore_index=ignore_index,
+            ),
+            metric_args={
+                "threshold": THRESHOLD,
+                "num_labels": NUM_LABELS,
+                "ignore_index": ignore_index,
+            },
+            device=device,
+            use_device_for_ref=True,
+        )
diff --git a/tests/cyclops/evaluate/metrics/experimental/test_negative_predicitve_value.py b/tests/cyclops/evaluate/metrics/experimental/test_negative_predicitve_value.py
index b07f2e7ea..6d0d057be 100644
--- a/tests/cyclops/evaluate/metrics/experimental/test_negative_predicitve_value.py
+++ b/tests/cyclops/evaluate/metrics/experimental/test_negative_predicitve_value.py
@@ -1,4 +1,5 @@
 """Test negative predictive value."""
+
 from functools import partial
 from typing import Literal, Optional
 
@@ -476,7 +477,7 @@ def test_multilabel_npv_class_with_numpy_array_api_arrays(
         )
 
     @pytest.mark.integration_test()  # machine for integration tests has GPU
-    @pytest.mark.parametrize("inputs", _multilabel_cases(xp=anp))
+    @pytest.mark.parametrize("inputs", _multilabel_cases(xp=array_api_compat.torch))
     @pytest.mark.parametrize("average", [None, "micro", "macro", "weighted"])
     @pytest.mark.parametrize("ignore_index", [None, 0, -1])
     def test_multilabel_npv_class_with_torch_tensors(
@@ -488,6 +489,11 @@ def test_multilabel_npv_class_with_torch_tensors(
         """Test class for multilabel negative predictive value with torch tensors."""
         target, preds = inputs
 
+        if ignore_index is not None:
+            target = _inject_ignore_index(target, ignore_index)
+
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
         self.run_metric_class_implementation_test(
             target,
             preds,
@@ -505,6 +511,8 @@ def test_multilabel_npv_class_with_torch_tensors(
                 "average": average,
                 "ignore_index": ignore_index,
             },
+            device=device,
+            use_device_for_ref=True,
         )
 
 
diff --git a/tests/cyclops/evaluate/metrics/experimental/test_precision_recall.py b/tests/cyclops/evaluate/metrics/experimental/test_precision_recall.py
index 14c3c3a96..8b24b2d75 100644
--- a/tests/cyclops/evaluate/metrics/experimental/test_precision_recall.py
+++ b/tests/cyclops/evaluate/metrics/experimental/test_precision_recall.py
@@ -1,4 +1,5 @@
 """Test precision recall metrics."""
+
 from functools import partial
 from typing import Literal, Optional
 
@@ -730,7 +731,7 @@ def test_multilabel_precision_class_with_numpy_array_api_arrays(
         )
 
     @pytest.mark.integration_test()  # machine for integration tests has GPU
-    @pytest.mark.parametrize("inputs", _multilabel_cases(xp=anp))
+    @pytest.mark.parametrize("inputs", _multilabel_cases(xp=array_api_compat.torch))
     @pytest.mark.parametrize("average", [None, "micro", "macro", "weighted"])
     @pytest.mark.parametrize("ignore_index", [None, 0, -1])
     def test_multilabel_precision_class_with_torch_tensors(
@@ -742,6 +743,11 @@ def test_multilabel_precision_class_with_torch_tensors(
         """Test class for multilabel precision with torch tensors."""
         target, preds = inputs
 
+        if ignore_index is not None:
+            target = _inject_ignore_index(target, ignore_index)
+
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
         self.run_metric_class_implementation_test(
             target,
             preds,
@@ -760,6 +766,8 @@ def test_multilabel_precision_class_with_torch_tensors(
                 "average": average,
                 "ignore_index": ignore_index,
             },
+            device=device,
+            use_device_for_ref=True,
         )
 
 
diff --git a/tests/cyclops/evaluate/metrics/experimental/test_precision_recall_curve.py b/tests/cyclops/evaluate/metrics/experimental/test_precision_recall_curve.py
index 4dc5989fd..081ebd1e9 100644
--- a/tests/cyclops/evaluate/metrics/experimental/test_precision_recall_curve.py
+++ b/tests/cyclops/evaluate/metrics/experimental/test_precision_recall_curve.py
@@ -1,4 +1,5 @@
 """Test precision-recall curve metric."""
+
 from functools import partial
 from typing import List, Tuple, Union
 
@@ -45,9 +46,11 @@ def _binary_precision_recall_curve_reference(
     return tm_binary_precision_recall_curve(
         torch.utils.dlpack.from_dlpack(preds),
         torch.utils.dlpack.from_dlpack(target),
-        thresholds=torch.utils.dlpack.from_dlpack(thresholds)
-        if apc.is_array_api_obj(thresholds)
-        else thresholds,
+        thresholds=(
+            torch.utils.dlpack.from_dlpack(thresholds)
+            if apc.is_array_api_obj(thresholds)
+            else thresholds
+        ),
         ignore_index=ignore_index,
     )
 
@@ -215,9 +218,11 @@ def _multiclass_precision_recall_curve_reference(
         torch.utils.dlpack.from_dlpack(preds),
         torch.utils.dlpack.from_dlpack(target),
         num_classes,
-        thresholds=torch.utils.dlpack.from_dlpack(thresholds)
-        if apc.is_array_api_obj(thresholds)
-        else thresholds,
+        thresholds=(
+            torch.utils.dlpack.from_dlpack(thresholds)
+            if apc.is_array_api_obj(thresholds)
+            else thresholds
+        ),
         ignore_index=ignore_index,
     )
 
@@ -371,9 +376,11 @@ def _multilabel_precision_recall_curve_reference(
         torch.utils.dlpack.from_dlpack(preds),
         torch.utils.dlpack.from_dlpack(target),
         num_labels,
-        thresholds=torch.utils.dlpack.from_dlpack(thresholds)
-        if apc.is_array_api_obj(thresholds)
-        else thresholds,
+        thresholds=(
+            torch.utils.dlpack.from_dlpack(thresholds)
+            if apc.is_array_api_obj(thresholds)
+            else thresholds
+        ),
         ignore_index=ignore_index,
     )
 
@@ -381,6 +388,8 @@ def _multilabel_precision_recall_curve_reference(
 class TestMultilabelPrecisionRecallCurve(MetricTester):
     """Test multilabel precision-recall curve function and class."""
 
+    atol: float = 2e-7
+
     @pytest.mark.parametrize("inputs", _multilabel_cases(xp=anp)[2:])
     @pytest.mark.parametrize("thresholds", _thresholds(xp=anp))
     @pytest.mark.parametrize("ignore_index", [None, 0, -1])
diff --git a/tests/cyclops/evaluate/metrics/experimental/test_roc.py b/tests/cyclops/evaluate/metrics/experimental/test_roc.py
index ddc4f9556..17a4fff5a 100644
--- a/tests/cyclops/evaluate/metrics/experimental/test_roc.py
+++ b/tests/cyclops/evaluate/metrics/experimental/test_roc.py
@@ -1,4 +1,5 @@
 """Test roc curve metric."""
+
 from functools import partial
 from typing import List, Tuple, Union
 
@@ -45,9 +46,11 @@ def _binary_roc_reference(
     return tm_binary_roc(
         torch.utils.dlpack.from_dlpack(preds),
         torch.utils.dlpack.from_dlpack(target),
-        thresholds=torch.utils.dlpack.from_dlpack(thresholds)
-        if apc.is_array_api_obj(thresholds)
-        else thresholds,
+        thresholds=(
+            torch.utils.dlpack.from_dlpack(thresholds)
+            if apc.is_array_api_obj(thresholds)
+            else thresholds
+        ),
         ignore_index=ignore_index,
     )
 
@@ -215,9 +218,11 @@ def _multiclass_roc_reference(
         torch.utils.dlpack.from_dlpack(preds),
         torch.utils.dlpack.from_dlpack(target),
         num_classes,
-        thresholds=torch.utils.dlpack.from_dlpack(thresholds)
-        if apc.is_array_api_obj(thresholds)
-        else thresholds,
+        thresholds=(
+            torch.utils.dlpack.from_dlpack(thresholds)
+            if apc.is_array_api_obj(thresholds)
+            else thresholds
+        ),
         ignore_index=ignore_index,
     )
 
@@ -371,9 +376,11 @@ def _multilabel_roc_reference(
         torch.utils.dlpack.from_dlpack(preds),
         torch.utils.dlpack.from_dlpack(target),
         num_labels,
-        thresholds=torch.utils.dlpack.from_dlpack(thresholds)
-        if apc.is_array_api_obj(thresholds)
-        else thresholds,
+        thresholds=(
+            torch.utils.dlpack.from_dlpack(thresholds)
+            if apc.is_array_api_obj(thresholds)
+            else thresholds
+        ),
         ignore_index=ignore_index,
     )
 
@@ -381,6 +388,8 @@ def _multilabel_roc_reference(
 class TestMultilabelROC(MetricTester):
     """Test multilabel roc curve function and class."""
 
+    atol: float = 9e-8
+
     @pytest.mark.parametrize("inputs", _multilabel_cases(xp=anp)[2:])
     @pytest.mark.parametrize("thresholds", _thresholds(xp=anp))
     @pytest.mark.parametrize("ignore_index", [None, 0, -1])
diff --git a/tests/cyclops/evaluate/metrics/experimental/test_specificity.py b/tests/cyclops/evaluate/metrics/experimental/test_specificity.py
index 035edbada..b4f40b12c 100644
--- a/tests/cyclops/evaluate/metrics/experimental/test_specificity.py
+++ b/tests/cyclops/evaluate/metrics/experimental/test_specificity.py
@@ -1,4 +1,5 @@
 """Test specificity."""
+
 from functools import partial
 from typing import Literal, Optional
 
@@ -417,7 +418,7 @@ def test_multilabel_specificity_class_with_numpy_array_api_arrays(
         )
 
     @pytest.mark.integration_test()  # machine for integration tests has GPU
-    @pytest.mark.parametrize("inputs", _multilabel_cases(xp=anp))
+    @pytest.mark.parametrize("inputs", _multilabel_cases(xp=array_api_compat.torch))
     @pytest.mark.parametrize("average", [None, "micro", "macro", "weighted"])
     @pytest.mark.parametrize("ignore_index", [None, 0, -1])
     def test_multilabel_specificity_class_with_torch_tensors(
@@ -429,6 +430,11 @@ def test_multilabel_specificity_class_with_torch_tensors(
         """Test class for multilabel specificity with torch tensors."""
         target, preds = inputs
 
+        if ignore_index is not None:
+            target = _inject_ignore_index(target, ignore_index)
+
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+
         self.run_metric_class_implementation_test(
             target,
             preds,
@@ -446,6 +452,8 @@ def test_multilabel_specificity_class_with_torch_tensors(
                 "average": average,
                 "ignore_index": ignore_index,
             },
+            device=device,
+            use_device_for_ref=True,
         )
 
 
diff --git a/tests/cyclops/evaluate/metrics/experimental/testers.py b/tests/cyclops/evaluate/metrics/experimental/testers.py
index 4ae8775dc..4d58305d3 100644
--- a/tests/cyclops/evaluate/metrics/experimental/testers.py
+++ b/tests/cyclops/evaluate/metrics/experimental/testers.py
@@ -1,4 +1,5 @@
 """Testers for metrics."""
+
 from functools import partial
 from typing import Any, Callable, Dict, Optional, Sequence, Type
 
@@ -19,8 +20,20 @@ def _assert_allclose(
     """Recursively assert that two results are within a certain tolerance."""
     if apc.is_array_api_obj(cyclops_result) and apc.is_array_api_obj(ref_result):
         # move to cpu and convert to numpy
-        cyclops_result = np.from_dlpack(apc.to_device(cyclops_result, "cpu"))
-        ref_result = np.from_dlpack(apc.to_device(ref_result, "cpu"))
+        cyclops_result = np.from_dlpack(
+            (
+                apc.to_device(cyclops_result, "cpu")
+                if apc.device(cyclops_result) != "cpu"
+                else cyclops_result
+            ),
+        )
+        ref_result = np.from_dlpack(
+            (
+                apc.to_device(ref_result, "cpu")
+                if apc.device(ref_result) != "cpu"
+                else ref_result
+            ),
+        )
 
         np.testing.assert_allclose(
             cyclops_result,