From d44a53a97ea344fb99d8728dd4c461d6eb03d9c6 Mon Sep 17 00:00:00 2001
From: Mintas
Date: Tue, 26 Nov 2024 17:48:51 +0300
Subject: [PATCH] Refactor and optimize segmentation metrics evaluation

---
 mir_eval/segment.py | 468 ++++++++++++++++++++------------------------
 1 file changed, 212 insertions(+), 256 deletions(-)

diff --git a/mir_eval/segment.py b/mir_eval/segment.py
index 7a49d6ff..764e0ea5 100644
--- a/mir_eval/segment.py
+++ b/mir_eval/segment.py
@@ -73,6 +73,7 @@
 """
 
 import collections
+import functools
 import warnings
 
 import numpy as np
@@ -80,10 +81,10 @@
 import scipy.sparse
 import scipy.misc
 import scipy.special
+from typing import Sequence, Any
 
 from . import util
 
-
 def validate_boundary(reference_intervals, estimated_intervals, trim):
     """Check that the input annotations to a segment boundary estimation
     metric (i.e. one that only takes in segment intervals) look like valid
@@ -118,6 +119,17 @@ def validate_boundary(reference_intervals, estimated_intervals, trim):
     for intervals in [reference_intervals, estimated_intervals]:
         util.validate_intervals(intervals)
 
+
+def validated_trimmed_boundaries(reference_intervals, estimated_intervals, trim):
+    """Validate a reference/estimate interval pair and convert both to
+    boundary arrays, optionally suppressing the first and last boundary."""
+    validate_boundary(reference_intervals, estimated_intervals, trim)
+    # Convert intervals to boundaries
+    reference_boundaries = util.intervals_to_boundaries(reference_intervals)
+    estimated_boundaries = util.intervals_to_boundaries(estimated_intervals)
+
+    # Suppress the first and last boundaries
+    if trim:
+        reference_boundaries = reference_boundaries[1:-1]
+        estimated_boundaries = estimated_boundaries[1:-1]
+    return reference_boundaries, estimated_boundaries
+
 
 def validate_structure(
     reference_intervals, reference_labels, estimated_intervals, estimated_labels
@@ -165,9 +177,20 @@ def validate_structure(
     if not np.allclose(reference_intervals.max(), estimated_intervals.max()):
         raise ValueError("End times do not match")
 
+
+def _default_on_empty(default_return_value):
+    """Decorator: short-circuit a metric to ``default_return_value`` whenever
+    either index sequence is empty."""
+    def decorator_default_on_empty(func):
+        @functools.wraps(func)
+        def wrapper_default_on_empty(reference_indices, estimated_indices, *args, **kwargs):
+            # Check for empty annotations.  Don't need to check labels because
+            # validate_structure makes sure they're the same size as intervals
+            if len(reference_indices) == 0 or len(estimated_indices) == 0:
+                return default_return_value
+            return func(reference_indices, estimated_indices, *args, **kwargs)
+        return wrapper_default_on_empty
+    return decorator_default_on_empty
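
The decorator above centralizes the empty-annotation guard that was
previously copy-pasted into every metric.  A minimal sketch of its contract
(illustrative only, not part of the patch; ``_toy_metric`` is a made-up
stand-in for any decorated metric):

    import numpy as np

    @_default_on_empty(default_return_value=(0.0, 0.0, 0.0))
    def _toy_metric(reference_indices, estimated_indices):
        # Only reached when both inputs are non-empty
        return 1.0, 1.0, 1.0

    _toy_metric(np.array([]), np.array([0, 1]))   # -> (0.0, 0.0, 0.0)
    _toy_metric(np.array([2]), np.array([0, 1]))  # -> (1.0, 1.0, 1.0)
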
 
 def detection(
-    reference_intervals, estimated_intervals, window=0.5, beta=1.0, trim=False
+    reference_intervals, estimated_intervals, window=0.5, beta=1.0, trim=False,
 ):
     """Boundary detection hit-rate.
 
@@ -225,21 +248,15 @@
     f_measure : float
         F-measure (weighted harmonic mean of ``precision`` and ``recall``)
     """
-    validate_boundary(reference_intervals, estimated_intervals, trim)
-
-    # Convert intervals to boundaries
-    reference_boundaries = util.intervals_to_boundaries(reference_intervals)
-    estimated_boundaries = util.intervals_to_boundaries(estimated_intervals)
-
-    # Suppress the first and last intervals
-    if trim:
-        reference_boundaries = reference_boundaries[1:-1]
-        estimated_boundaries = estimated_boundaries[1:-1]
-
-    # If we have no boundaries, we get no score.
-    if len(reference_boundaries) == 0 or len(estimated_boundaries) == 0:
-        return 0.0, 0.0, 0.0
+    reference_boundaries, estimated_boundaries = validated_trimmed_boundaries(
+        reference_intervals, estimated_intervals, trim
+    )
+    return _compute_detection(reference_boundaries, estimated_boundaries, window, beta)
+
 
+# If we have no boundaries, we get no score.
+@_default_on_empty(default_return_value=(0.0, 0.0, 0.0))
+def _compute_detection(reference_boundaries, estimated_boundaries, window=0.5, beta=1.0):
     matching = util.match_events(reference_boundaries, estimated_boundaries, window)
 
     precision = float(len(matching)) / len(estimated_boundaries)
@@ -249,7 +266,6 @@
     return precision, recall, f_measure
 
-
 def deviation(reference_intervals, estimated_intervals, trim=False):
     """Compute the median deviations between reference
     and estimated boundary times.
@@ -284,22 +300,16 @@ def deviation(reference_intervals, estimated_intervals, trim=False):
     estimated_to_reference : float
         median time from each estimated boundary to the
         closest reference boundary
     """
-    validate_boundary(reference_intervals, estimated_intervals, trim)
-
-    # Convert intervals to boundaries
-    reference_boundaries = util.intervals_to_boundaries(reference_intervals)
-    estimated_boundaries = util.intervals_to_boundaries(estimated_intervals)
-
-    # Suppress the first and last intervals
-    if trim:
-        reference_boundaries = reference_boundaries[1:-1]
-        estimated_boundaries = estimated_boundaries[1:-1]
-
-    # If we have no boundaries, we get no score.
-    if len(reference_boundaries) == 0 or len(estimated_boundaries) == 0:
-        return np.nan, np.nan
-
+    reference_boundaries, estimated_boundaries = validated_trimmed_boundaries(
+        reference_intervals, estimated_intervals, trim
+    )
+    return _compute_deviation(reference_boundaries, estimated_boundaries)
+
 
+# If we have no boundaries, we get no score.
+@_default_on_empty(default_return_value=(np.nan, np.nan))
+def _compute_deviation(reference_boundaries, estimated_boundaries):
     dist = np.abs(np.subtract.outer(reference_boundaries, estimated_boundaries))
 
     estimated_to_reference = np.median(dist.min(axis=0))
@@ -308,6 +318,47 @@
     return reference_to_estimated, estimated_to_reference
 
 
+def _labeled_structure_metric(metric_calculator,
+                              reference_intervals, reference_labels,
+                              estimated_intervals, estimated_labels,
+                              frame_size=0.1, **calculator_kwargs):
+    """Validate labeled intervals, sample them into index space, and apply
+    ``metric_calculator`` to the resulting index sequences.
+
+    Parameters
+    ----------
+    metric_calculator : callable
+        function performing the actual metric computation; must have the
+        signature
+        ``metric_calculator(reference_indices, estimated_indices, **kwargs)``
+    """
+    y_ref, y_est = _structure_to_indices(reference_intervals, reference_labels,
+                                         estimated_intervals, estimated_labels,
+                                         frame_size)
+    return metric_calculator(y_ref, y_est, **calculator_kwargs)
+
+
+def _cluster_labels_to_index_space(intervals, labels, sample_size):
+    """Sample interval labels at a fixed frame rate and map them to index space."""
+    cluster_labels = util.intervals_to_samples(
+        intervals, labels, sample_size=sample_size
+    )[-1]
+    return util.index_labels(cluster_labels)[0]
+
+
+def _structure_to_indices(reference_intervals, reference_labels,
+                          estimated_intervals, estimated_labels,
+                          frame_size=0.1):
+    validate_structure(
+        reference_intervals, reference_labels, estimated_intervals, estimated_labels
+    )
+    # Check for empty annotations.  Don't need to check labels because
+    # validate_structure makes sure they're the same size as intervals
+    if reference_intervals.size == 0 or estimated_intervals.size == 0:
+        return np.array([]), np.array([])
+
+    # Generate the cluster labels and map them to index space
+    y_ref = _cluster_labels_to_index_space(reference_intervals, reference_labels, frame_size)
+    y_est = _cluster_labels_to_index_space(estimated_intervals, estimated_labels, frame_size)
+    return y_ref, y_est
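
To make the index-space representation concrete, here is a hypothetical
round-trip through the helper above (toy data; the exact integer ids
assigned by ``util.index_labels`` are an implementation detail):

    import numpy as np

    ref_intervals = np.array([[0.0, 1.0], [1.0, 2.0]])
    ref_labels = ["verse", "chorus"]
    est_intervals = np.array([[0.0, 2.0]])
    est_labels = ["A"]

    y_ref, y_est = _structure_to_indices(ref_intervals, ref_labels,
                                         est_intervals, est_labels,
                                         frame_size=0.5)
    # y_ref and y_est are equal-length integer arrays, one entry per
    # 0.5-second frame, e.g. y_ref -> [0, 0, 1, 1], y_est -> [0, 0, 0, 0]
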
 
 def pairwise(
     reference_intervals,
     reference_labels,
     estimated_intervals,
     estimated_labels,
@@ -368,41 +419,25 @@
         F-measure of detecting whether frames belong in the same cluster
 
     """
-    validate_structure(
-        reference_intervals, reference_labels, estimated_intervals, estimated_labels
-    )
-
-    # Check for empty annotations.  Don't need to check labels because
-    # validate_structure makes sure they're the same size as intervals
-    if reference_intervals.size == 0 or estimated_intervals.size == 0:
-        return 0.0, 0.0, 0.0
-
-    # Generate the cluster labels
-    y_ref = util.intervals_to_samples(
-        reference_intervals, reference_labels, sample_size=frame_size
-    )[-1]
-
-    y_ref = util.index_labels(y_ref)[0]
-
-    # Map to index space
-    y_est = util.intervals_to_samples(
-        estimated_intervals, estimated_labels, sample_size=frame_size
-    )[-1]
-
-    y_est = util.index_labels(y_est)[0]
+    return _labeled_structure_metric(_compute_pairwise,
+                                     reference_intervals, reference_labels,
+                                     estimated_intervals, estimated_labels,
+                                     frame_size=frame_size, beta=beta)
+
 
+@_default_on_empty(default_return_value=(0.0, 0.0, 0.0))
+def _compute_pairwise(reference_indices, estimated_indices, beta=1.0):
     # Build the reference label agreement matrix
-    agree_ref = np.equal.outer(y_ref, y_ref)
+    agree_ref = np.equal.outer(reference_indices, reference_indices)
 
     # Count the unique pairs
-    n_agree_ref = (agree_ref.sum() - len(y_ref)) / 2.0
+    n_agree_ref = (agree_ref.sum() - len(reference_indices)) / 2.0
 
     # Repeat for estimate
-    agree_est = np.equal.outer(y_est, y_est)
-    n_agree_est = (agree_est.sum() - len(y_est)) / 2.0
+    agree_est = np.equal.outer(estimated_indices, estimated_indices)
+    n_agree_est = (agree_est.sum() - len(estimated_indices)) / 2.0
 
     # Find where they agree
     matches = np.logical_and(agree_ref, agree_est)
-    n_matches = (matches.sum() - len(y_ref)) / 2.0
+    n_matches = (matches.sum() - len(reference_indices)) / 2.0
 
     precision = n_matches / n_agree_est
     recall = n_matches / n_agree_ref
@@ -410,7 +445,6 @@
     return precision, recall, f_measure
 
-
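
A tiny worked example of the pair-counting above (illustrative only): for an
index sequence of length 3, ``np.equal.outer`` builds a 3x3 boolean
agreement matrix whose diagonal is always True, so self-pairs must be
subtracted before halving:

    import numpy as np

    y = np.array([0, 0, 1])
    agree = np.equal.outer(y, y)   # 5 True entries: 3 diagonal + (0,1) + (1,0)
    n_agree = (agree.sum() - len(y)) / 2.0
    # n_agree == 1.0: the single same-label pair (frame 0, frame 1)
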
 def rand_index(
     reference_intervals,
     reference_labels,
     estimated_intervals,
     estimated_labels,
@@ -458,42 +492,25 @@
         length (in seconds) of frames for clustering
         (Default value = 0.1)
     beta : float > 0
-        beta value for F-measure
-        (Default value = 1.0)
+        Deprecated and ignored; scheduled for removal in 0.9.
 
     Returns
     -------
     rand_index : float > 0
         Rand index
     """
-    validate_structure(
-        reference_intervals, reference_labels, estimated_intervals, estimated_labels
-    )
-
-    # Check for empty annotations.  Don't need to check labels because
-    # validate_structure makes sure they're the same size as intervals
-    if reference_intervals.size == 0 or estimated_intervals.size == 0:
-        return 0.0, 0.0, 0.0
-
-    # Generate the cluster labels
-    y_ref = util.intervals_to_samples(
-        reference_intervals, reference_labels, sample_size=frame_size
-    )[-1]
-
-    y_ref = util.index_labels(y_ref)[0]
-
-    # Map to index space
-    y_est = util.intervals_to_samples(
-        estimated_intervals, estimated_labels, sample_size=frame_size
-    )[-1]
-
-    y_est = util.index_labels(y_est)[0]
+    return _labeled_structure_metric(_compute_rand_index,
+                                     reference_intervals, reference_labels,
+                                     estimated_intervals, estimated_labels,
+                                     frame_size=frame_size)
+
 
+@_default_on_empty(default_return_value=0.0)
+def _compute_rand_index(reference_indices, estimated_indices):
     # Build the reference label agreement matrix
-    agree_ref = np.equal.outer(y_ref, y_ref)
+    agree_ref = np.equal.outer(reference_indices, reference_indices)
 
     # Repeat for estimate
-    agree_est = np.equal.outer(y_est, y_est)
+    agree_est = np.equal.outer(estimated_indices, estimated_indices)
 
     # Find where they agree
     matches_pos = np.logical_and(agree_ref, agree_est)
@@ -501,15 +518,14 @@
     # Find where they disagree
     matches_neg = np.logical_and(~agree_ref, ~agree_est)
 
-    n_pairs = len(y_ref) * (len(y_ref) - 1) / 2.0
+    n_pairs = len(reference_indices) * (len(reference_indices) - 1) / 2.0
 
-    n_matches_pos = (matches_pos.sum() - len(y_ref)) / 2.0
+    n_matches_pos = (matches_pos.sum() - len(reference_indices)) / 2.0
     n_matches_neg = matches_neg.sum() / 2.0
 
     rand = (n_matches_pos + n_matches_neg) / n_pairs
 
     return rand
 
-
 def _contingency_matrix(reference_indices, estimated_indices):
     """Compute the contingency matrix of a true labeling vs an estimated one.
 
@@ -538,7 +554,7 @@
         dtype=np.int64,
     ).toarray()
 
-
+@_default_on_empty(default_return_value=0.0)
 def _adjusted_rand_index(reference_indices, estimated_indices):
     """Compute the Rand index, adjusted for chance.
 
@@ -572,16 +588,15 @@
     contingency = _contingency_matrix(reference_indices, estimated_indices)
 
     # Compute the ARI using the contingency data
-    sum_comb_c = sum(
-        scipy.special.comb(n_c, 2, exact=1) for n_c in contingency.sum(axis=1)
-    )
-    sum_comb_k = sum(
-        scipy.special.comb(n_k, 2, exact=1) for n_k in contingency.sum(axis=0)
-    )
+    def _sum_comb(contingency_data):
+        """Sum C(n, 2) over all counts in ``contingency_data``."""
+        return sum(
+            scipy.special.comb(n_i, 2, exact=1) for n_i in contingency_data
+        )
+
+    sum_comb_c = _sum_comb(contingency.sum(axis=1))
+    sum_comb_k = _sum_comb(contingency.sum(axis=0))
+    sum_comb = _sum_comb(contingency.flatten())
 
-    sum_comb = sum(
-        (scipy.special.comb(n_ij, 2, exact=1) for n_ij in contingency.flatten())
-    )
     prod_comb = (sum_comb_c * sum_comb_k) / float(scipy.special.comb(n_samples, 2))
     mean_comb = (sum_comb_k + sum_comb_c) / 2.0
     return (sum_comb - prod_comb) / (mean_comb - prod_comb)
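
To see what ``_sum_comb`` is summing, take a hypothetical contingency matrix
for y_ref = [0, 0, 1, 1] vs. y_est = [0, 1, 1, 1], which is [[1, 1], [0, 2]]:

    import scipy.special

    def comb2(values):
        return sum(scipy.special.comb(n, 2, exact=1) for n in values)

    comb2([2, 2])        # row sums   -> sum_comb_c == 2
    comb2([1, 3])        # col sums   -> sum_comb_k == 3
    comb2([1, 1, 0, 2])  # all cells  -> sum_comb   == 1

With n_samples = 4 these give prod_comb = 2 * 3 / 6 = 1.0 and
mean_comb = 2.5, so the ARI for this pair is (1 - 1) / (2.5 - 1) = 0.0,
i.e. no agreement beyond chance.
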
 
 def ari(
@@ -637,32 +652,12 @@
         Adjusted Rand index between segmentations.
 
     """
-    validate_structure(
-        reference_intervals, reference_labels, estimated_intervals, estimated_labels
-    )
-
-    # Check for empty annotations.  Don't need to check labels because
-    # validate_structure makes sure they're the same size as intervals
-    if reference_intervals.size == 0 or estimated_intervals.size == 0:
-        return 0.0, 0.0, 0.0
-
-    # Generate the cluster labels
-    y_ref = util.intervals_to_samples(
-        reference_intervals, reference_labels, sample_size=frame_size
-    )[-1]
-
-    y_ref = util.index_labels(y_ref)[0]
-
-    # Map to index space
-    y_est = util.intervals_to_samples(
-        estimated_intervals, estimated_labels, sample_size=frame_size
-    )[-1]
-
-    y_est = util.index_labels(y_est)[0]
-
-    return _adjusted_rand_index(y_ref, y_est)
-
+    return _labeled_structure_metric(_adjusted_rand_index,
+                                     reference_intervals, reference_labels,
+                                     estimated_intervals, estimated_labels,
+                                     frame_size=frame_size)
+
 
+@_default_on_empty(default_return_value=0.0)
 def _mutual_info_score(reference_indices, estimated_indices, contingency=None):
     """Compute the mutual information between two sequence labelings.
 
@@ -684,9 +679,7 @@
 
     """
     if contingency is None:
-        contingency = _contingency_matrix(reference_indices, estimated_indices).astype(
-            float
-        )
+        contingency = _contingency_matrix(
+            reference_indices, estimated_indices
+        ).astype(float)
     contingency_sum = np.sum(contingency)
     pi = np.sum(contingency, axis=1)
     pj = np.sum(contingency, axis=0)
@@ -732,7 +725,7 @@
 
     return -np.sum((pi / pi_sum) * (np.log(pi) - np.log(pi_sum)))
 
 
-def _adjusted_mutual_info_score(reference_indices, estimated_indices):
+def _adjusted_mutual_info_score(
+    reference_indices, estimated_indices, contingency=None, mutual_info_score=None
+):
     """Compute the mutual information between two sequence labelings, adjusted
     for chance.
 
@@ -761,13 +754,13 @@
         or ref_classes.shape[0] == est_classes.shape[0] == 0
     ):
         return 1.0
-    contingency = _contingency_matrix(reference_indices, estimated_indices).astype(
-        float
-    )
-    # Calculate the MI for the two clusterings
-    mi = _mutual_info_score(
-        reference_indices, estimated_indices, contingency=contingency
-    )
+    if contingency is None:
+        contingency = _contingency_matrix(
+            reference_indices, estimated_indices
+        ).astype(float)
+    if mutual_info_score is None:
+        # Calculate the MI for the two clusterings
+        mutual_info_score = _mutual_info_score(
+            reference_indices, estimated_indices, contingency=contingency
+        )
     # The following code is based on
     # sklearn.metrics.cluster.expected_mutual_information
     R, C = contingency.shape
@@ -821,11 +814,11 @@
             emi += term1[nij] * term2 * term3
     # Calculate entropy for each labeling
     h_true, h_pred = _entropy(reference_indices), _entropy(estimated_indices)
-    ami = (mi - emi) / (max(h_true, h_pred) - emi)
+    ami = (mutual_info_score - emi) / (max(h_true, h_pred) - emi)
     return ami
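
The new optional ``contingency`` and ``mutual_info_score`` arguments let
callers reuse intermediate results instead of recomputing them per metric.
A hypothetical sanity check (toy labelings; assumes the private helpers
above are in scope):

    import numpy as np

    y_ref = np.array([0, 0, 1, 1])
    y_est = np.array([0, 1, 1, 1])

    c = _contingency_matrix(y_ref, y_est).astype(float)
    mi = _mutual_info_score(y_ref, y_est, contingency=c)

    # Passing the cached values should not change the result:
    assert np.isclose(
        _adjusted_mutual_info_score(y_ref, y_est),
        _adjusted_mutual_info_score(y_ref, y_est,
                                    contingency=c, mutual_info_score=mi),
    )
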
 
-def _normalized_mutual_info_score(reference_indices, estimated_indices):
+def _normalized_mutual_info_score(
+    reference_indices, estimated_indices, contingency=None, mutual_info_score=None
+):
     """Compute the normalized mutual information between two sequence
     labelings.
 
@@ -852,18 +845,17 @@
         or ref_classes.shape[0] == est_classes.shape[0] == 0
     ):
         return 1.0
-    contingency = _contingency_matrix(reference_indices, estimated_indices).astype(
-        float
-    )
-    contingency = np.array(contingency, dtype="float")
-    # Calculate the MI for the two clusterings
-    mi = _mutual_info_score(
-        reference_indices, estimated_indices, contingency=contingency
-    )
+    if contingency is None:
+        contingency = _contingency_matrix(
+            reference_indices, estimated_indices
+        ).astype(float)
+    if mutual_info_score is None:
+        # Calculate the MI for the two clusterings
+        mutual_info_score = _mutual_info_score(
+            reference_indices, estimated_indices, contingency=contingency
+        )
     # Calculate entropy for each labeling
     h_true, h_pred = _entropy(reference_indices), _entropy(estimated_indices)
-    nmi = mi / max(np.sqrt(h_true * h_pred), 1e-10)
+    nmi = mutual_info_score / max(np.sqrt(h_true * h_pred), 1e-10)
 
     return nmi
 
@@ -923,37 +915,20 @@
         Normalized mutual information between segmentations
 
     """
-    validate_structure(
-        reference_intervals, reference_labels, estimated_intervals, estimated_labels
-    )
-
-    # Check for empty annotations.  Don't need to check labels because
-    # validate_structure makes sure they're the same size as intervals
-    if reference_intervals.size == 0 or estimated_intervals.size == 0:
-        return 0.0, 0.0, 0.0
-
-    # Generate the cluster labels
-    y_ref = util.intervals_to_samples(
-        reference_intervals, reference_labels, sample_size=frame_size
-    )[-1]
-
-    y_ref = util.index_labels(y_ref)[0]
-
-    # Map to index space
-    y_est = util.intervals_to_samples(
-        estimated_intervals, estimated_labels, sample_size=frame_size
-    )[-1]
-
-    y_est = util.index_labels(y_est)[0]
-
+    return _labeled_structure_metric(_compute_mutual_information,
+                                     reference_intervals, reference_labels,
+                                     estimated_intervals, estimated_labels,
+                                     frame_size=frame_size)
+
+
+@_default_on_empty(default_return_value=(0.0, 0.0, 0.0))
+def _compute_mutual_information(reference_indices, estimated_indices):
+    """Compute MI, adjusted MI, and normalized MI from one shared
+    contingency matrix."""
+    contingency = _contingency_matrix(
+        reference_indices, estimated_indices
+    ).astype(float)
+
     # Mutual information
-    mutual_info = _mutual_info_score(y_ref, y_est)
-
+    mutual_info = _mutual_info_score(
+        reference_indices, estimated_indices, contingency=contingency
+    )
     # Adjusted mutual information
-    adj_mutual_info = _adjusted_mutual_info_score(y_ref, y_est)
-
+    adj_mutual_info = _adjusted_mutual_info_score(
+        reference_indices, estimated_indices,
+        contingency=contingency, mutual_info_score=mutual_info
+    )
     # Normalized mutual information
-    norm_mutual_info = _normalized_mutual_info_score(y_ref, y_est)
+    norm_mutual_info = _normalized_mutual_info_score(
+        reference_indices, estimated_indices,
+        contingency=contingency, mutual_info_score=mutual_info
+    )
 
     return mutual_info, adj_mutual_info, norm_mutual_info
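
With the shared contingency table, a single call now produces all three
scores (sketch; assumes the toy arrays from the previous example):

    mi, ami, nmi = _compute_mutual_information(y_ref, y_est)
    # Equivalent to calling the three *_score helpers separately, but the
    # contingency matrix and MI are computed exactly once.
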
 
@@ -1037,29 +1012,13 @@
     S_F
         F-measure for (S_over, S_under)
 
     """
-    validate_structure(
-        reference_intervals, reference_labels, estimated_intervals, estimated_labels
-    )
-
-    # Check for empty annotations.  Don't need to check labels because
-    # validate_structure makes sure they're the same size as intervals
-    if reference_intervals.size == 0 or estimated_intervals.size == 0:
-        return 0.0, 0.0, 0.0
-
-    # Generate the cluster labels
-    y_ref = util.intervals_to_samples(
-        reference_intervals, reference_labels, sample_size=frame_size
-    )[-1]
-
-    y_ref = util.index_labels(y_ref)[0]
-
-    # Map to index space
-    y_est = util.intervals_to_samples(
-        estimated_intervals, estimated_labels, sample_size=frame_size
-    )[-1]
-
-    y_est = util.index_labels(y_est)[0]
+    return _labeled_structure_metric(_compute_nce,
+                                     reference_intervals, reference_labels,
+                                     estimated_intervals, estimated_labels,
+                                     frame_size=frame_size, beta=beta,
+                                     marginal=marginal)
+
 
+@_default_on_empty(default_return_value=(0.0, 0.0, 0.0))
+def _compute_nce(y_ref, y_est, beta=1.0, marginal=False):
     # Make the contingency table: shape = (n_ref, n_est)
     contingency = _contingency_matrix(y_ref, y_est).astype(float)
 
@@ -1178,6 +1137,35 @@ def vmeasure(
         marginal=True,
     )
 
+
+def return_mapping(mapping: Sequence[Any], some_func, *args, **kwargs) -> dict:
+    """Call ``some_func`` and return its results keyed by ``mapping``.
+
+    Note that this helper performs no type or length checks; callers are
+    responsible for supplying one key per returned value.
+
+    Parameters
+    ----------
+    mapping : Sequence
+        keys for the output dict, in the same order as the return
+        values of ``some_func``
+    some_func : callable
+        function to invoke with ``*args`` and ``**kwargs``
+
+    Returns
+    -------
+    dict
+        return values of ``some_func``, keyed by the entries of ``mapping``
+    """
+    return_res = some_func(*args, **kwargs)
+    return _return_values_to_mapping(mapping, return_res)
+
+
+def _return_values_to_mapping(mapping: Sequence[Any], values, accumulator=None):
+    if accumulator is None:
+        accumulator = {}
+    if not isinstance(values, tuple):
+        accumulator[mapping[0]] = values
+    else:
+        for key, value in zip(mapping, values):
+            accumulator[key] = value
+    return accumulator
+
+
+def _accumulate_metrics(metric_func, metric_names, metric_accumulator, *args, **kwargs):
+    return_res = util.filter_kwargs(metric_func, *args, **kwargs)
+    return _return_values_to_mapping(metric_names, return_res, metric_accumulator)
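
Behavior of the mapping helper on tuple vs. scalar results (illustrative):

    _return_values_to_mapping(["P", "R", "F"], (0.5, 0.7, 0.58))
    # -> {"P": 0.5, "R": 0.7, "F": 0.58}

    _return_values_to_mapping(["Rand Index"], 0.93)
    # -> {"Rand Index": 0.93}; a non-tuple result maps to the first key
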
 
 def evaluate(ref_intervals, ref_labels, est_intervals, est_labels, **kwargs):
     """Compute all metrics for the given reference and estimated annotations.
 
@@ -1227,71 +1215,39 @@ def evaluate(ref_intervals, ref_labels, est_intervals, est_labels, **kwargs):
     # Now compute all the metrics
     scores = collections.OrderedDict()
 
-    # Boundary detection
+    # Boundary metrics (intervals only, no structure labels)
+    trim = kwargs.get("trim", False)
+    reference_boundaries, estimated_boundaries = validated_trimmed_boundaries(
+        ref_intervals, est_intervals, trim
+    )
+
+    def with_boundary_metrics(metric_func, metric_names):
+        return _accumulate_metrics(metric_func, metric_names, scores,
+                                   reference_boundaries, estimated_boundaries,
+                                   **kwargs)
+
+    # Boundary deviation
+    with_boundary_metrics(_compute_deviation,
+                          ["Ref-to-est deviation", "Est-to-ref deviation"])
+
+    # Boundary detection
     # Force these values for window
     kwargs["window"] = 0.5
-    (
-        scores["Precision@0.5"],
-        scores["Recall@0.5"],
-        scores["F-measure@0.5"],
-    ) = util.filter_kwargs(detection, ref_intervals, est_intervals, **kwargs)
-
+    with_boundary_metrics(_compute_detection,
+                          ["Precision@0.5", "Recall@0.5", "F-measure@0.5"])
     kwargs["window"] = 3.0
-    (
-        scores["Precision@3.0"],
-        scores["Recall@3.0"],
-        scores["F-measure@3.0"],
-    ) = util.filter_kwargs(detection, ref_intervals, est_intervals, **kwargs)
-
-    # Boundary deviation
-    scores["Ref-to-est deviation"], scores["Est-to-ref deviation"] = util.filter_kwargs(
-        deviation, ref_intervals, est_intervals, **kwargs
-    )
-
-    # Pairwise clustering
-    (
-        scores["Pairwise Precision"],
-        scores["Pairwise Recall"],
-        scores["Pairwise F-measure"],
-    ) = util.filter_kwargs(
-        pairwise, ref_intervals, ref_labels, est_intervals, est_labels, **kwargs
-    )
-
-    # Rand index
-    scores["Rand Index"] = util.filter_kwargs(
-        rand_index, ref_intervals, ref_labels, est_intervals, est_labels, **kwargs
-    )
-    # Adjusted rand index
-    scores["Adjusted Rand Index"] = util.filter_kwargs(
-        ari, ref_intervals, ref_labels, est_intervals, est_labels, **kwargs
-    )
-
-    # Mutual information metrics
-    (
-        scores["Mutual Information"],
-        scores["Adjusted Mutual Information"],
-        scores["Normalized Mutual Information"],
-    ) = util.filter_kwargs(
-        mutual_information,
-        ref_intervals,
-        ref_labels,
-        est_intervals,
-        est_labels,
-        **kwargs
-    )
+    with_boundary_metrics(_compute_detection,
+                          ["Precision@3.0", "Recall@3.0", "F-measure@3.0"])
+
+    # Structure (labeled segmentation) metrics
+    frame_size = kwargs.get("frame_size", 0.1)
+    ref_indices, est_indices = _structure_to_indices(ref_intervals, ref_labels,
+                                                     est_intervals, est_labels,
+                                                     frame_size)
+
+    def with_structure_metrics(metric_func, metric_names):
+        return _accumulate_metrics(metric_func, metric_names, scores,
+                                   ref_indices, est_indices, **kwargs)
+
+    # Pairwise clustering
+    with_structure_metrics(_compute_pairwise,
+                           ["Pairwise Precision", "Pairwise Recall",
+                            "Pairwise F-measure"])
+    # Rand index
+    with_structure_metrics(_compute_rand_index, ["Rand Index"])
+    # Adjusted rand index
+    with_structure_metrics(_adjusted_rand_index, ["Adjusted Rand Index"])
+    # Mutual information metrics
+    with_structure_metrics(_compute_mutual_information,
+                           ["Mutual Information", "Adjusted Mutual Information",
+                            "Normalized Mutual Information"])
 
     # Conditional entropy metrics
-    (
-        scores["NCE Over"],
-        scores["NCE Under"],
-        scores["NCE F-measure"],
-    ) = util.filter_kwargs(
-        nce, ref_intervals, ref_labels, est_intervals, est_labels, **kwargs
-    )
-
+    kwargs["marginal"] = False
+    with_structure_metrics(_compute_nce,
+                           ["NCE Over", "NCE Under", "NCE F-measure"])
+
     # V-measure metrics
-    scores["V Precision"], scores["V Recall"], scores["V-measure"] = util.filter_kwargs(
-        vmeasure, ref_intervals, ref_labels, est_intervals, est_labels, **kwargs
-    )
+    kwargs["marginal"] = True
+    with_structure_metrics(_compute_nce,
+                           ["V Precision", "V Recall", "V-measure"])
 
     return scores
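
End-to-end, the public entry point is unchanged.  A hypothetical smoke test
of the refactored ``evaluate`` (toy annotations; the score values depend on
the metric definitions above):

    import numpy as np
    import mir_eval

    ref_intervals = np.array([[0.0, 10.0], [10.0, 20.0]])
    ref_labels = ["verse", "chorus"]
    est_intervals = np.array([[0.0, 11.0], [11.0, 20.0]])
    est_labels = ["A", "B"]

    scores = mir_eval.segment.evaluate(ref_intervals, ref_labels,
                                       est_intervals, est_labels)
    for name, value in scores.items():
        print(name, value)
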