From e6672d9af99b3c0cbba6650231ae1423ba1624e9 Mon Sep 17 00:00:00 2001
From: Domagoj Fijan <domagoj.fijan@gmail.com>
Date: Tue, 9 Apr 2024 13:00:52 -0400
Subject: [PATCH] remove unused file

---
 validity.py | 922 ----------------------------------------------------
 1 file changed, 922 deletions(-)
 delete mode 100644 validity.py

diff --git a/validity.py b/validity.py
deleted file mode 100644
index 38d49eb..0000000
--- a/validity.py
+++ /dev/null
@@ -1,922 +0,0 @@
-# %%
-import time
-import numpy as np
-from numpy import isclose
-from sklearn.metrics import pairwise_distances
-from scipy.spatial.distance import cdist, euclidean
-from sklearn.cluster._hdbscan._linkage import (
-    mst_from_data_matrix,
-    mst_from_mutual_reachability,
-)
-from sklearn.cluster._hdbscan._reachability import mutual_reachability_graph
-from sklearn.cluster._hdbscan._tree import _condense_tree
-from scipy.sparse.csgraph import minimum_spanning_tree
-from scipy.sparse import csgraph
-from functools import reduce, partial
-
-
-def DBCV(X, labels, dist_function=euclidean):
-    """
-    Density Based clustering validation
-
-    Args:
-        X (np.ndarray): ndarray with dimensions [n_samples, n_features]
-            data to check validity of clustering
-        labels (np.array): clustering assignments for data X
-        dist_dunction (func): function to determine distance between objects
-            func args must be [np.array, np.array] where each array is a point
-
-    Returns: cluster_validity (float)
-        score in range[-1, 1] indicating validity of clustering assignments
-    """
-    graph = _mutual_reach_dist_graph(X, labels, dist_function)
-    mst = _mutual_reach_dist_MST(graph)
-    cluster_validity = _clustering_validity_index(mst, labels)
-    return cluster_validity
-
-
-def _mutual_reach_dist_graph(X, labels, dist_function):
-    """
-    Computes the mutual reach distance complete graph.
-    Graph of all pair-wise mutual reachability distances between points
-
-    Args:
-        X (np.ndarray): ndarray with dimensions [n_samples, n_features]
-            data to check validity of clustering
-        labels (np.array): clustering assignments for data X
-        dist_dunction (func): function to determine distance between objects
-            func args must be [np.array, np.array] where each array is a point
-
-    Returns: graph (np.ndarray)
-        array of dimensions (n_samples, n_samples)
-        Graph of all pair-wise mutual reachability distances between points.
-
-    """
-    if isinstance(dist_function, int):
-        n_samples = np.shape(X)[0]
-        n_features = dist_function
-        dists = X
-        graph = np.empty_like(X)
-    else:
-        n_samples = np.shape(X)[0]
-        n_features = np.shape(X)[1]
-        dists = cdist(X, X, dist_function)
-        # If we're calculating the distances ourselves, might as well reuse the array for later
-        graph = dists
-
-    core_dists = np.empty(n_samples)
-    for label in np.unique(labels):
-        mask = labels == label
-        distance_vectors = dists[mask, :][:, mask]
-        n_neighbors = distance_vectors.shape[0]
-        z = distance_vectors == 0
-        distance_vectors[z] = np.nan
-        numerator = np.nansum((1 / distance_vectors) ** n_features, axis=1)
-        cluster_core_dists = (numerator / (n_neighbors - 1)) ** (-1 / n_features)
-        core_dists[mask] = cluster_core_dists
-
-    distances = np.broadcast_arrays(
-        dists, core_dists[:, np.newaxis], core_dists[np.newaxis, :]
-    )
-    reduce(partial(np.maximum, out=graph), distances)
-    return graph
-
-
-def _mutual_reach_dist_MST(dist_tree):
-    """
-    Computes minimum spanning tree of the mutual reach distance complete graph
-
-    Args:
-        dist_tree (np.ndarray): array of dimensions (n_samples, n_samples)
-            Graph of all pair-wise mutual reachability distances
-            between points.
-
-    Returns: minimum_spanning_tree (np.ndarray)
-        array of dimensions (n_samples, n_samples)
-        minimum spanning tree of all pair-wise mutual reachability
-            distances between points.
-    """
-    mst = minimum_spanning_tree(dist_tree).toarray()
-    return mst + np.transpose(mst)
-
-
-def _cluster_density_sparseness(MST, labels, cluster):
-    """
-    Computes the cluster density sparseness, the minimum density
-        within a cluster
-
-    Args:
-        MST (np.ndarray): minimum spanning tree of all pair-wise
-            mutual reachability distances between points.
-        labels (np.array): clustering assignments for data X
-        cluster (int): cluster of interest
-
-    Returns: cluster_density_sparseness (float)
-        value corresponding to the minimum density within a cluster
-    """
-    indices = np.where(labels == cluster)[0]
-    cluster_MST = MST[indices][:, indices]
-    cluster_density_sparseness = np.max(cluster_MST)
-    return cluster_density_sparseness
-
-
-def _cluster_density_separation(MST, labels, cluster_i, cluster_j):
-    """
-    Computes the density separation between two clusters, the maximum
-        density between clusters.
-
-    Args:
-        MST (np.ndarray): minimum spanning tree of all pair-wise
-            mutual reachability distances between points.
-        labels (np.array): clustering assignments for data X
-        cluster_i (int): cluster i of interest
-        cluster_j (int): cluster j of interest
-
-    Returns: density_separation (float):
-        value corresponding to the maximum density between clusters
-    """
-    indices_i = np.where(labels == cluster_i)[0]
-    indices_j = np.where(labels == cluster_j)[0]
-    shortest_paths = csgraph.dijkstra(MST, indices=indices_i)
-    relevant_paths = shortest_paths[:, indices_j]
-    density_separation = np.min(relevant_paths)
-    return density_separation
-
-
-def _cluster_validity_index(MST, labels, cluster):
-    """
-    Computes the validity of a cluster (validity of assignmnets)
-
-    Args:
-        MST (np.ndarray): minimum spanning tree of all pair-wise
-            mutual reachability distances between points.
-        labels (np.array): clustering assignments for data X
-        cluster (int): cluster of interest
-
-    Returns: cluster_validity (float)
-        value corresponding to the validity of cluster assignments
-    """
-    min_density_separation = np.inf
-    for cluster_j in np.unique(labels):
-        if cluster_j != cluster:
-            cluster_density_separation = _cluster_density_separation(
-                MST, labels, cluster, cluster_j
-            )
-            if cluster_density_separation < min_density_separation:
-                min_density_separation = cluster_density_separation
-    cluster_density_sparseness = _cluster_density_sparseness(MST, labels, cluster)
-    numerator = min_density_separation - cluster_density_sparseness
-    denominator = np.max([min_density_separation, cluster_density_sparseness])
-    cluster_validity = numerator / denominator
-    return cluster_validity
-
-
-def _clustering_validity_index(MST, labels):
-    """
-    Computes the validity of all clustering assignments for a
-    clustering algorithm
-
-    Args:
-        MST (np.ndarray): minimum spanning tree of all pair-wise
-            mutual reachability distances between points.
-        labels (np.array): clustering assignments for data X
-
-    Returns: validity_index (float):
-        score in range[-1, 1] indicating validity of clustering assignments
-    """
-    n_samples = len(labels)
-    validity_index = 0
-    for label in np.unique(labels):
-        fraction = np.sum(labels == label) / float(n_samples)
-        cluster_validity = _cluster_validity_index(MST, labels, label)
-        validity_index += fraction * cluster_validity
-    return validity_index
-
-
-################# HDBSCAN's validity
-def all_points_core_distance(distance_matrix, d=2.0):
-    """
-    Compute the all-points-core-distance for all the points of a cluster.
-
-    Parameters
-    ----------
-    distance_matrix : array (cluster_size, cluster_size)
-        The pairwise distance matrix between points in the cluster.
-
-    d : integer
-        The dimension of the data set, which is used in the computation
-        of the all-point-core-distance as per the paper.
-
-    Returns
-    -------
-    core_distances : array (cluster_size,)
-        The all-points-core-distance of each point in the cluster
-
-    References
-    ----------
-    Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J.,
-    2014. Density-Based Clustering Validation. In SDM (pp. 839-847).
-    """
-    distance_matrix[distance_matrix != 0] = (
-        1.0 / distance_matrix[distance_matrix != 0]
-    ) ** d
-    result = distance_matrix.sum(axis=1)
-    result /= distance_matrix.shape[0] - 1
-
-    if result.sum() == 0:
-        result = np.zeros(len(distance_matrix))
-    else:
-        result **= -1.0 / d
-
-    return result
-
-
-def max_ratio(stacked_distances):
-    # Extract the distances and core distances
-    distances = stacked_distances[:, :, 0]
-    coredists = stacked_distances[:, :, 1]
-
-    # Replace zeros in distances with ones to avoid division by zero
-    distances[distances == 0] = 1
-
-    # Compute the ratios
-    ratios = coredists / distances
-
-    # Find the maximum ratio greater than zero
-    max_ratio = ratios[ratios > 0].max()
-
-    return max_ratio
-
-
-def distances_between_points(
-    X,
-    labels,
-    cluster_id,
-    metric="euclidean",
-    d=None,
-    no_coredist=False,
-    print_max_raw_to_coredist_ratio=False,
-    **kwd_args
-):
-    """
-    Compute pairwise distances for all the points of a cluster.
-
-    If metric is 'precomputed' then assume X is a distance matrix for the full
-    dataset. Note that in this case you must pass in 'd' the dimension of the
-    dataset.
-
-    Parameters
-    ----------
-    X : array (n_samples, n_features) or (n_samples, n_samples)
-        The input data of the clustering. This can be the data, or, if
-        metric is set to `precomputed` the pairwise distance matrix used
-        for the clustering.
-
-    labels : array (n_samples)
-        The label array output by the clustering, providing an integral
-        cluster label to each data point, with -1 for noise points.
-
-    cluster_id : integer
-        The cluster label for which to compute the distances
-
-    metric : string
-        The metric used to compute distances for the clustering (and
-        to be re-used in computing distances for mr distance). If
-        set to `precomputed` then X is assumed to be the precomputed
-        distance matrix between samples.
-
-    d : integer (or None)
-        The number of features (dimension) of the dataset. This need only
-        be set in the case of metric being set to `precomputed`, where
-        the ambient dimension of the data is unknown to the function.
-
-    **kwd_args :
-        Extra arguments to pass to the distance computation for other
-        metrics, such as minkowski, Mahanalobis etc.
-
-    Returns
-    -------
-
-    distances : array (n_samples, n_samples)
-        The distances between all points in `X` with `label` equal to `cluster_id`.
-
-    core_distances : array (n_samples,)
-        The all-points-core_distance of all points in `X` with `label` equal
-        to `cluster_id`.
-
-    References
-    ----------
-    Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J.,
-    2014. Density-Based Clustering Validation. In SDM (pp. 839-847).
-    """
-    if metric == "precomputed":
-        if d is None:
-            raise ValueError("If metric is precomputed a " "d value must be provided!")
-        distance_matrix = X[labels == cluster_id, :][:, labels == cluster_id]
-    else:
-        subset_X = X[labels == cluster_id, :]
-        distance_matrix = pairwise_distances(subset_X, metric=metric, **kwd_args)
-        d = X.shape[1]
-
-    if no_coredist:
-        return distance_matrix, None
-
-    else:
-        core_distances = all_points_core_distance(distance_matrix.copy(), d=d)
-        core_dist_matrix = np.tile(core_distances, (core_distances.shape[0], 1))
-        stacked_distances = np.dstack(
-            [distance_matrix, core_dist_matrix, core_dist_matrix.T]
-        )
-
-        if print_max_raw_to_coredist_ratio:
-            print(
-                "Max raw distance to coredistance ratio: "
-                + str(max_ratio(stacked_distances))
-            )
-
-        return stacked_distances.max(axis=-1), core_distances
-
-
-def convert_mst_output(mst_edges):
-    result = np.zeros((mst_edges.shape[0], 3))
-    result[:, 0] = mst_edges["current_node"]
-    result[:, 1] = mst_edges["next_node"]
-    result[:, 2] = mst_edges["distance"]
-    return result
-
-
-def internal_minimum_spanning_tree(mr_distances):
-    """
-    Compute the 'internal' minimum spanning tree given a matrix of mutual
-    reachability distances. Given a minimum spanning tree the 'internal'
-    graph is the subgraph induced by vertices of degree greater than one.
-
-    Parameters
-    ----------
-    mr_distances : array (cluster_size, cluster_size)
-        The pairwise mutual reachability distances, inferred to be the edge
-        weights of a complete graph. Since MSTs are computed per cluster
-        this is the all-points-mutual-reacability for points within a single
-        cluster.
-
-    Returns
-    -------
-    internal_nodes : array
-        An array listing the indices of the internal nodes of the MST
-
-    internal_edges : array (?, 3)
-        An array of internal edges in weighted edge list format; that is
-        an edge is an array of length three listing the two vertices
-        forming the edge and weight of the edge.
-
-    References
-    ----------
-    Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J.,
-    2014. Density-Based Clustering Validation. In SDM (pp. 839-847).
-    """
-    # Compute the MST using mst_from_mutual_reachability
-    mst_edges = mst_from_mutual_reachability(mr_distances)
-
-    # Convert to the desired format
-    single_linkage_data = convert_mst_output(mst_edges)
-
-    min_span_tree = single_linkage_data.copy()
-    for index, row in enumerate(min_span_tree[1:], 1):
-        candidates = np.where(isclose(mr_distances[int(row[1])], row[2]))[0]
-        candidates = np.intersect1d(
-            candidates, single_linkage_data[:index, :2].astype(int)
-        )
-        candidates = candidates[candidates != row[1]]
-        assert len(candidates) > 0
-        row[0] = candidates[0]
-
-    vertices = np.arange(mr_distances.shape[0])[
-        np.bincount(min_span_tree.T[:2].flatten().astype(np.intp)) > 1
-    ]
-    if not len(vertices):
-        vertices = [0]
-    # A little "fancy" we select from the flattened array reshape back
-    # (Fortran format to get indexing right) and take the product to do an and
-    # then convert back to boolean type.
-    edge_selection = np.prod(
-        np.in1d(min_span_tree.T[:2], vertices).reshape(
-            (min_span_tree.shape[0], 2), order="F"
-        ),
-        axis=1,
-    ).astype(bool)
-
-    # Density sparseness is not well defined if there are no
-    # internal edges (as per the referenced paper). However
-    # MATLAB code from the original authors simply selects the
-    # largest of *all* the edges in the case that there are
-    # no internal edges, so we do the same here
-    if np.any(edge_selection):
-        # If there are any internal edges, then subselect them out
-        edges = min_span_tree[edge_selection]
-    else:
-        # If there are no internal edges then we want to take the
-        # max over all the edges that exist in the MST, so we simply
-        # do nothing and return all the edges in the MST.
-        edges = min_span_tree.copy()
-
-    return vertices, edges
-
-
-def density_separation(
-    X,
-    labels,
-    cluster_id1,
-    cluster_id2,
-    internal_nodes1,
-    internal_nodes2,
-    core_distances1,
-    core_distances2,
-    metric="euclidean",
-    no_coredist=False,
-    **kwd_args
-):
-    """
-    Compute the density separation between two clusters. This is the minimum
-    distance between pairs of points, one from internal nodes of MSTs of each cluster.
-
-    Parameters
-    ----------
-    X : array (n_samples, n_features) or (n_samples, n_samples)
-        The input data of the clustering. This can be the data, or, if
-        metric is set to `precomputed` the pairwise distance matrix used
-        for the clustering.
-
-    labels : array (n_samples)
-        The label array output by the clustering, providing an integral
-        cluster label to each data point, with -1 for noise points.
-
-    cluster_id1 : integer
-        The first cluster label to compute separation between.
-
-    cluster_id2 : integer
-        The second cluster label to compute separation between.
-
-    internal_nodes1 : array
-        The vertices of the MST for `cluster_id1` that were internal vertices.
-
-    internal_nodes2 : array
-        The vertices of the MST for `cluster_id2` that were internal vertices.
-
-    core_distances1 : array (size of cluster_id1,)
-        The all-points-core_distances of all points in the cluster
-        specified by cluster_id1.
-
-    core_distances2 : array (size of cluster_id2,)
-        The all-points-core_distances of all points in the cluster
-        specified by cluster_id2.
-
-    metric : string
-        The metric used to compute distances for the clustering (and
-        to be re-used in computing distances for mr distance). If
-        set to `precomputed` then X is assumed to be the precomputed
-        distance matrix between samples.
-
-    **kwd_args :
-        Extra arguments to pass to the distance computation for other
-        metrics, such as minkowski, Mahanalobis etc.
-
-    Returns
-    -------
-    The 'density separation' between the clusters specified by
-    `cluster_id1` and `cluster_id2`.
-
-    References
-    ----------
-    Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J.,
-    2014. Density-Based Clustering Validation. In SDM (pp. 839-847).
-    """
-    if metric == "precomputed":
-        sub_select = X[labels == cluster_id1, :][:, labels == cluster_id2]
-        distance_matrix = sub_select[internal_nodes1, :][:, internal_nodes2]
-    else:
-        cluster1 = X[labels == cluster_id1][internal_nodes1]
-        cluster2 = X[labels == cluster_id2][internal_nodes2]
-        distance_matrix = cdist(cluster1, cluster2, metric, **kwd_args)
-
-    if no_coredist:
-        return distance_matrix.min()
-
-    else:
-        core_dist_matrix1 = np.tile(
-            core_distances1[internal_nodes1], (distance_matrix.shape[1], 1)
-        ).T
-        core_dist_matrix2 = np.tile(
-            core_distances2[internal_nodes2], (distance_matrix.shape[0], 1)
-        )
-
-        mr_dist_matrix = np.dstack(
-            [distance_matrix, core_dist_matrix1, core_dist_matrix2]
-        ).max(axis=-1)
-
-        return mr_dist_matrix.min()
-
-
-def validity_index(
-    X,
-    labels,
-    metric="euclidean",
-    d=None,
-    per_cluster_scores=False,
-    mst_raw_dist=False,
-    verbose=False,
-    **kwd_args
-):
-    """
-    Compute the density based cluster validity index for the
-    clustering specified by `labels` and for each cluster in `labels`.
-
-    Parameters
-    ----------
-    X : array (n_samples, n_features) or (n_samples, n_samples)
-        The input data of the clustering. This can be the data, or, if
-        metric is set to `precomputed` the pairwise distance matrix used
-        for the clustering.
-
-    labels : array (n_samples)
-        The label array output by the clustering, providing an integral
-        cluster label to each data point, with -1 for noise points.
-
-    metric : optional, string (default 'euclidean')
-        The metric used to compute distances for the clustering (and
-        to be re-used in computing distances for mr distance). If
-        set to `precomputed` then X is assumed to be the precomputed
-        distance matrix between samples.
-
-    d : optional, integer (or None) (default None)
-        The number of features (dimension) of the dataset. This need only
-        be set in the case of metric being set to `precomputed`, where
-        the ambient dimension of the data is unknown to the function.
-
-    per_cluster_scores : optional, boolean (default False)
-        Whether to return the validity index for individual clusters.
-        Defaults to False with the function returning a single float
-        value for the whole clustering.
-
-    mst_raw_dist : optional, boolean (default False)
-        If True, the MST's are constructed solely via 'raw' distances (depending on the given metric, e.g. euclidean distances)
-        instead of using mutual reachability distances. Thus setting this parameter to True avoids using 'all-points-core-distances' at all.
-        This is advantageous specifically in the case of elongated clusters that lie in close proximity to each other <citation needed>.
-
-    **kwd_args :
-        Extra arguments to pass to the distance computation for other
-        metrics, such as minkowski, Mahanalobis etc.
-
-    Returns
-    -------
-    validity_index : float
-        The density based cluster validity index for the clustering. This
-        is a numeric value between -1 and 1, with higher values indicating
-        a 'better' clustering.
-
-    per_cluster_validity_index : array (n_clusters,)
-        The cluster validity index of each individual cluster as an array.
-        The overall validity index is the weighted average of these values.
-        Only returned if per_cluster_scores is set to True.
-
-    References
-    ----------
-    Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J.,
-    2014. Density-Based Clustering Validation. In SDM (pp. 839-847).
-    """
-    core_distances = {}
-    density_sparseness = {}
-    mst_nodes = {}
-    mst_edges = {}
-
-    max_cluster_id = labels.max() + 1
-    density_sep = np.inf * np.ones((max_cluster_id, max_cluster_id), dtype=np.float64)
-    cluster_validity_indices = np.empty(max_cluster_id, dtype=np.float64)
-
-    for cluster_id in range(max_cluster_id):
-        if np.sum(labels == cluster_id) == 0:
-            continue
-
-        distances_for_mst, core_distances[cluster_id] = distances_between_points(
-            X,
-            labels,
-            cluster_id,
-            metric,
-            d,
-            no_coredist=mst_raw_dist,
-            print_max_raw_to_coredist_ratio=verbose,
-            **kwd_args
-        )
-
-        mst_nodes[cluster_id], mst_edges[cluster_id] = internal_minimum_spanning_tree(
-            distances_for_mst
-        )
-        density_sparseness[cluster_id] = mst_edges[cluster_id].T[2].max()
-
-    for i in range(max_cluster_id):
-        if np.sum(labels == i) == 0:
-            continue
-
-        internal_nodes_i = mst_nodes[i]
-        for j in range(i + 1, max_cluster_id):
-            if np.sum(labels == j) == 0:
-                continue
-
-            internal_nodes_j = mst_nodes[j]
-            density_sep[i, j] = density_separation(
-                X,
-                labels,
-                i,
-                j,
-                internal_nodes_i,
-                internal_nodes_j,
-                core_distances[i],
-                core_distances[j],
-                metric=metric,
-                no_coredist=mst_raw_dist,
-                **kwd_args
-            )
-            density_sep[j, i] = density_sep[i, j]
-
-    n_samples = float(X.shape[0])
-    result = 0
-
-    for i in range(max_cluster_id):
-        if np.sum(labels == i) == 0:
-            continue
-
-        min_density_sep = density_sep[i].min()
-        cluster_validity_indices[i] = (min_density_sep - density_sparseness[i]) / max(
-            min_density_sep, density_sparseness[i]
-        )
-
-        if verbose:
-            print("Minimum density separation: " + str(min_density_sep))
-            print("Density sparseness: " + str(density_sparseness[i]))
-
-        cluster_size = np.sum(labels == i)
-        result += (cluster_size / n_samples) * cluster_validity_indices[i]
-
-    if per_cluster_scores:
-        return result, cluster_validity_indices
-    else:
-        return result
-
-
-def relative_validity_index_from_SLT(
-    labels,
-    single_linkage_tree,
-    per_cluster_scores=False,
-):
-    """
-    Compute the density based cluster validity index for the
-    clustering specified by `labels` and for each cluster in `labels`.
-
-    Parameters
-    ----------
-    labels : array (n_samples)
-        The label array output by the clustering, providing an integral
-        cluster label to each data point, with -1 for noise points.
-
-    single_linkage_tree : array (n_samples - 1, 4)
-        The single linkage tree output by the clustering, providing the
-        hierarchical clustering of the data.
-
-    per_cluster_scores : optional, boolean (default False)
-        Whether to return the validity index for individual clusters.
-        Defaults to False with the function returning a single float
-        value for the whole clustering.
-
-    **kwd_args :
-        Extra arguments to pass to the distance computation for other
-        metrics, such as minkowski, Mahanalobis etc.
-
-    Returns
-    -------
-    validity_index : float
-        The density based cluster validity index for the clustering. This
-        is a numeric value between -1 and 1, with higher values indicating
-        a 'better' clustering.
-
-    per_cluster_validity_index : array (n_clusters,)
-        The cluster validity index of each individual cluster as an array.
-        The overall validity index is the weighted average of these values.
-        Only returned if per_cluster_scores is set to True.
-
-    References
-    ----------
-    Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J.,
-    2014. Density-Based Clustering Validation. In SDM (pp. 839-847).
-    """
-    sizes = np.bincount(labels + 1)
-    noise_size = sizes[0]
-    cluster_size = sizes[1:]
-    total = noise_size + np.sum(cluster_size)
-    num_clusters = len(cluster_size)
-    DSC = np.zeros(num_clusters)
-    min_outlier_sep = np.inf  # only required if num_clusters = 1
-    correction_const = 2  # only required if num_clusters = 1
-
-    # Unltimately, for each Ci, we only require the
-    # minimum of DSPC(Ci, Cj) over all Cj != Ci.
-    # So let's call this value DSPC_wrt(Ci), i.e.
-    # density separation 'with respect to' Ci.
-    DSPC_wrt = np.ones(num_clusters) * np.inf
-    max_distance = 0
-
-#this didnt help
-    mst = _condense_tree(single_linkage_tree.copy(), 5)
-    print(mst[:5])
-  #  for p1, p2, length, p3 in single_linkage_tree:
-  #      print(p1,p2,length,p3)
-        #if np.any(np.asarray([p1,p2,p3])>len(labels)):
-        #    print(p1,p2,p3)
-    
-    #THE ISSUE HERE IS THAT HDBSCANS SLT from scikit learn is not the
-    #same as the one from hdbscan contrib. The hdbscan contrib has node
-    #in node out, while the scikit learn one has some other node stuff
-    #relate dto condensed tree and contains praent/child pairs and
-    #cluster size
-
-    for p1, p2, length, _ in mst:
-        max_distance = max(max_distance, length)
-        label1 = labels[p1]
-        label2 = labels[p2]
-
-        if label1 == -1 and label2 == -1:
-            continue
-        elif label1 == -1 or label2 == -1:
-            # If exactly one of the points is noise
-            min_outlier_sep = min(min_outlier_sep, length)
-            continue
-
-        if label1 == label2:
-            # Set the density sparseness of the cluster
-            # to the sparsest value seen so far.
-            DSC[label1] = max(length, DSC[label1])
-        else:
-            # Check whether density separations with
-            # respect to each of these clusters can
-            # be reduced.
-            DSPC_wrt[label1] = min(length, DSPC_wrt[label1])
-            DSPC_wrt[label2] = min(length, DSPC_wrt[label2])
-
-    # In case min_outlier_sep is still np.inf, we assign a new value to it.
-    # This only makes sense if num_clusters = 1 since it has turned out
-    # that the MR-MST has no edges between a noise point and a core point.
-    min_outlier_sep = max_distance if min_outlier_sep == np.inf else min_outlier_sep
-
-    # DSPC_wrt[Ci] might be infinite if the connected component for Ci is
-    # an "island" in the MR-MST. Whereas for other clusters Cj and Ck, the
-    # MR-MST might contain an edge with one point in Cj and ther other one
-    # in Ck. Here, we replace the infinite density separation of Ci by
-    # another large enough value.
-    #
-    # TODO: Think of a better yet efficient way to handle this.
-    correction = correction_const * (
-        max_distance if num_clusters > 1 else min_outlier_sep
-    )
-    DSPC_wrt[np.where(DSPC_wrt == np.inf)] = correction
-
-    V_index = [
-        (DSPC_wrt[i] - DSC[i]) / max(DSPC_wrt[i], DSC[i])
-        for i in range(num_clusters)
-    ]
-    cluster_scores = np.array(
-        [cluster_size[i] * V_index[i] for i in range(num_clusters)]
-    )
-    score = np.sum(
-        cluster_scores / total
-    )
-    if per_cluster_scores:
-        return score, cluster_scores
-    else:
-        return score
-        
-#from sklearn.datasets import make_blobs, make_moons
-#from sklearn.cluster import HDBSCAN
-#X_m, _ = make_moons(n_samples=300, noise=0.05, random_state=42)
-#clustering_m_1 = HDBSCAN().fit(X_m)
-#print(clustering_m_1._single_linkage_tree_[:5])
-#print("Validity index moons HDBSCAN: " + str(relative_validity_index_from_SLT(clustering_m_1.labels_, clustering_m_1._single_linkage_tree_)))
-
-#%%
-def test_normal(plot=False):
-    from sklearn.datasets import make_blobs, make_moons
-    from sklearn.cluster import DBSCAN, HDBSCAN, OPTICS
-
-    # Create a dataset with 3 well-separated clusters
-    X_b, _ = make_blobs(n_samples=300, centers=3, random_state=42)
-    X_m, _ = make_moons(n_samples=300, noise=0.05, random_state=42)
-
-    # Apply DBSCAN clustering
-    clustering_b_1 = HDBSCAN().fit(X_b)
-    clustering_b_2 = DBSCAN(eps=0.75).fit(X_b)
-    clustering_b_3 = OPTICS().fit(X_b)
-    # Compute the validity index
-    validity_index_result_b_1 = validity_index(X_b, clustering_b_1.labels_, mst_raw_dist=True)
-    validity_index_result_b_2 = validity_index(X_b, clustering_b_2.labels_, mst_raw_dist=True)
-    validity_index_result_b_3 = validity_index(X_b, clustering_b_3.labels_, mst_raw_dist=True)
-    print("Validity index blobs HDBSCAN: " + str(validity_index_result_b_1))
-    print("Validity index blobs DBSCAN eps=0.75: " + str(validity_index_result_b_2))
-    print("Validity index blobs OPTICS: " + str(validity_index_result_b_3))
-    DBCV_b_1 = DBCV(X_b, clustering_b_1.labels_)
-    DBCV_b_2 = DBCV(X_b, clustering_b_2.labels_)
-    DBCV_b_3 = DBCV(X_b, clustering_b_3.labels_)
-    print(" DBCV blobs HDBSCAN: " + str(DBCV_b_1))
-    print(" DBCV blobs DBSCAN eps=0.75: " + str(DBCV_b_2))
-    print(" DBCV blobs OPTICS: " + str(DBCV_b_3))
-    clustering_m_1 = HDBSCAN().fit(X_m)
-    clustering_m_2 = DBSCAN(eps=0.75).fit(X_m)
-    clustering_m_3 = OPTICS().fit(X_m)
-    # Compute the validity index
-    validity_index_result_m_1 = validity_index(X_m, clustering_m_1.labels_, mst_raw_dist=True)
-    validity_index_result_m_2 = validity_index(X_m, clustering_m_2.labels_, mst_raw_dist=True)
-    validity_index_result_m_3 = validity_index(X_m, clustering_m_3.labels_, mst_raw_dist=True)
-    print("Validity index moons HDBSCAN: " + str(validity_index_result_m_1))
-    print("Validity index moons DBSCAN eps=0.75: " + str(validity_index_result_m_2))
-    print("Validity index moons OPTICS: " + str(validity_index_result_m_3))
-    DBCV_m_1 = DBCV(X_m, clustering_m_1.labels_)
-    DBCV_m_2 = DBCV(X_m, clustering_m_2.labels_)
-    DBCV_m_3 = DBCV(X_m, clustering_m_3.labels_)
-    print(" DBCV moons HDBSCAN: " + str(DBCV_m_1))
-    print(" DBCV moons DBSCAN eps=0.75: " + str(DBCV_m_2))
-    print(" DBCV moons OPTICS: " + str(DBCV_m_3))
-    if plot:
-        import matplotlib.pyplot as plt
-
-        plt.figure(figsize=(10, 5))
-
-        plt.subplot(2, 3, 1)
-        plt.scatter(X_b[:, 0], X_b[:, 1], c=clustering_b_1.labels_, cmap="viridis", s=50)
-        plt.title(
-            "HDBSCAN; \nvalidity index: "
-            + str(validity_index_result_b_1)
-            + " \nDBCV: "
-            + str(DBCV_b_1)
-        )
-
-        plt.subplot(2, 3, 2)
-        plt.scatter(X_b[:, 0], X_b[:, 1], c=clustering_b_2.labels_, cmap="viridis", s=50)
-        plt.title(
-            "DBSCAN eps=0.75;\nvalidity index: "
-            + str(validity_index_result_b_2)
-            + " \nDBCV: "
-            + str(DBCV_b_2)
-        )
-
-        plt.subplot(2, 3, 3)
-        plt.scatter(X_b[:, 0], X_b[:, 1], c=clustering_b_3.labels_, cmap="viridis", s=50)
-        plt.title(
-            "OPTICS;\nvalidity index: "
-            + str(validity_index_result_b_3)
-            + " \nDBCV: "
-            + str(DBCV_b_3)
-        )
-
-        plt.subplot(2, 3, 4)
-        plt.scatter(X_m[:, 0], X_m[:, 1], c=clustering_m_1.labels_, cmap="viridis", s=50)
-        plt.title(
-            "HDBSCAN; \nvalidity index: "
-            + str(validity_index_result_m_1)
-            + " \nDBCV: "
-            + str(DBCV_m_1)
-        )
-
-        plt.subplot(2, 3, 5)
-        plt.scatter(X_m[:, 0], X_m[:, 1], c=clustering_m_2.labels_, cmap="viridis", s=50)
-        plt.title(
-            "DBSCAN eps=0.75;\nvalidity index: "
-            + str(validity_index_result_m_2)
-            + " \nDBCV: "
-            + str(DBCV_m_2)
-        )
-
-        plt.subplot(2, 3, 6)
-        plt.scatter(X_m[:, 0], X_m[:, 1], c=clustering_m_3.labels_, cmap="viridis", s=50)
-        plt.title(
-            "OPTICS;\nvalidity index: "
-            + str(validity_index_result_m_3)
-            + " \nDBCV: "
-            + str(DBCV_m_3)
-        )
-        # y space between plots should be twice as much
-        plt.subplots_adjust(hspace=0.6)        
-        plt.show()
-
-
-# if main
-if __name__ == "__main__":
-    test_normal(True)
-
-#%%
-from sklearn.datasets import make_blobs
-from sklearn.cluster import DBSCAN
-
-# Create a dataset with 3 well-separated clusters
-X, _ = make_blobs(n_samples=300, centers=3, random_state=42)
-
-# Apply DBSCAN clustering
-clustering = DBSCAN(eps=0.5).fit(X)
-labels = clustering.labels_
-%timeit validity_index(X, labels)
-%timeit DBCV(X, labels)