From 8f101a5acc593b1a655b0c4d5edbb110cd43fce0 Mon Sep 17 00:00:00 2001 From: Tim Millar Date: Wed, 21 Jun 2017 15:13:44 +1200 Subject: [PATCH 1/9] fix for noise points causing -ve child area --- tefingerprint/cluster.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tefingerprint/cluster.py b/tefingerprint/cluster.py index c3d44c3..e56236a 100644 --- a/tefingerprint/cluster.py +++ b/tefingerprint/cluster.py @@ -348,7 +348,9 @@ def _traverse_hudc_tree(points, base_eps, n): # compare areas total_area = np.sum(base_eps - points['eps']) - child_area = np.sum(threshold_eps - points['eps']) + child_area = threshold_eps - points['eps'] + child_area[child_area < 0] = 0 # remove negative values caused by noise points + child_area = np.sum(child_area) parent_area = total_area - child_area if parent_area > child_area: From f9134d4f8092c2e3f433f1293f1fa40ac1fdf82a Mon Sep 17 00:00:00 2001 From: Tim Millar Date: Mon, 26 Jun 2017 17:13:49 +1200 Subject: [PATCH 2/9] correction for calculation of minimum epsilon --- tefingerprint/cluster.py | 114 +++++++++++++++++++++------------------ tests/test_cluster.py | 8 +-- 2 files changed, 67 insertions(+), 55 deletions(-) diff --git a/tefingerprint/cluster.py b/tefingerprint/cluster.py index e56236a..5764c08 100644 --- a/tefingerprint/cluster.py +++ b/tefingerprint/cluster.py @@ -228,34 +228,35 @@ def __init__(self, n, max_eps=None, min_eps=None): self.input_array = np.array([], dtype=int) @staticmethod - def _point_eps(array, n): + def _core_distances(array, min_points): """ - Identify the minimum value of eps for every point in an array of integers. + Identify the core distance (minimum value of eps) for every point in an array of integers. For each point, the minimum eps is calculated among all subclusters containing that point. :param array: An array of integers sorted in ascending order :type array: :class:`numpy.ndarray`[int] - :param n: number of points used to form a subcluster + :param min_points: number of points used to form a subcluster :type n: int :return: An array of minimum eps values :rtype: :class:`numpy.ndarray`[int] """ - assert n > 1 # groups must contain at least two points - offset = n - 1 # offset for indexing + assert min_points > 1 # groups must contain at least two points + offset = min_points - 1 # offset for indexing because the minimum points includes itself length = len(array) lower = array[0:length - offset] upper = array[offset:length] eps_values = upper - lower - eps_2d = np.full((n, length), np.max(eps_values), dtype=int) - for i in range(n): + eps_2d = np.full((min_points, length), np.max(eps_values), dtype=int) + for i in range(min_points): eps_2d[i, i:length - (offset - i)] = eps_values return np.min(eps_2d, axis=0) @staticmethod - def _eps_splits(array, n): + def _cluster_forks(array, min_points): """ - Identify eps 'splits' in an array by calculating eps of the gaps between values in the array. + Identify eps 'splits' in an array by calculating epsilon of the gaps between values in the array. + This is Eps between values must be calculated as eps of the values either side of a gap may be significantly lower. For example when clustering the array of vales: @@ -273,17 +274,17 @@ def _eps_splits(array, n): :param array: An array of integers sorted in ascending order :type array: :class:`numpy.ndarray`[int] - :param n: number of points used to form a subcluster + :param min_points: number of points used to form a subcluster :type n: int - :return: An array of eps split values - :rtype: :class:`numpy.ndarray`[int] + :return: An epsilon value or None + :rtype: int """ - if len(array) <= n: - # no peaks possible because all points must have the same eps - return np.array([], dtype=np.int) + if len(array) <= min_points: + # no forks possible because all points must have the same eps + return None - offset = n - 1 + offset = min_points - 1 # calculate split eps using the 2d method eps_values = array[offset:] - array[:-offset] @@ -291,7 +292,6 @@ def _eps_splits(array, n): for i in range(offset): eps_2d[i, i:len(eps_values) + i] = eps_values splits = np.min(eps_2d, axis=0) - splits -= 1 # Convert values to thresholds # Remove plateaus gradients = splits[1:] - splits[:-1] @@ -300,7 +300,15 @@ def _eps_splits(array, n): # Remove non-peaks is_peak = np.logical_and(np.append(np.array([False]), splits[1:] > splits[:-1]), np.append(splits[:-1] > splits[1:], np.array([False]))) - return splits[is_peak] + splits = splits[is_peak] + + # If this method calculates epsilon of 5 it means the child cluster starts at epsilon 4.999... + if len(splits) == 0: + # The cluster does not fork into child clusters at all + return None + else: + # We only require the largest fork value + return np.max(splits) @staticmethod def _flatten_list(item): @@ -322,48 +330,51 @@ def _flatten_list(item): yield item @staticmethod - def _traverse_hudc_tree(points, base_eps, n): + def _traverse_hudc_tree(points, epsilon_maximum, min_points): """ Traverse a tree of nested density clusters and recursively identify clusters based on their area. This method is intimately tied to method 'hudc'. :param points: An array of points with the slots 'value', 'index' and 'eps :type points: :class:`numpy.ndarray`[(int, int, int)] - :param base_eps: The maximum distance allowed in among each set of n points - :type base_eps: int - :param n: The minimum number of points allowed in each (sub)cluster - :type n: int + :param epsilon_maximum: Maximum distance allowed in among each set of n points + :type epsilon_maximum: int + :param min_points: The minimum number of points allowed in each (sub)cluster + :type min_points: int :return: A nested list of upper and (half-open) indices of selected clusters :rtype: list[list[]] """ - splits = HUDC._eps_splits(points['value'], n) + # Values of epsilon bellow which the cluster forks + fork_epsilon = HUDC._cluster_forks(points['value'], min_points) - if len(splits) == 0: - # there are no children so return slice indices + if fork_epsilon is None: + # The cluster doesn't fork so it has no children + # Epsilon_minimum would equal the minimum of core distances but it's not neededE return points['index'][0], points['index'][-1] + 1 - # compare based on largest peak (least dense place) - threshold_eps = np.max(splits) + # If a cluster forks into children then it's minimum epsilon is the value at which forks + epsilon_minimum = fork_epsilon - # compare areas - total_area = np.sum(base_eps - points['eps']) - child_area = threshold_eps - points['eps'] - child_area[child_area < 0] = 0 # remove negative values caused by noise points - child_area = np.sum(child_area) - parent_area = total_area - child_area + # Compare support for cluster and its children + support = np.sum(epsilon_maximum - np.maximum(epsilon_minimum, points['core_dist'])) + support_children = np.sum(np.maximum(0, epsilon_minimum - points['core_dist'])) - if parent_area > child_area: - # parent is lager so return slice indices + if support > support_children: + # Parent is supported so return slice indices return points['index'][0], points['index'][-1] + 1 else: - # combined area of children is larger so divide and repeat - child_points = (points[left:right] for left, right in HUDC._cluster(points['value'], threshold_eps, n)) - return [HUDC._traverse_hudc_tree(points, threshold_eps, n) for points in child_points] + # Combined support of children is larger so divide and repeat recursively: + # A minimum epsilon of 5 means the child clusters technically starts at epsilon 4.999... + # we calculate the child clusters using epsilon 4 which will produce the same clusters as 4.999... + child_cluster_bounds = HUDC._cluster(points['value'], epsilon_minimum - 1, min_points) + child_points = (points[left:right] for left, right in child_cluster_bounds) + # but then use epsilon 5 as the new maximum epsilon so that support is calculated from epsilon 4.999... + return [HUDC._traverse_hudc_tree(points, epsilon_minimum, min_points) for points in child_points] @staticmethod - def hudc(array, n, max_eps=None, min_eps=None): + def hudc(array, min_points, max_eps=None, min_eps=None): """ Calculate Hierarchical Density Clusters for a Univariate Array. @@ -390,8 +401,8 @@ def hudc(array, n, max_eps=None, min_eps=None): :param array: An array of ints sorted in ascending order :type array: :class:`numpy.ndarray`[int] - :param n: The minimum number of points allowed in each (sub)cluster - :type n: int + :param min_points: The minimum number of points allowed in each (sub)cluster + :type min_points: int :param max_eps: An optional value for maximum distance allowed among each set of n points :type max_eps: int :param min_eps: An optional value for the minimum value of eps to be used when calculating cluster depth @@ -402,29 +413,30 @@ def hudc(array, n, max_eps=None, min_eps=None): """ assert HUDC._sorted_ascending(array) - if len(array) < n: + if len(array) < min_points: # not enough points to form a cluster return np.array([], dtype=UDC._DTYPE_SLICE) else: + # create a structured array for tracking value, index and core distance of each individual points = np.empty(len(array), dtype=np.dtype([('value', np.int64), ('index', np.int64), - ('eps', np.int64)])) + ('core_dist', np.int64)])) points['value'] = array points['index'] = np.arange(len(array), dtype=int) - points['eps'] = HUDC._point_eps(array, n) + points['core_dist'] = HUDC._core_distances(array, min_points) if not max_eps: # start at highest observed eps - max_eps = np.max(points['eps']) + max_eps = np.max(points['core_dist']) if min_eps: # overwrite lower eps - points['eps'][points['eps'] < min_eps] = min_eps + points['core_dist'][points['core_dist'] < min_eps] = min_eps - # initial splits - child_points = (points[left:right] for left, right in HUDC._cluster(points['value'], max_eps, n)) + # initial splits based on the specified maximum epsilon + child_points = (points[left:right] for left, right in HUDC._cluster(points['value'], max_eps, min_points)) - # run - clusters = [HUDC._traverse_hudc_tree(points, max_eps, n) for points in child_points] + # recursively run on all clusters + clusters = [HUDC._traverse_hudc_tree(points, max_eps, min_points) for points in child_points] return np.fromiter(HUDC._flatten_list(clusters), dtype=UDC._DTYPE_SLICE) def fit(self, array): diff --git a/tests/test_cluster.py b/tests/test_cluster.py index 6a03f12..fa6f683 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -172,7 +172,7 @@ def test_point_eps(self): 54, 54, 54, 54, 54, 54, 54, 54, 106, 106, 106, 106, 123, 124, 124, 257, 228, 228, 186, 165, 144, 97, 89, 89, 89, 89, 89, 89, 89, 89, 89, 89], dtype=int) - npt.assert_array_equal(HUDC._point_eps(input_array, 10), point_eps) + npt.assert_array_equal(HUDC._core_distances(input_array, 10), point_eps) def test_point_eps_hidden_peak(self): """ @@ -182,7 +182,7 @@ def test_point_eps_hidden_peak(self): """ input_array = np.array([0, 0, 3, 4, 4, 6, 26, 28, 28, 29, 32, 32], dtype=int) point_eps = np.array([3, 3, 1, 1, 1, 2, 2, 1, 1, 1, 3, 3], dtype=int) - npt.assert_array_equal(HUDC._point_eps(input_array, 3), point_eps) + npt.assert_array_equal(HUDC._core_distances(input_array, 3), point_eps) def test_eps_splits(self): """ @@ -199,7 +199,7 @@ def test_eps_splits(self): 1217, 1234, 1234, 1591, 1620, 1620, 1662, 1686, 1707, 1755, 1828, 1828, 1848, 1848, 1848, 1848, 1851, 1851, 1852, 1917], dtype=int) split_eps = np.array([122, 42, 453, 436], dtype=int) - npt.assert_array_equal(HUDC._eps_splits(input_array, 10), split_eps) + npt.assert_array_equal(HUDC._cluster_forks(input_array, 10), split_eps) def test_eps_splits_hidden_peak(self): """ @@ -209,7 +209,7 @@ def test_eps_splits_hidden_peak(self): """ input_array = np.array([0, 0, 3, 4, 4, 6, 26, 28, 28, 29, 32, 32], dtype=int) split_eps = np.array([21], dtype=int) - npt.assert_array_equal(HUDC._eps_splits(input_array, 3), split_eps) + npt.assert_array_equal(HUDC._cluster_forks(input_array, 3), split_eps) @pytest.mark.parametrize("nested,flat", [([(1, 3), [(6, 9), (11, 11)]], From 4de59e94c90221ec8ef5cd4e9c8eafbe6bf5b2cd Mon Sep 17 00:00:00 2001 From: Tim Millar Date: Tue, 27 Jun 2017 11:01:19 +1200 Subject: [PATCH 3/9] update clustring docstrings/variables to reflect terminology of Campello 2015 --- tefingerprint/cluster.py | 211 +++++++++++++++++++-------------------- tefingerprint/loci.py | 4 +- tests/test_cluster.py | 78 +++++++-------- 3 files changed, 143 insertions(+), 150 deletions(-) diff --git a/tefingerprint/cluster.py b/tefingerprint/cluster.py index 5764c08..7bfa906 100644 --- a/tefingerprint/cluster.py +++ b/tefingerprint/cluster.py @@ -3,16 +3,20 @@ import numpy as np -class UDC(object): +class UDBSCANx(object): """ - Univariate Density Cluster analysis: Create a model to calculate density clusters for a univariate array. + Univariate implimentation of DBSCAN*. + + This is an implementation of the DBSCAN* (Campello et al 2015) clustering method for use on + a sorted (in ascending order) univariate array of integers. Clusters are identified as suitably dense, continuous regions in the data where the threshold density - is specified by parameters n (minimum points) and eps (epsilon). - The range in values among every set of n points in the array is calculated and compared to epsilon. + is specified by parameters minimum points and epsilon. Points in the array that do not fall + inside a cluster are classified as noise points. + + The range in values among every set of min-points in the array is calculated and compared to epsilon. Sets of n points with a range equal to, or less than epsilon are classified as subclusters. Overlapping subclusters are then merged together to form clusters. - Points in the array that do not fall inside a cluster are regarded as noise. :param n: The minimum number of points allowed in each (sub)cluster :type n: int @@ -22,10 +26,10 @@ class UDC(object): _DTYPE_SLICE = np.dtype([('lower', np.int64), ('upper', np.int64)]) - def __init__(self, n, eps): - self.n = n - self.eps = eps - self.slices = np.array([], dtype=UDC._DTYPE_SLICE) + def __init__(self, min_points, epsilon): + self.min_points = min_points + self.epsilon = epsilon + self.slices = np.array([], dtype=UDBSCANx._DTYPE_SLICE) self.input_array = np.array([], dtype=int) @staticmethod @@ -72,79 +76,68 @@ def _melter(lowers, uppers, splits): yield l, u # return slices formatted as a numpy array - return np.fromiter(_melter(lowers, uppers, splits), dtype=UDC._DTYPE_SLICE) - + return np.fromiter(_melter(lowers, uppers, splits), dtype=UDBSCANx._DTYPE_SLICE) @staticmethod - def _subcluster(array, eps, n): + def _subcluster(array, epsilon, min_points): """ Calculate subclusters of an array and return their slice indices. The input array must be sorted in ascending order. If n is greater than the length of the array, an empty array is returned. :param array: An array of ints sorted in ascending order - :param eps: The number of points in each subcluster - :param n: The maximum distance allowed in among points of each subcluster + :param epsilon: The number of points in each subcluster + :param min_points: The maximum distance allowed in among points of each subcluster :return: An array of paired lower and upper indices for each subcluster found in the array :rtype: :class:`numpy.ndarray`[(int, int)] """ - assert UDC._sorted_ascending(array) - offset = n - 1 + assert UDBSCANx._sorted_ascending(array) + offset = min_points - 1 upper = array[offset:] lower = array[:-offset] - selected = upper - lower <= eps + selected = upper - lower <= epsilon lower_index = np.arange(0, len(lower))[selected] upper_index = np.arange(offset, len(array))[selected] + 1 - return np.fromiter(zip(lower_index, upper_index), dtype=UDC._DTYPE_SLICE) + return np.fromiter(zip(lower_index, upper_index), dtype=UDBSCANx._DTYPE_SLICE) @staticmethod - def _cluster(array, eps, n): + def _cluster(array, epsilon, min_points): """ Calculate clusters of an array and return their slice indices. The input array must be sorted in ascending order. If n is greater than the length of the array, an empty array is returned. :param array: An array of ints sorted in ascending order - :param eps: The minimum number of points in each (sub)cluster - :param n: The maximum distance allowed in among each set of n points + :param epsilon: The minimum number of points in each (sub)cluster + :param min_points: The maximum distance allowed in among each set of n points :return: An array of paired lower and upper indices for each cluster found in the array :rtype: :class:`numpy.ndarray`[(int, int)] """ # sorted-ascending checked in method _subcluster - slices = UDC._subcluster(array, eps, n) + slices = UDBSCANx._subcluster(array, epsilon, min_points) if len(slices) > 1: - slices = UDC._melt_slices(slices) + slices = UDBSCANx._melt_slices(slices) return slices @staticmethod - def udc(array, n, eps): + def udbscanx(array, min_points, epsilon): """ - Calculate Density Clusters for a Univariate Array. - - Clusters are identified as suitably dense, continuous regions in the data where the threshold density - is specified by parameters n (minimum points) and eps (epsilon). - The range in values among every set of n points in the array is calculated and compared to epsilon. - Sets of n points with a range equal to, or less than epsilon are classified as subclusters. - Overlapping subclusters are then merged together to form clusters. - Points in the array that do not fall inside a cluster are regarded as noise. - - The input array must be sorted in ascending order. - This method returns pairs of indices representing the (half open) upper and lower bounds of each cluster. + See documentation for :class: `UDBSCANx`. :param array: An array of ints sorted in ascending order :type array: :class:`numpy.ndarray`[int] - :param n: The minimum number of points in each (sub)cluster + :param min_points: The minimum number of points in each (sub)cluster :type n: int - :param eps: The maximum distance allowed in among each set of n points - :type eps: int + :param epsilon: The maximum distance allowed in among each set of n points + :type epsilon: int :return: An array of paired lower and upper indices for each cluster found in the array :rtype: :class:`numpy.ndarray`[int, int] """ - assert UDC._sorted_ascending(array) - slices = UDC._cluster(array, eps, n) + assert UDBSCANx._sorted_ascending(array) + slices = UDBSCANx._cluster(array, epsilon, min_points) return slices def fit(self, array): @@ -156,7 +149,7 @@ def fit(self, array): :type array: :class:`numpy.ndarray`[int] """ self.input_array = np.array(array, copy=True) - self.slices = UDC.udc(self.input_array, self.n, self.eps) + self.slices = UDBSCANx.udbscanx(self.input_array, self.min_points, self.epsilon) def clusters(self): """ @@ -192,46 +185,66 @@ def labels(self): return labels -class HUDC(UDC): +class UDBSCANxH(UDBSCANx): """ - Hierarchical Univariate Density Cluster analysis: A model to calculate density clusters for a univariate array. + Univariate DBSCAN* with Hierarchical component. - Clusters are identified as suitably dense, continuous regions in the data where the threshold density - is specified by parameters n (minimum points) and eps (epsilon). - The range in values among every set of n points in the array is calculated and compared to epsilon. - Sets of n points with a range equal to, or less than epsilon are classified as subclusters. - Overlapping subclusters are then merged together to form clusters. - Points in the array that do not fall inside a cluster are regarded as noise. + This is a variation of the DBSCAN*/HDBSCAN* (Campello et al 2015) clustering methods which allows some flexibility + when calculating density based clusters. While it has a hierarchical component it differs significantly from + HDBSCAN* in how cluster support is calculated and clusters are selected. - The HUDC algorithm identifies values of epsilon lower than the initial value, at which previously identified - clusters are separated into smaller 'child' clusters. At each of these splits the 'area' of each parent cluster - is compared to the combined 'area' of it's descendant clusters where a clusters 'area' is the sum total of data - points found within that cluster at each value of epsilon for which that cluster persists (i.e. is not split). - If the area of the parent is larger than that of its combined descendants, then it is selected. If the area of - combined descendants is larger than the parents area, the algorithm is re-run for each of the immediate - child clusters. This process continues until a parent cluster is selected or a terminal cluster (a cluster with - no children) is reached and automatically selected. + This implementation is for use on a sorted (in ascending order) univariate array of integers. - :param n: The minimum number of points allowed in each (sub)cluster - :type n: int - :param max_eps: An optional value for maximum distance allowed among each set of n points + This method is useful when there is some existing knowledge about the expected density of informative clusters + that are present in the data. It preferred over DBSCAN* because it allows *some* flexibility in the density of + clusters identified. However its flexibility is much more limited than HDBSCAN* to ensure that clusters are of + similar density to one another. + + Using the definitions of Campello et al 2015, this method differs from HDBSCAN* when calculating support for + clusters in that support of a cluster is calculated as: + + .. math:: + (\textbf{C}_i) = \sum_{\textbf{x}_j \in \textbf{C}_i} \varepsilon_{\text{max}}(\textbf{C}_i) - \varepsilon_{\text{min}}(\textbf{x}_j, \textbf{C}_i) + + where: + + .. math:: + \varepsilon_{\text{min}}(\textbf{x}_j, \textbf{C}_i) + + is calculated as: + + .. math:: + \text{max}\{d_\text{core}(\textbf{x}_j) , \varepsilon_{\text{min}}(\textbf{C}_i) \} + + + Clusters selection is performed in a top down manner from the root of the tree. A given cluster is selected if + its support is greater than the sum of support for all of its decedent clusters. A cluster cannot be selected + if one of its ancestor clusters has been selected. This process results in a set of flat (non-overlapping) clusters. + + The search space may be constrained by setting global maximum and minimum allowable values of epsilon. + If a maximum value for epsilon is selected, the search algorithm will start at the specified value. If a + minimum value for epsilon is selected, core distances bellow the specified level are increased to that level. + + :param min_points: The minimum number of points allowed in each cluster + :type min_points: int + :param max_eps: An optional value for maximum distance allowed among each set of min-points :type max_eps: int :param min_eps: An optional value for the minimum value of eps to be used when calculating cluster depth :type min_eps: int """ - def __init__(self, n, max_eps=None, min_eps=None): - self.min_pts = n + def __init__(self, min_points, max_eps=None, min_eps=None): + self.min_pts = min_points self.max_eps = max_eps self.min_eps = min_eps - self.slices = np.array([], dtype=UDC._DTYPE_SLICE) + self.slices = np.array([], dtype=UDBSCANx._DTYPE_SLICE) self.input_array = np.array([], dtype=int) @staticmethod def _core_distances(array, min_points): """ - Identify the core distance (minimum value of eps) for every point in an array of integers. - For each point, the minimum eps is calculated among all subclusters containing that point. + Identify the core distance (minimum value of epsilon) for every point in an array of integers. + For each point, the minimum epsilon is calculated among all subclusters containing that point. :param array: An array of integers sorted in ascending order :type array: :class:`numpy.ndarray`[int] @@ -253,20 +266,21 @@ def _core_distances(array, min_points): return np.min(eps_2d, axis=0) @staticmethod - def _cluster_forks(array, min_points): + def _fork_epsilon(array, min_points): """ Identify eps 'splits' in an array by calculating epsilon of the gaps between values in the array. - This is - Eps between values must be calculated as eps of the values either side of a gap may be significantly lower. - For example when clustering the array of vales: + Identifies the minimum epsilon of a cluster prior to it forking into child clusters. + If the cluster does not fork into children then None is returned. Note that in this cas the minimum epsilon + of the cluster is then equal to the minimum core distance of points in that cluster. + + Example when clustering the array of vales: [0, 0, 3, 4, 4, 6, 26, 28, 28, 29, 32, 32] - with n = 3 (subclusters of 3 points). The eps calculated for values 6 and 26 is 2 and 2 respectively. - However the minimum eps value calculated for subclusters that include both values is 21. Therefore, the eps - split is 'hidden' if eps is only calculated on a per point basis. + with min-points = 3 The core distance calculated for values 6 and 26 is 2 and 2 respectively. + However the minimum eps value calculated for the cluster that include both values is 22 (26 - 4 or 28 - 6). Once eps is calculated for all gaps between values in an array, peaks are identified and classified as 'splits'. - Splits are points at which a parent clusters are split into child clusters. + Splits are points at which a parent cluster are split into child clusters. Eps of splits is reduced by a constant of 1 in order to be used as a threshold value. This is because an eps value of 21 implies that at eps = 21 a valid subcluster is formed. Therefore eps = 20 is the value at which @@ -324,7 +338,7 @@ def _flatten_list(item): """ if isinstance(item, list): for element in item: - for item in HUDC._flatten_list(element): + for item in UDBSCANxH._flatten_list(element): yield item else: yield item @@ -346,7 +360,7 @@ def _traverse_hudc_tree(points, epsilon_maximum, min_points): :rtype: list[list[]] """ # Values of epsilon bellow which the cluster forks - fork_epsilon = HUDC._cluster_forks(points['value'], min_points) + fork_epsilon = UDBSCANxH._fork_epsilon(points['value'], min_points) if fork_epsilon is None: # The cluster doesn't fork so it has no children @@ -368,38 +382,17 @@ def _traverse_hudc_tree(points, epsilon_maximum, min_points): # Combined support of children is larger so divide and repeat recursively: # A minimum epsilon of 5 means the child clusters technically starts at epsilon 4.999... # we calculate the child clusters using epsilon 4 which will produce the same clusters as 4.999... - child_cluster_bounds = HUDC._cluster(points['value'], epsilon_minimum - 1, min_points) + child_cluster_bounds = UDBSCANxH._cluster(points['value'], epsilon_minimum - 1, min_points) child_points = (points[left:right] for left, right in child_cluster_bounds) # but then use epsilon 5 as the new maximum epsilon so that support is calculated from epsilon 4.999... - return [HUDC._traverse_hudc_tree(points, epsilon_minimum, min_points) for points in child_points] + return [UDBSCANxH._traverse_hudc_tree(points, epsilon_minimum, min_points) for points in child_points] @staticmethod - def hudc(array, min_points, max_eps=None, min_eps=None): + def udbscanxh(array, min_points, max_eps=None, min_eps=None): """ - Calculate Hierarchical Density Clusters for a Univariate Array. - - Clusters are identified as suitably dense, continuous regions in the data where the threshold density - is specified by parameters n (minimum points) and eps (epsilon). - The range in values among every set of n points in the array is calculated and compared to epsilon. - Sets of n points with a range equal to, or less than epsilon are classified as subclusters. - Overlapping subclusters are then merged together to form clusters. - Points in the array that do not fall inside a cluster are regarded as noise. - - The HUDC algorithm identifies values of epsilon lower than the initial value, at which previously identified - clusters are separated into smaller 'child' clusters. At each of these splits the 'area' of each parent cluster - is compared to the combined 'area' of it's descendant clusters where a clusters 'area' is the sum total of data - points found within that cluster at each value of epsilon for which that cluster persists (i.e. is not split). - If the area of the parent is larger than that of its combined descendants, then it is selected. If the area of - combined descendants is larger than the parents area, the algorithm is re-run for each of the immediate - child clusters. This process continues until a parent cluster is selected or a terminal cluster (a cluster with - no children) is reached and automatically selected. - - The HUDC algorithm is primarily based on HDBSCAN, see: https://hdbscan.readthedocs.io/en/latest/index.html - - The input array must be sorted in ascending order. - This method returns pairs of indices representing the (half open) upper and lower bounds of each cluster + See documentation for :class: `UDBSCANxH`. - :param array: An array of ints sorted in ascending order + :param array: An array of integers sorted in ascending order :type array: :class:`numpy.ndarray`[int] :param min_points: The minimum number of points allowed in each (sub)cluster :type min_points: int @@ -411,11 +404,11 @@ def hudc(array, min_points, max_eps=None, min_eps=None): :return: An array of paired lower and upper indices for each cluster found in the array :rtype: :class:`numpy.ndarray`[(int, int)] """ - assert HUDC._sorted_ascending(array) + assert UDBSCANxH._sorted_ascending(array) if len(array) < min_points: # not enough points to form a cluster - return np.array([], dtype=UDC._DTYPE_SLICE) + return np.array([], dtype=UDBSCANx._DTYPE_SLICE) else: # create a structured array for tracking value, index and core distance of each individual @@ -424,20 +417,20 @@ def hudc(array, min_points, max_eps=None, min_eps=None): ('core_dist', np.int64)])) points['value'] = array points['index'] = np.arange(len(array), dtype=int) - points['core_dist'] = HUDC._core_distances(array, min_points) + points['core_dist'] = UDBSCANxH._core_distances(array, min_points) if not max_eps: # start at highest observed eps max_eps = np.max(points['core_dist']) if min_eps: # overwrite lower eps - points['core_dist'][points['core_dist'] < min_eps] = min_eps + points['core_dist'] = np.maximum(min_eps, points['core_dist']) # initial splits based on the specified maximum epsilon - child_points = (points[left:right] for left, right in HUDC._cluster(points['value'], max_eps, min_points)) + child_points = (points[left:right] for left, right in UDBSCANxH._cluster(points['value'], max_eps, min_points)) # recursively run on all clusters - clusters = [HUDC._traverse_hudc_tree(points, max_eps, min_points) for points in child_points] - return np.fromiter(HUDC._flatten_list(clusters), dtype=UDC._DTYPE_SLICE) + clusters = [UDBSCANxH._traverse_hudc_tree(points, max_eps, min_points) for points in child_points] + return np.fromiter(UDBSCANxH._flatten_list(clusters), dtype=UDBSCANx._DTYPE_SLICE) def fit(self, array): """ @@ -448,7 +441,7 @@ def fit(self, array): :type array: :class:`numpy.ndarray`[int] """ self.input_array = np.array(array, copy=True) - self.slices = HUDC.hudc(self.input_array, self.min_pts, max_eps=self.max_eps, min_eps=self.min_eps) + self.slices = UDBSCANxH.udbscanxh(self.input_array, self.min_pts, max_eps=self.max_eps, min_eps=self.min_eps) if __name__ == '__main__': diff --git a/tefingerprint/loci.py b/tefingerprint/loci.py index 8f4a6b1..5f4d2bb 100644 --- a/tefingerprint/loci.py +++ b/tefingerprint/loci.py @@ -471,9 +471,9 @@ def fingerprint(self, min_reads, eps, min_eps=0, hierarchical=True): # fit model if hierarchical: - model = cluster.HUDC(min_reads, max_eps=eps, min_eps=min_eps) + model = cluster.UDBSCANxH(min_reads, max_eps=eps, min_eps=min_eps) else: - model = cluster.UDC(min_reads, eps) + model = cluster.UDBSCANx(min_reads, eps) model.fit(tips) # get new loci diff --git a/tests/test_cluster.py b/tests/test_cluster.py index fa6f683..bd1b2d0 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -3,7 +3,7 @@ import pytest import numpy as np import numpy.testing as npt -from tefingerprint.cluster import UDC, HUDC +from tefingerprint.cluster import UDBSCANx, UDBSCANxH class TestUDC: """ @@ -21,29 +21,29 @@ def test_sorted_ascending(self, array, answer): """ Test for hidden method _sorted_ascending. """ - assert UDC._sorted_ascending(array) == answer + assert UDBSCANx._sorted_ascending(array) == answer @pytest.mark.parametrize("slices,melted_slices", # single slice spanning single base [(np.array([(13, 14)], - dtype=UDC._DTYPE_SLICE), + dtype=UDBSCANx._DTYPE_SLICE), np.array([(13, 14)], - dtype=UDC._DTYPE_SLICE)), + dtype=UDBSCANx._DTYPE_SLICE)), # nested slices (this shouldn't happen in practice but is handled correctly) (np.array([(15, 25), (16, 17), (19, 20)], - dtype=UDC._DTYPE_SLICE), + dtype=UDBSCANx._DTYPE_SLICE), np.array([(15, 25)], - dtype=UDC._DTYPE_SLICE)), + dtype=UDBSCANx._DTYPE_SLICE)), # adjacent loci (slices are half open intervals) (np.array([(7, 9), (9, 12)], - dtype=UDC._DTYPE_SLICE), + dtype=UDBSCANx._DTYPE_SLICE), np.array([(7, 9), (9, 12)], - dtype=UDC._DTYPE_SLICE)), + dtype=UDBSCANx._DTYPE_SLICE)), # combined (np.array([(3, 6), (6, 8), (7, 9), (10, 12), (12, 13), (15, 25), (16, 17), (19, 20)], - dtype=UDC._DTYPE_SLICE), + dtype=UDBSCANx._DTYPE_SLICE), np.array([(3, 6), (6, 9), (10, 12), (12, 13), (15, 25)], - dtype=UDC._DTYPE_SLICE)) + dtype=UDBSCANx._DTYPE_SLICE)) ], ids=['single', 'nested', 'adjacent', 'combined']) def test_melt_slices(self, slices, melted_slices): @@ -54,7 +54,7 @@ def test_melt_slices(self, slices, melted_slices): * Adjacent slices do not get merged: (7, 9) & (9, 12) --> (*, 9) & (9, *) * Slice may span a single value: (13, 14) --> (13, 14) """ - npt.assert_array_equal(UDC._melt_slices(slices), melted_slices) + npt.assert_array_equal(UDBSCANx._melt_slices(slices), melted_slices) def test_subcluster(self): """ @@ -63,8 +63,8 @@ def test_subcluster(self): array = np.array([1, 2, 21, 22, 22, 22, 24, 38, 54, 54, 55, 56, 65, 65, 66, 67, 68, 90], dtype=int) slices = np.array([(2, 6), (3, 7), (8, 12), (12, 16), (13, 17)], - dtype=UDC._DTYPE_SLICE) - npt.assert_array_equal(UDC._subcluster(array, 5, 4), slices) + dtype=UDBSCANx._DTYPE_SLICE) + npt.assert_array_equal(UDBSCANx._subcluster(array, 5, 4), slices) def test_flat_cluster(self): """ @@ -74,18 +74,18 @@ def test_flat_cluster(self): sub_slices = np.array([1, 2, 21, 22, 22, 22, 24, 38, 54, 54, 55, 56, 65, 65, 66, 67, 68, 90], dtype=int) slices = np.array([(2, 7), (8, 12), (12, 17)], - dtype=UDC._DTYPE_SLICE) - npt.assert_array_equal(UDC._cluster(sub_slices, 5, 4), slices) + dtype=UDBSCANx._DTYPE_SLICE) + npt.assert_array_equal(UDBSCANx._cluster(sub_slices, 5, 4), slices) @pytest.mark.parametrize("array,slices", [(np.array([], dtype=int), - np.array([], dtype=UDC._DTYPE_SLICE)), + np.array([], dtype=UDBSCANx._DTYPE_SLICE)), (np.array([1], dtype=int), - np.array([], dtype=UDC._DTYPE_SLICE)), + np.array([], dtype=UDBSCANx._DTYPE_SLICE)), (np.array([1, 2, 3, 4], dtype=int), - np.array([], dtype=UDC._DTYPE_SLICE)), + np.array([], dtype=UDBSCANx._DTYPE_SLICE)), (np.array([1, 2, 3, 4, 5], dtype=int), - np.array([(0, 5)], dtype=UDC._DTYPE_SLICE)) + np.array([(0, 5)], dtype=UDBSCANx._DTYPE_SLICE)) ], ids=['0