From 8f101a5acc593b1a655b0c4d5edbb110cd43fce0 Mon Sep 17 00:00:00 2001
From: Tim Millar <tim.millar@plantandfood.co.nz>
Date: Wed, 21 Jun 2017 15:13:44 +1200
Subject: [PATCH 1/9] fix for noise points causing -ve child area

---
 tefingerprint/cluster.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tefingerprint/cluster.py b/tefingerprint/cluster.py
index c3d44c3..e56236a 100644
--- a/tefingerprint/cluster.py
+++ b/tefingerprint/cluster.py
@@ -348,7 +348,9 @@ def _traverse_hudc_tree(points, base_eps, n):
 
         # compare areas
         total_area = np.sum(base_eps - points['eps'])
-        child_area = np.sum(threshold_eps - points['eps'])
+        child_area = threshold_eps - points['eps']
+        child_area[child_area < 0] = 0  # remove negative values caused by noise points
+        child_area = np.sum(child_area)
         parent_area = total_area - child_area
 
         if parent_area > child_area:

From f9134d4f8092c2e3f433f1293f1fa40ac1fdf82a Mon Sep 17 00:00:00 2001
From: Tim Millar <tim.millar@plantandfood.co.nz>
Date: Mon, 26 Jun 2017 17:13:49 +1200
Subject: [PATCH 2/9] correction for calculation of minimum epsilon

---
 tefingerprint/cluster.py | 114 +++++++++++++++++++++------------------
 tests/test_cluster.py    |   8 +--
 2 files changed, 67 insertions(+), 55 deletions(-)

diff --git a/tefingerprint/cluster.py b/tefingerprint/cluster.py
index e56236a..5764c08 100644
--- a/tefingerprint/cluster.py
+++ b/tefingerprint/cluster.py
@@ -228,34 +228,35 @@ def __init__(self, n, max_eps=None, min_eps=None):
         self.input_array = np.array([], dtype=int)
 
     @staticmethod
-    def _point_eps(array, n):
+    def _core_distances(array, min_points):
         """
-        Identify the minimum value of eps for every point in an array of integers.
+        Identify the core distance (minimum value of eps) for every point in an array of integers.
         For each point, the minimum eps is calculated among all subclusters containing that point.
 
         :param array: An array of integers sorted in ascending order
         :type array: :class:`numpy.ndarray`[int]
-        :param n: number of points used to form a subcluster
+        :param min_points: number of points used to form a subcluster
         :type n: int
 
         :return: An array of minimum eps values
         :rtype: :class:`numpy.ndarray`[int]
         """
-        assert n > 1  # groups must contain at least two points
-        offset = n - 1  # offset for indexing
+        assert min_points > 1  # groups must contain at least two points
+        offset = min_points - 1  # offset for indexing because the minimum points includes itself
         length = len(array)
         lower = array[0:length - offset]
         upper = array[offset:length]
         eps_values = upper - lower
-        eps_2d = np.full((n, length), np.max(eps_values), dtype=int)
-        for i in range(n):
+        eps_2d = np.full((min_points, length), np.max(eps_values), dtype=int)
+        for i in range(min_points):
             eps_2d[i, i:length - (offset - i)] = eps_values
         return np.min(eps_2d, axis=0)
 
     @staticmethod
-    def _eps_splits(array, n):
+    def _cluster_forks(array, min_points):
         """
-        Identify eps 'splits' in an array by calculating eps of the gaps between values in the array.
+        Identify eps 'splits' in an array by calculating epsilon of the gaps between values in the array.
+        This is
 
         Eps between values must be calculated as eps of the values either side of a gap may be significantly lower.
         For example when clustering the array of vales:
@@ -273,17 +274,17 @@ def _eps_splits(array, n):
 
         :param array: An array of integers sorted in ascending order
         :type array: :class:`numpy.ndarray`[int]
-        :param n: number of points used to form a subcluster
+        :param min_points: number of points used to form a subcluster
         :type n: int
 
-        :return: An array of eps split values
-        :rtype: :class:`numpy.ndarray`[int]
+        :return: An epsilon value or None
+        :rtype: int
         """
-        if len(array) <= n:
-            # no peaks possible because all points must have the same eps
-            return np.array([], dtype=np.int)
+        if len(array) <= min_points:
+            # no forks possible because all points must have the same eps
+            return None
 
-        offset = n - 1
+        offset = min_points - 1
 
         # calculate split eps using the 2d method
         eps_values = array[offset:] - array[:-offset]
@@ -291,7 +292,6 @@ def _eps_splits(array, n):
         for i in range(offset):
             eps_2d[i, i:len(eps_values) + i] = eps_values
         splits = np.min(eps_2d, axis=0)
-        splits -= 1  # Convert values to thresholds
 
         # Remove plateaus
         gradients = splits[1:] - splits[:-1]
@@ -300,7 +300,15 @@ def _eps_splits(array, n):
         # Remove non-peaks
         is_peak = np.logical_and(np.append(np.array([False]), splits[1:] > splits[:-1]),
                                  np.append(splits[:-1] > splits[1:], np.array([False])))
-        return splits[is_peak]
+        splits = splits[is_peak]
+
+        # If this method calculates epsilon of 5 it means the child cluster starts at epsilon 4.999...
+        if len(splits) == 0:
+            # The cluster does not fork into child clusters at all
+            return None
+        else:
+            # We only require the largest fork value
+            return np.max(splits)
 
     @staticmethod
     def _flatten_list(item):
@@ -322,48 +330,51 @@ def _flatten_list(item):
             yield item
 
     @staticmethod
-    def _traverse_hudc_tree(points, base_eps, n):
+    def _traverse_hudc_tree(points, epsilon_maximum, min_points):
         """
         Traverse a tree of nested density clusters and recursively identify clusters based on their area.
         This method is intimately tied to method 'hudc'.
 
         :param points: An array of points with the slots 'value', 'index' and 'eps
         :type points: :class:`numpy.ndarray`[(int, int, int)]
-        :param base_eps: The maximum distance allowed in among each set of n points
-        :type base_eps: int
-        :param n: The minimum number of points allowed in each (sub)cluster
-        :type n: int
+        :param epsilon_maximum: Maximum distance allowed in among each set of n points
+        :type epsilon_maximum: int
+        :param min_points: The minimum number of points allowed in each (sub)cluster
+        :type min_points: int
 
         :return: A nested list of upper and (half-open) indices of selected clusters
         :rtype: list[list[]]
         """
-        splits = HUDC._eps_splits(points['value'], n)
+        # Values of epsilon bellow which the cluster forks
+        fork_epsilon = HUDC._cluster_forks(points['value'], min_points)
 
-        if len(splits) == 0:
-            # there are no children so return slice indices
+        if fork_epsilon is None:
+            # The cluster doesn't fork so it has no children
+            # Epsilon_minimum would equal the minimum of core distances but it's not neededE
             return points['index'][0], points['index'][-1] + 1
 
-        # compare based on largest peak (least dense place)
-        threshold_eps = np.max(splits)
+        # If a cluster forks into children then it's minimum epsilon is the value at which forks
+        epsilon_minimum = fork_epsilon
 
-        # compare areas
-        total_area = np.sum(base_eps - points['eps'])
-        child_area = threshold_eps - points['eps']
-        child_area[child_area < 0] = 0  # remove negative values caused by noise points
-        child_area = np.sum(child_area)
-        parent_area = total_area - child_area
+        # Compare support for cluster and its children
+        support = np.sum(epsilon_maximum - np.maximum(epsilon_minimum, points['core_dist']))
+        support_children = np.sum(np.maximum(0, epsilon_minimum - points['core_dist']))
 
-        if parent_area > child_area:
-            # parent is lager so return slice indices
+        if support > support_children:
+            # Parent is supported so return slice indices
             return points['index'][0], points['index'][-1] + 1
 
         else:
-            # combined area of children is larger so divide and repeat
-            child_points = (points[left:right] for left, right in HUDC._cluster(points['value'], threshold_eps, n))
-            return [HUDC._traverse_hudc_tree(points, threshold_eps, n) for points in child_points]
+            # Combined support of children is larger so divide and repeat recursively:
+            # A minimum epsilon of 5 means the child clusters technically starts at epsilon 4.999...
+            # we calculate the child clusters using epsilon 4 which will produce the same clusters as 4.999...
+            child_cluster_bounds = HUDC._cluster(points['value'], epsilon_minimum - 1,  min_points)
+            child_points = (points[left:right] for left, right in child_cluster_bounds)
+            # but then use epsilon 5 as the new maximum epsilon so that support is calculated from epsilon 4.999...
+            return [HUDC._traverse_hudc_tree(points, epsilon_minimum, min_points) for points in child_points]
 
     @staticmethod
-    def hudc(array, n, max_eps=None, min_eps=None):
+    def hudc(array, min_points, max_eps=None, min_eps=None):
         """
         Calculate Hierarchical Density Clusters for a Univariate Array.
 
@@ -390,8 +401,8 @@ def hudc(array, n, max_eps=None, min_eps=None):
 
         :param array: An array of ints sorted in ascending order
         :type array: :class:`numpy.ndarray`[int]
-        :param n: The minimum number of points allowed in each (sub)cluster
-        :type n: int
+        :param min_points: The minimum number of points allowed in each (sub)cluster
+        :type min_points: int
         :param max_eps: An optional value for maximum distance allowed among each set of n points
         :type max_eps: int
         :param min_eps: An optional value for the minimum value of eps to be used when calculating cluster depth
@@ -402,29 +413,30 @@ def hudc(array, n, max_eps=None, min_eps=None):
         """
         assert HUDC._sorted_ascending(array)
 
-        if len(array) < n:
+        if len(array) < min_points:
             # not enough points to form a cluster
             return np.array([], dtype=UDC._DTYPE_SLICE)
 
         else:
+            # create a structured array for tracking value, index and core distance of each individual
             points = np.empty(len(array), dtype=np.dtype([('value', np.int64),
                                                           ('index', np.int64),
-                                                          ('eps', np.int64)]))
+                                                          ('core_dist', np.int64)]))
             points['value'] = array
             points['index'] = np.arange(len(array), dtype=int)
-            points['eps'] = HUDC._point_eps(array, n)
+            points['core_dist'] = HUDC._core_distances(array, min_points)
             if not max_eps:
                 # start at highest observed eps
-                max_eps = np.max(points['eps'])
+                max_eps = np.max(points['core_dist'])
             if min_eps:
                 # overwrite lower eps
-                points['eps'][points['eps'] < min_eps] = min_eps
+                points['core_dist'][points['core_dist'] < min_eps] = min_eps
 
-            # initial splits
-            child_points = (points[left:right] for left, right in HUDC._cluster(points['value'], max_eps, n))
+            # initial splits based on the specified maximum epsilon
+            child_points = (points[left:right] for left, right in HUDC._cluster(points['value'], max_eps, min_points))
 
-            # run
-            clusters = [HUDC._traverse_hudc_tree(points, max_eps, n) for points in child_points]
+            # recursively run on all clusters
+            clusters = [HUDC._traverse_hudc_tree(points, max_eps, min_points) for points in child_points]
             return np.fromiter(HUDC._flatten_list(clusters), dtype=UDC._DTYPE_SLICE)
 
     def fit(self, array):
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index 6a03f12..fa6f683 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -172,7 +172,7 @@ def test_point_eps(self):
                                54,  54,  54,  54,  54,  54,  54,  54, 106, 106, 106, 106, 123,
                               124, 124, 257, 228, 228, 186, 165, 144,  97,  89,  89,  89,  89,
                                89,  89,  89,  89,  89,  89], dtype=int)
-        npt.assert_array_equal(HUDC._point_eps(input_array, 10), point_eps)
+        npt.assert_array_equal(HUDC._core_distances(input_array, 10), point_eps)
 
     def test_point_eps_hidden_peak(self):
         """
@@ -182,7 +182,7 @@ def test_point_eps_hidden_peak(self):
         """
         input_array = np.array([0, 0, 3, 4, 4, 6, 26, 28, 28, 29, 32, 32], dtype=int)
         point_eps = np.array([3, 3, 1, 1, 1, 2, 2, 1, 1, 1, 3, 3], dtype=int)
-        npt.assert_array_equal(HUDC._point_eps(input_array, 3), point_eps)
+        npt.assert_array_equal(HUDC._core_distances(input_array, 3), point_eps)
 
     def test_eps_splits(self):
         """
@@ -199,7 +199,7 @@ def test_eps_splits(self):
                                 1217, 1234, 1234, 1591, 1620, 1620, 1662, 1686, 1707, 1755, 1828,
                                 1828, 1848, 1848, 1848, 1848, 1851, 1851, 1852, 1917], dtype=int)
         split_eps = np.array([122,  42, 453, 436], dtype=int)
-        npt.assert_array_equal(HUDC._eps_splits(input_array, 10), split_eps)
+        npt.assert_array_equal(HUDC._cluster_forks(input_array, 10), split_eps)
 
     def test_eps_splits_hidden_peak(self):
         """
@@ -209,7 +209,7 @@ def test_eps_splits_hidden_peak(self):
         """
         input_array = np.array([0, 0, 3, 4, 4, 6, 26, 28, 28, 29, 32, 32], dtype=int)
         split_eps = np.array([21], dtype=int)
-        npt.assert_array_equal(HUDC._eps_splits(input_array, 3), split_eps)
+        npt.assert_array_equal(HUDC._cluster_forks(input_array, 3), split_eps)
 
     @pytest.mark.parametrize("nested,flat",
                              [([(1, 3), [(6, 9), (11, 11)]],

From 4de59e94c90221ec8ef5cd4e9c8eafbe6bf5b2cd Mon Sep 17 00:00:00 2001
From: Tim Millar <tim.millar@plantandfood.co.nz>
Date: Tue, 27 Jun 2017 11:01:19 +1200
Subject: [PATCH 3/9] update clustring docstrings/variables to reflect
 terminology of Campello 2015

---
 tefingerprint/cluster.py | 211 +++++++++++++++++++--------------------
 tefingerprint/loci.py    |   4 +-
 tests/test_cluster.py    |  78 +++++++--------
 3 files changed, 143 insertions(+), 150 deletions(-)

diff --git a/tefingerprint/cluster.py b/tefingerprint/cluster.py
index 5764c08..7bfa906 100644
--- a/tefingerprint/cluster.py
+++ b/tefingerprint/cluster.py
@@ -3,16 +3,20 @@
 import numpy as np
 
 
-class UDC(object):
+class UDBSCANx(object):
     """
-    Univariate Density Cluster analysis: Create a model to calculate density clusters for a univariate array.
+    Univariate implimentation of DBSCAN*.
+
+    This is an implementation of the DBSCAN* (Campello et al 2015) clustering method for use on
+    a sorted (in ascending order) univariate array of integers.
 
     Clusters are identified as suitably dense, continuous regions in the data where the threshold density
-    is specified by parameters n (minimum points) and eps (epsilon).
-    The range in values among every set of n points in the array is calculated and compared to epsilon.
+    is specified by parameters minimum points and epsilon. Points in the array that do not fall
+    inside a cluster are classified as noise points.
+
+    The range in values among every set of min-points in the array is calculated and compared to epsilon.
     Sets of n points with a range equal to, or less than epsilon are classified as subclusters.
     Overlapping subclusters are then merged together to form clusters.
-    Points in the array that do not fall inside a cluster are regarded as noise.
 
     :param n: The minimum number of points allowed in each (sub)cluster
     :type n: int
@@ -22,10 +26,10 @@ class UDC(object):
 
     _DTYPE_SLICE = np.dtype([('lower', np.int64), ('upper', np.int64)])
 
-    def __init__(self, n, eps):
-        self.n = n
-        self.eps = eps
-        self.slices = np.array([], dtype=UDC._DTYPE_SLICE)
+    def __init__(self, min_points, epsilon):
+        self.min_points = min_points
+        self.epsilon = epsilon
+        self.slices = np.array([], dtype=UDBSCANx._DTYPE_SLICE)
         self.input_array = np.array([], dtype=int)
 
     @staticmethod
@@ -72,79 +76,68 @@ def _melter(lowers, uppers, splits):
             yield l, u
 
         # return slices formatted as a numpy array
-        return np.fromiter(_melter(lowers, uppers, splits), dtype=UDC._DTYPE_SLICE)
-
+        return np.fromiter(_melter(lowers, uppers, splits), dtype=UDBSCANx._DTYPE_SLICE)
 
     @staticmethod
-    def _subcluster(array, eps, n):
+    def _subcluster(array, epsilon, min_points):
         """
         Calculate subclusters of an array and return their slice indices.
         The input array must be sorted in ascending order.
         If n is greater than the length of the array, an empty array is returned.
 
         :param array: An array of ints sorted in ascending order
-        :param eps: The number of points in each subcluster
-        :param n: The maximum distance allowed in among points of each subcluster
+        :param epsilon: The number of points in each subcluster
+        :param min_points: The maximum distance allowed in among points of each subcluster
 
         :return: An array of paired lower and upper indices for each subcluster found in the array
         :rtype: :class:`numpy.ndarray`[(int, int)]
         """
-        assert UDC._sorted_ascending(array)
-        offset = n - 1
+        assert UDBSCANx._sorted_ascending(array)
+        offset = min_points - 1
         upper = array[offset:]
         lower = array[:-offset]
-        selected = upper - lower <= eps
+        selected = upper - lower <= epsilon
         lower_index = np.arange(0, len(lower))[selected]
         upper_index = np.arange(offset, len(array))[selected] + 1
-        return np.fromiter(zip(lower_index, upper_index), dtype=UDC._DTYPE_SLICE)
+        return np.fromiter(zip(lower_index, upper_index), dtype=UDBSCANx._DTYPE_SLICE)
 
     @staticmethod
-    def _cluster(array, eps, n):
+    def _cluster(array, epsilon, min_points):
         """
         Calculate clusters of an array and return their slice indices.
         The input array must be sorted in ascending order.
         If n is greater than the length of the array, an empty array is returned.
 
         :param array: An array of ints sorted in ascending order
-        :param eps: The minimum number of points in each (sub)cluster
-        :param n: The maximum distance allowed in among each set of n points
+        :param epsilon: The minimum number of points in each (sub)cluster
+        :param min_points: The maximum distance allowed in among each set of n points
 
         :return: An array of paired lower and upper indices for each cluster found in the array
         :rtype: :class:`numpy.ndarray`[(int, int)]
         """
         # sorted-ascending checked in method _subcluster
-        slices = UDC._subcluster(array, eps, n)
+        slices = UDBSCANx._subcluster(array, epsilon, min_points)
         if len(slices) > 1:
-            slices = UDC._melt_slices(slices)
+            slices = UDBSCANx._melt_slices(slices)
         return slices
 
     @staticmethod
-    def udc(array, n, eps):
+    def udbscanx(array, min_points, epsilon):
         """
-        Calculate Density Clusters for a Univariate Array.
-
-        Clusters are identified as suitably dense, continuous regions in the data where the threshold density
-        is specified by parameters n (minimum points) and eps (epsilon).
-        The range in values among every set of n points in the array is calculated and compared to epsilon.
-        Sets of n points with a range equal to, or less than epsilon are classified as subclusters.
-        Overlapping subclusters are then merged together to form clusters.
-        Points in the array that do not fall inside a cluster are regarded as noise.
-
-        The input array must be sorted in ascending order.
-        This method returns pairs of indices representing the (half open) upper and lower bounds of each cluster.
+        See documentation for :class: `UDBSCANx`.
 
         :param array: An array of ints sorted in ascending order
         :type array: :class:`numpy.ndarray`[int]
-        :param n: The minimum number of points in each (sub)cluster
+        :param min_points: The minimum number of points in each (sub)cluster
         :type n: int
-        :param eps: The maximum distance allowed in among each set of n points
-        :type eps: int
+        :param epsilon: The maximum distance allowed in among each set of n points
+        :type epsilon: int
 
         :return: An array of paired lower and upper indices for each cluster found in the array
         :rtype: :class:`numpy.ndarray`[int, int]
         """
-        assert UDC._sorted_ascending(array)
-        slices = UDC._cluster(array, eps, n)
+        assert UDBSCANx._sorted_ascending(array)
+        slices = UDBSCANx._cluster(array, epsilon, min_points)
         return slices
 
     def fit(self, array):
@@ -156,7 +149,7 @@ def fit(self, array):
         :type array: :class:`numpy.ndarray`[int]
         """
         self.input_array = np.array(array, copy=True)
-        self.slices = UDC.udc(self.input_array, self.n, self.eps)
+        self.slices = UDBSCANx.udbscanx(self.input_array, self.min_points, self.epsilon)
 
     def clusters(self):
         """
@@ -192,46 +185,66 @@ def labels(self):
         return labels
 
 
-class HUDC(UDC):
+class UDBSCANxH(UDBSCANx):
     """
-    Hierarchical Univariate Density Cluster analysis: A model to calculate density clusters for a univariate array.
+    Univariate DBSCAN* with Hierarchical component.
 
-    Clusters are identified as suitably dense, continuous regions in the data where the threshold density
-    is specified by parameters n (minimum points) and eps (epsilon).
-    The range in values among every set of n points in the array is calculated and compared to epsilon.
-    Sets of n points with a range equal to, or less than epsilon are classified as subclusters.
-    Overlapping subclusters are then merged together to form clusters.
-    Points in the array that do not fall inside a cluster are regarded as noise.
+    This is a variation of the DBSCAN*/HDBSCAN* (Campello et al 2015) clustering methods which allows some flexibility
+    when calculating density based clusters. While it has a hierarchical component it differs significantly from
+    HDBSCAN* in how cluster support is calculated and clusters are selected.
 
-    The HUDC algorithm identifies values of epsilon lower than the initial value, at which previously identified
-    clusters are separated into smaller 'child' clusters. At each of these splits the 'area' of each parent cluster
-    is compared to the combined 'area' of it's descendant clusters where a clusters 'area' is the sum total of data
-    points found within that cluster at each value of epsilon for which that cluster persists (i.e. is not split).
-    If the area of the parent is larger than that of its combined descendants, then it is selected. If the area of
-    combined descendants is larger than the parents area, the algorithm is re-run for each of the immediate
-    child clusters. This process continues until a parent cluster is selected or a terminal cluster (a cluster with
-    no children) is reached and automatically selected.
+    This implementation is for use on a sorted (in ascending order) univariate array of integers.
 
-    :param n: The minimum number of points allowed in each (sub)cluster
-    :type n: int
-    :param max_eps: An optional value for maximum distance allowed among each set of n points
+    This method is useful when there is some existing knowledge about the expected density of informative clusters
+    that are present in the data. It preferred over DBSCAN* because it allows *some* flexibility in the density of
+    clusters identified. However its flexibility is much more limited than HDBSCAN* to ensure that clusters are of
+    similar density to one another.
+
+    Using the definitions of Campello et al 2015, this method differs from HDBSCAN* when calculating support for
+    clusters in that support of a cluster is calculated as:
+
+    .. math::
+        (\textbf{C}_i) = \sum_{\textbf{x}_j \in \textbf{C}_i} \varepsilon_{\text{max}}(\textbf{C}_i) - \varepsilon_{\text{min}}(\textbf{x}_j, \textbf{C}_i)
+
+    where:
+
+    .. math::
+        \varepsilon_{\text{min}}(\textbf{x}_j, \textbf{C}_i)
+
+    is calculated as:
+
+    .. math::
+        \text{max}\{d_\text{core}(\textbf{x}_j) , \varepsilon_{\text{min}}(\textbf{C}_i) \}
+
+
+    Clusters selection is performed in a top down manner from the root of the tree. A given cluster is selected if
+    its support is greater than the sum of support for all of its decedent clusters. A cluster cannot be selected
+    if one of its ancestor clusters has been selected. This process results in a set of flat (non-overlapping) clusters.
+
+    The search space may be constrained by setting global maximum and minimum allowable values of epsilon.
+    If a maximum value for epsilon is selected, the search algorithm will start at the specified value. If a
+    minimum value for epsilon is selected, core distances bellow the specified level are increased to that level.
+
+    :param min_points: The minimum number of points allowed in each cluster
+    :type min_points: int
+    :param max_eps: An optional value for maximum distance allowed among each set of min-points
     :type max_eps: int
     :param min_eps: An optional value for the minimum value of eps to be used when calculating cluster depth
     :type min_eps: int
     """
 
-    def __init__(self, n, max_eps=None, min_eps=None):
-        self.min_pts = n
+    def __init__(self, min_points, max_eps=None, min_eps=None):
+        self.min_pts = min_points
         self.max_eps = max_eps
         self.min_eps = min_eps
-        self.slices = np.array([], dtype=UDC._DTYPE_SLICE)
+        self.slices = np.array([], dtype=UDBSCANx._DTYPE_SLICE)
         self.input_array = np.array([], dtype=int)
 
     @staticmethod
     def _core_distances(array, min_points):
         """
-        Identify the core distance (minimum value of eps) for every point in an array of integers.
-        For each point, the minimum eps is calculated among all subclusters containing that point.
+        Identify the core distance (minimum value of epsilon) for every point in an array of integers.
+        For each point, the minimum epsilon is calculated among all subclusters containing that point.
 
         :param array: An array of integers sorted in ascending order
         :type array: :class:`numpy.ndarray`[int]
@@ -253,20 +266,21 @@ def _core_distances(array, min_points):
         return np.min(eps_2d, axis=0)
 
     @staticmethod
-    def _cluster_forks(array, min_points):
+    def _fork_epsilon(array, min_points):
         """
         Identify eps 'splits' in an array by calculating epsilon of the gaps between values in the array.
-        This is
 
-        Eps between values must be calculated as eps of the values either side of a gap may be significantly lower.
-        For example when clustering the array of vales:
+        Identifies the minimum epsilon of a cluster prior to it forking into child clusters.
+        If the cluster does not fork into children then None is returned. Note that in this cas the minimum epsilon
+        of the cluster is then equal to the minimum core distance of points in that cluster.
+
+        Example when clustering the array of vales:
             [0, 0, 3, 4, 4, 6, 26, 28, 28, 29, 32, 32]
-        with n = 3 (subclusters of 3 points). The eps calculated for values 6 and 26 is 2 and 2 respectively.
-        However the minimum eps value calculated for subclusters that include both values is 21. Therefore, the eps
-        split is 'hidden' if eps is only calculated on a per point basis.
+        with min-points = 3 The core distance calculated for values 6 and 26 is 2 and 2 respectively.
+        However the minimum eps value calculated for the cluster that include both values is 22 (26 - 4 or 28 - 6).
 
         Once eps is calculated for all gaps between values in an array, peaks are identified and classified as 'splits'.
-        Splits are points at which a parent clusters are split into child clusters.
+        Splits are points at which a parent cluster are split into child clusters.
 
         Eps of splits is reduced by a constant of 1 in order to be used as a threshold value. This is because an eps
         value of 21 implies that at eps = 21 a valid subcluster is formed. Therefore eps = 20 is the value at which
@@ -324,7 +338,7 @@ def _flatten_list(item):
         """
         if isinstance(item, list):
             for element in item:
-                for item in HUDC._flatten_list(element):
+                for item in UDBSCANxH._flatten_list(element):
                     yield item
         else:
             yield item
@@ -346,7 +360,7 @@ def _traverse_hudc_tree(points, epsilon_maximum, min_points):
         :rtype: list[list[]]
         """
         # Values of epsilon bellow which the cluster forks
-        fork_epsilon = HUDC._cluster_forks(points['value'], min_points)
+        fork_epsilon = UDBSCANxH._fork_epsilon(points['value'], min_points)
 
         if fork_epsilon is None:
             # The cluster doesn't fork so it has no children
@@ -368,38 +382,17 @@ def _traverse_hudc_tree(points, epsilon_maximum, min_points):
             # Combined support of children is larger so divide and repeat recursively:
             # A minimum epsilon of 5 means the child clusters technically starts at epsilon 4.999...
             # we calculate the child clusters using epsilon 4 which will produce the same clusters as 4.999...
-            child_cluster_bounds = HUDC._cluster(points['value'], epsilon_minimum - 1,  min_points)
+            child_cluster_bounds = UDBSCANxH._cluster(points['value'], epsilon_minimum - 1, min_points)
             child_points = (points[left:right] for left, right in child_cluster_bounds)
             # but then use epsilon 5 as the new maximum epsilon so that support is calculated from epsilon 4.999...
-            return [HUDC._traverse_hudc_tree(points, epsilon_minimum, min_points) for points in child_points]
+            return [UDBSCANxH._traverse_hudc_tree(points, epsilon_minimum, min_points) for points in child_points]
 
     @staticmethod
-    def hudc(array, min_points, max_eps=None, min_eps=None):
+    def udbscanxh(array, min_points, max_eps=None, min_eps=None):
         """
-        Calculate Hierarchical Density Clusters for a Univariate Array.
-
-        Clusters are identified as suitably dense, continuous regions in the data where the threshold density
-        is specified by parameters n (minimum points) and eps (epsilon).
-        The range in values among every set of n points in the array is calculated and compared to epsilon.
-        Sets of n points with a range equal to, or less than epsilon are classified as subclusters.
-        Overlapping subclusters are then merged together to form clusters.
-        Points in the array that do not fall inside a cluster are regarded as noise.
-
-        The HUDC algorithm identifies values of epsilon lower than the initial value, at which previously identified
-        clusters are separated into smaller 'child' clusters. At each of these splits the 'area' of each parent cluster
-        is compared to the combined 'area' of it's descendant clusters where a clusters 'area' is the sum total of data
-        points found within that cluster at each value of epsilon for which that cluster persists (i.e. is not split).
-        If the area of the parent is larger than that of its combined descendants, then it is selected. If the area of
-        combined descendants is larger than the parents area, the algorithm is re-run for each of the immediate
-        child clusters. This process continues until a parent cluster is selected or a terminal cluster (a cluster with
-        no children) is reached and automatically selected.
-
-        The HUDC algorithm is primarily based on HDBSCAN, see: https://hdbscan.readthedocs.io/en/latest/index.html
-
-        The input array must be sorted in ascending order.
-        This method returns pairs of indices representing the (half open) upper and lower bounds of each cluster
+        See documentation for :class: `UDBSCANxH`.
 
-        :param array: An array of ints sorted in ascending order
+        :param array: An array of integers sorted in ascending order
         :type array: :class:`numpy.ndarray`[int]
         :param min_points: The minimum number of points allowed in each (sub)cluster
         :type min_points: int
@@ -411,11 +404,11 @@ def hudc(array, min_points, max_eps=None, min_eps=None):
         :return: An array of paired lower and upper indices for each cluster found in the array
         :rtype: :class:`numpy.ndarray`[(int, int)]
         """
-        assert HUDC._sorted_ascending(array)
+        assert UDBSCANxH._sorted_ascending(array)
 
         if len(array) < min_points:
             # not enough points to form a cluster
-            return np.array([], dtype=UDC._DTYPE_SLICE)
+            return np.array([], dtype=UDBSCANx._DTYPE_SLICE)
 
         else:
             # create a structured array for tracking value, index and core distance of each individual
@@ -424,20 +417,20 @@ def hudc(array, min_points, max_eps=None, min_eps=None):
                                                           ('core_dist', np.int64)]))
             points['value'] = array
             points['index'] = np.arange(len(array), dtype=int)
-            points['core_dist'] = HUDC._core_distances(array, min_points)
+            points['core_dist'] = UDBSCANxH._core_distances(array, min_points)
             if not max_eps:
                 # start at highest observed eps
                 max_eps = np.max(points['core_dist'])
             if min_eps:
                 # overwrite lower eps
-                points['core_dist'][points['core_dist'] < min_eps] = min_eps
+                points['core_dist'] = np.maximum(min_eps, points['core_dist'])
 
             # initial splits based on the specified maximum epsilon
-            child_points = (points[left:right] for left, right in HUDC._cluster(points['value'], max_eps, min_points))
+            child_points = (points[left:right] for left, right in UDBSCANxH._cluster(points['value'], max_eps, min_points))
 
             # recursively run on all clusters
-            clusters = [HUDC._traverse_hudc_tree(points, max_eps, min_points) for points in child_points]
-            return np.fromiter(HUDC._flatten_list(clusters), dtype=UDC._DTYPE_SLICE)
+            clusters = [UDBSCANxH._traverse_hudc_tree(points, max_eps, min_points) for points in child_points]
+            return np.fromiter(UDBSCANxH._flatten_list(clusters), dtype=UDBSCANx._DTYPE_SLICE)
 
     def fit(self, array):
         """
@@ -448,7 +441,7 @@ def fit(self, array):
         :type array: :class:`numpy.ndarray`[int]
         """
         self.input_array = np.array(array, copy=True)
-        self.slices = HUDC.hudc(self.input_array, self.min_pts, max_eps=self.max_eps, min_eps=self.min_eps)
+        self.slices = UDBSCANxH.udbscanxh(self.input_array, self.min_pts, max_eps=self.max_eps, min_eps=self.min_eps)
 
 
 if __name__ == '__main__':
diff --git a/tefingerprint/loci.py b/tefingerprint/loci.py
index 8f4a6b1..5f4d2bb 100644
--- a/tefingerprint/loci.py
+++ b/tefingerprint/loci.py
@@ -471,9 +471,9 @@ def fingerprint(self, min_reads, eps, min_eps=0, hierarchical=True):
 
             # fit model
             if hierarchical:
-                model = cluster.HUDC(min_reads, max_eps=eps, min_eps=min_eps)
+                model = cluster.UDBSCANxH(min_reads, max_eps=eps, min_eps=min_eps)
             else:
-                model = cluster.UDC(min_reads, eps)
+                model = cluster.UDBSCANx(min_reads, eps)
             model.fit(tips)
 
             # get new loci
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index fa6f683..bd1b2d0 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -3,7 +3,7 @@
 import pytest
 import numpy as np
 import numpy.testing as npt
-from tefingerprint.cluster import UDC, HUDC
+from tefingerprint.cluster import UDBSCANx, UDBSCANxH
 
 class TestUDC:
     """
@@ -21,29 +21,29 @@ def test_sorted_ascending(self, array, answer):
         """
         Test for hidden method _sorted_ascending.
         """
-        assert UDC._sorted_ascending(array) == answer
+        assert UDBSCANx._sorted_ascending(array) == answer
 
     @pytest.mark.parametrize("slices,melted_slices",
                              # single slice spanning single base
                              [(np.array([(13, 14)],
-                                        dtype=UDC._DTYPE_SLICE),
+                                        dtype=UDBSCANx._DTYPE_SLICE),
                                np.array([(13, 14)],
-                                        dtype=UDC._DTYPE_SLICE)),
+                                        dtype=UDBSCANx._DTYPE_SLICE)),
                               # nested slices (this shouldn't happen in practice but is handled correctly)
                               (np.array([(15, 25), (16, 17), (19, 20)],
-                                        dtype=UDC._DTYPE_SLICE),
+                                        dtype=UDBSCANx._DTYPE_SLICE),
                                np.array([(15, 25)],
-                                        dtype=UDC._DTYPE_SLICE)),
+                                        dtype=UDBSCANx._DTYPE_SLICE)),
                               # adjacent loci (slices are half open intervals)
                               (np.array([(7, 9), (9, 12)],
-                                        dtype=UDC._DTYPE_SLICE),
+                                        dtype=UDBSCANx._DTYPE_SLICE),
                                np.array([(7, 9), (9, 12)],
-                                        dtype=UDC._DTYPE_SLICE)),
+                                        dtype=UDBSCANx._DTYPE_SLICE)),
                               # combined
                               (np.array([(3, 6), (6, 8), (7, 9), (10, 12), (12, 13), (15, 25), (16, 17), (19, 20)],
-                                        dtype=UDC._DTYPE_SLICE),
+                                        dtype=UDBSCANx._DTYPE_SLICE),
                                np.array([(3, 6), (6, 9), (10, 12), (12, 13), (15, 25)],
-                                        dtype=UDC._DTYPE_SLICE))
+                                        dtype=UDBSCANx._DTYPE_SLICE))
                               ],
                              ids=['single', 'nested', 'adjacent', 'combined'])
     def test_melt_slices(self, slices, melted_slices):
@@ -54,7 +54,7 @@ def test_melt_slices(self, slices, melted_slices):
          * Adjacent slices do not get merged: (7, 9) & (9, 12) -->  (*, 9) & (9, *)
          * Slice may span a single value: (13, 14) --> (13, 14)
         """
-        npt.assert_array_equal(UDC._melt_slices(slices), melted_slices)
+        npt.assert_array_equal(UDBSCANx._melt_slices(slices), melted_slices)
 
     def test_subcluster(self):
         """
@@ -63,8 +63,8 @@ def test_subcluster(self):
         array = np.array([1, 2, 21, 22, 22, 22, 24, 38, 54, 54, 55, 56, 65, 65, 66, 67, 68, 90],
                          dtype=int)
         slices = np.array([(2, 6), (3, 7), (8, 12), (12, 16), (13, 17)],
-                          dtype=UDC._DTYPE_SLICE)
-        npt.assert_array_equal(UDC._subcluster(array, 5, 4), slices)
+                          dtype=UDBSCANx._DTYPE_SLICE)
+        npt.assert_array_equal(UDBSCANx._subcluster(array, 5, 4), slices)
 
     def test_flat_cluster(self):
         """
@@ -74,18 +74,18 @@ def test_flat_cluster(self):
         sub_slices = np.array([1, 2, 21, 22, 22, 22, 24, 38, 54, 54, 55, 56, 65, 65, 66, 67, 68, 90],
                               dtype=int)
         slices = np.array([(2, 7), (8, 12), (12, 17)],
-                          dtype=UDC._DTYPE_SLICE)
-        npt.assert_array_equal(UDC._cluster(sub_slices, 5, 4), slices)
+                          dtype=UDBSCANx._DTYPE_SLICE)
+        npt.assert_array_equal(UDBSCANx._cluster(sub_slices, 5, 4), slices)
 
     @pytest.mark.parametrize("array,slices",
                              [(np.array([], dtype=int),
-                               np.array([], dtype=UDC._DTYPE_SLICE)),
+                               np.array([], dtype=UDBSCANx._DTYPE_SLICE)),
                               (np.array([1], dtype=int),
-                               np.array([], dtype=UDC._DTYPE_SLICE)),
+                               np.array([], dtype=UDBSCANx._DTYPE_SLICE)),
                               (np.array([1, 2, 3, 4], dtype=int),
-                               np.array([], dtype=UDC._DTYPE_SLICE)),
+                               np.array([], dtype=UDBSCANx._DTYPE_SLICE)),
                               (np.array([1, 2, 3, 4, 5], dtype=int),
-                               np.array([(0, 5)], dtype=UDC._DTYPE_SLICE))
+                               np.array([(0, 5)], dtype=UDBSCANx._DTYPE_SLICE))
                               ],
                              ids=['0<n', '1<n', '4<n', '5==n'])
     def test_udc_few_reads(self, array, slices):
@@ -94,7 +94,7 @@ def test_udc_few_reads(self, array, slices):
         Method udc should correctly handle an array of length 0 or greater.
         If the length of the array is less than 'n' then an empty array will always be returned.
         """
-        npt.assert_array_equal(UDC.udc(array, 5, 5), slices)
+        npt.assert_array_equal(UDBSCANx.udbscanx(array, 5, 5), slices)
 
     def test_fit(self):
         """
@@ -103,13 +103,13 @@ def test_fit(self):
         New copy of points should be sorted.
         Most edge cases should be caught in tests for component methods.
         """
-        udc_object = UDC(4, 5)
+        udc_object = UDBSCANx(4, 5)
         input_array = np.array([1, 2, 21, 22, 22, 22, 24, 38, 54, 54, 55, 56, 65, 65, 66, 67, 68, 90],
                                dtype=int)
         answer_array = np.array([1, 2, 21, 22, 22, 22, 24, 38, 54, 54, 55, 56, 65, 65, 66, 67, 68, 90],
                                 dtype=int)
         answer_slices = np.array([(2, 7), (8, 12), (12, 17)],
-                                 dtype=UDC._DTYPE_SLICE)
+                                 dtype=UDBSCANx._DTYPE_SLICE)
         udc_object.fit(input_array)
         assert udc_object.input_array is not input_array
         npt.assert_array_equal(udc_object.input_array, answer_array)
@@ -119,7 +119,7 @@ def test_clusters(self):
         """
         Test for method clusters.
         """
-        udc_object = UDC(4, 5)
+        udc_object = UDBSCANx(4, 5)
         input_array = np.array([1, 2, 21, 22, 22, 22, 24, 38, 54, 54, 55, 56, 65, 65, 66, 67, 68, 90], dtype=int)
         cluster_arrays = [np.array([21, 22, 22, 22, 24], dtype=int),
                           np.array([54, 54, 55, 56], dtype=int),
@@ -130,7 +130,7 @@ def test_clusters(self):
 
     def test_cluster_extremities(self):
         """Test for method cluster_extremities"""
-        udc_object = UDC(4, 5)
+        udc_object = UDBSCANx(4, 5)
         input_array = np.array([1, 2, 21, 22, 22, 22, 24, 38, 54, 54, 55, 56, 65, 65, 66, 67, 68, 90], dtype=int)
         cluster_extremities = [(21, 24), (54, 56), (65, 68)]
         udc_object.fit(input_array)
@@ -139,7 +139,7 @@ def test_cluster_extremities(self):
 
     def test_labels(self):
         """Test for method labels"""
-        udc_object = UDC(4, 5)
+        udc_object = UDBSCANx(4, 5)
         input_array = np.array([1, 2, 21, 22, 22, 22, 24, 38, 54, 54, 55, 56, 65, 65, 66, 67, 68, 90], dtype=int)
         labels = np.array([-1, -1, 0, 0, 0, 0, 0, -1, 1, 1, 1, 1, 2, 2, 2, 2, 2, -1], dtype=int)
         udc_object.fit(input_array)
@@ -172,7 +172,7 @@ def test_point_eps(self):
                                54,  54,  54,  54,  54,  54,  54,  54, 106, 106, 106, 106, 123,
                               124, 124, 257, 228, 228, 186, 165, 144,  97,  89,  89,  89,  89,
                                89,  89,  89,  89,  89,  89], dtype=int)
-        npt.assert_array_equal(HUDC._core_distances(input_array, 10), point_eps)
+        npt.assert_array_equal(UDBSCANxH._core_distances(input_array, 10), point_eps)
 
     def test_point_eps_hidden_peak(self):
         """
@@ -182,7 +182,7 @@ def test_point_eps_hidden_peak(self):
         """
         input_array = np.array([0, 0, 3, 4, 4, 6, 26, 28, 28, 29, 32, 32], dtype=int)
         point_eps = np.array([3, 3, 1, 1, 1, 2, 2, 1, 1, 1, 3, 3], dtype=int)
-        npt.assert_array_equal(HUDC._core_distances(input_array, 3), point_eps)
+        npt.assert_array_equal(UDBSCANxH._core_distances(input_array, 3), point_eps)
 
     def test_eps_splits(self):
         """
@@ -199,7 +199,7 @@ def test_eps_splits(self):
                                 1217, 1234, 1234, 1591, 1620, 1620, 1662, 1686, 1707, 1755, 1828,
                                 1828, 1848, 1848, 1848, 1848, 1851, 1851, 1852, 1917], dtype=int)
         split_eps = np.array([122,  42, 453, 436], dtype=int)
-        npt.assert_array_equal(HUDC._cluster_forks(input_array, 10), split_eps)
+        npt.assert_array_equal(UDBSCANxH._fork_epsilon(input_array, 10), split_eps)
 
     def test_eps_splits_hidden_peak(self):
         """
@@ -209,7 +209,7 @@ def test_eps_splits_hidden_peak(self):
         """
         input_array = np.array([0, 0, 3, 4, 4, 6, 26, 28, 28, 29, 32, 32], dtype=int)
         split_eps = np.array([21], dtype=int)
-        npt.assert_array_equal(HUDC._cluster_forks(input_array, 3), split_eps)
+        npt.assert_array_equal(UDBSCANxH._fork_epsilon(input_array, 3), split_eps)
 
     @pytest.mark.parametrize("nested,flat",
                              [([(1, 3), [(6, 9), (11, 11)]],
@@ -224,7 +224,7 @@ def test_flatten_list(self, nested, flat):
         Tests for hidden method _flatten_list.
         Method _flatten_list returns a generator which should be coerced to a list.
         """
-        assert list(HUDC._flatten_list(nested)) == flat
+        assert list(UDBSCANxH._flatten_list(nested)) == flat
 
     def test_hudc(self):
         """Test for method hudc"""
@@ -237,18 +237,18 @@ def test_hudc(self):
                                 1080, 1094, 1094, 1110, 1110, 1113, 1113, 1183, 1189, 1200, 1200,
                                 1217, 1234, 1234, 1591, 1620, 1620, 1662, 1686, 1707, 1755, 1828,
                                 1828, 1848, 1848, 1848, 1848, 1851, 1851, 1852, 1917], dtype=int)
-        answer_slices = np.fromiter([(0, 55), (56, 80), (83, 97)], dtype=HUDC._DTYPE_SLICE)
-        npt.assert_array_equal(HUDC.hudc(input_array, 10, max_eps=200, min_eps=10), answer_slices)
+        answer_slices = np.fromiter([(0, 55), (56, 80), (83, 97)], dtype=UDBSCANxH._DTYPE_SLICE)
+        npt.assert_array_equal(UDBSCANxH.udbscanxh(input_array, 10, max_eps=200, min_eps=10), answer_slices)
 
     @pytest.mark.parametrize("array,slices",
                              [(np.array([], dtype=int),
-                               np.array([], dtype=UDC._DTYPE_SLICE)),
+                               np.array([], dtype=UDBSCANx._DTYPE_SLICE)),
                               (np.array([1], dtype=int),
-                               np.array([], dtype=UDC._DTYPE_SLICE)),
+                               np.array([], dtype=UDBSCANx._DTYPE_SLICE)),
                               (np.array([1, 2, 3, 4], dtype=int),
-                               np.array([], dtype=UDC._DTYPE_SLICE)),
+                               np.array([], dtype=UDBSCANx._DTYPE_SLICE)),
                               (np.array([1, 2, 3, 4, 5], dtype=int),
-                               np.array([(0, 5)], dtype=UDC._DTYPE_SLICE))
+                               np.array([(0, 5)], dtype=UDBSCANx._DTYPE_SLICE))
                               ],
                              ids=['0<n', '1<n', '4<n', '5==n'])
     def test_hudc_few_reads(self, array, slices):
@@ -257,7 +257,7 @@ def test_hudc_few_reads(self, array, slices):
         Method hudc should correctly handle an array of length 0 or greater.
         If the length of the array is less than 'n' then an empty array will always be returned.
         """
-        npt.assert_array_equal(HUDC.hudc(array, 5, 5), slices)
+        npt.assert_array_equal(UDBSCANxH.udbscanxh(array, 5, 5), slices)
 
     def test_fit(self):
         """
@@ -266,7 +266,7 @@ def test_fit(self):
         New copy of points should be sorted.
         Most edge cases should be caught in tests for component methods.
         """
-        hudc_object = HUDC(10, max_eps=200, min_eps=10)
+        hudc_object = UDBSCANxH(10, max_eps=200, min_eps=10)
         input_array = np.array([   0,    0,   60,   61,   61,   61,   76,   78,  122,  122,  141,
                                  183,  251,  260,  260,  263,  263,  267,  267,  288,  288,  295,
                                  300,  310,  310,  317,  317,  334,  334,  335,  338,  338,  338,
@@ -278,7 +278,7 @@ def test_fit(self):
                                 1828, 1848, 1848, 1848, 1848, 1851, 1851, 1852, 1917], dtype=int)
         answer_array = np.array(input_array, copy=True)
 
-        answer_slices = np.fromiter([(0, 55), (56, 80), (83, 97)], dtype=HUDC._DTYPE_SLICE)
+        answer_slices = np.fromiter([(0, 55), (56, 80), (83, 97)], dtype=UDBSCANxH._DTYPE_SLICE)
         hudc_object.fit(input_array)
         assert hudc_object.input_array is not input_array
         npt.assert_array_equal(hudc_object.input_array, answer_array)

From 5667e58682320e56d82fa977b62d8160fd26b8c0 Mon Sep 17 00:00:00 2001
From: Tim Millar <tim.millar@plantandfood.co.nz>
Date: Tue, 27 Jun 2017 11:05:12 +1200
Subject: [PATCH 4/9] update clustring docstrings/variables to reflect
 terminology of Campello 2015

---
 tefingerprint/cluster.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/tefingerprint/cluster.py b/tefingerprint/cluster.py
index 7bfa906..ee0a066 100644
--- a/tefingerprint/cluster.py
+++ b/tefingerprint/cluster.py
@@ -18,10 +18,10 @@ class UDBSCANx(object):
     Sets of n points with a range equal to, or less than epsilon are classified as subclusters.
     Overlapping subclusters are then merged together to form clusters.
 
-    :param n: The minimum number of points allowed in each (sub)cluster
-    :type n: int
-    :param eps: The maximum distance allowed among each set of n points
-    :type eps: int
+    :param min_points: The minimum number of points allowed in each (sub)cluster
+    :type min_points: int
+    :param epsilon: The maximum distance allowed among each set of n points
+    :type epsilon: int
     """
 
     _DTYPE_SLICE = np.dtype([('lower', np.int64), ('upper', np.int64)])
@@ -426,7 +426,9 @@ def udbscanxh(array, min_points, max_eps=None, min_eps=None):
                 points['core_dist'] = np.maximum(min_eps, points['core_dist'])
 
             # initial splits based on the specified maximum epsilon
-            child_points = (points[left:right] for left, right in UDBSCANxH._cluster(points['value'], max_eps, min_points))
+            child_points = (points[left:right] for left, right in UDBSCANxH._cluster(points['value'],
+                                                                                     max_eps,
+                                                                                     min_points))
 
             # recursively run on all clusters
             clusters = [UDBSCANxH._traverse_hudc_tree(points, max_eps, min_points) for points in child_points]

From f19f6b3e62f50d6a36edae221f1d0eb0e5eec81f Mon Sep 17 00:00:00 2001
From: Tim Millar <tim.millar@plantandfood.co.nz>
Date: Tue, 27 Jun 2017 13:25:09 +1200
Subject: [PATCH 5/9] update tests for UDBSCANxH._fork_epsilon

---
 tests/test_cluster.py | 86 ++++++++++++++++++++-----------------------
 1 file changed, 40 insertions(+), 46 deletions(-)

diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index bd1b2d0..7c2aef1 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -7,7 +7,7 @@
 
 class TestUDC:
     """
-    Tests for class FlatUnivariateDensityCluster.
+    Tests for class UDBSCANx.
     """
     @pytest.mark.parametrize("array,answer",
                              [(np.array([0, 0, 0, 3, 4, 7, 99], dtype=int),
@@ -88,9 +88,9 @@ def test_flat_cluster(self):
                                np.array([(0, 5)], dtype=UDBSCANx._DTYPE_SLICE))
                               ],
                              ids=['0<n', '1<n', '4<n', '5==n'])
-    def test_udc_few_reads(self, array, slices):
+    def test_udbscanx_few_reads(self, array, slices):
         """
-        Test for method udc with small arrays.
+        Test for method udbscanx with small arrays.
         Method udc should correctly handle an array of length 0 or greater.
         If the length of the array is less than 'n' then an empty array will always be returned.
         """
@@ -103,38 +103,38 @@ def test_fit(self):
         New copy of points should be sorted.
         Most edge cases should be caught in tests for component methods.
         """
-        udc_object = UDBSCANx(4, 5)
+        model = UDBSCANx(4, 5)
         input_array = np.array([1, 2, 21, 22, 22, 22, 24, 38, 54, 54, 55, 56, 65, 65, 66, 67, 68, 90],
                                dtype=int)
         answer_array = np.array([1, 2, 21, 22, 22, 22, 24, 38, 54, 54, 55, 56, 65, 65, 66, 67, 68, 90],
                                 dtype=int)
         answer_slices = np.array([(2, 7), (8, 12), (12, 17)],
                                  dtype=UDBSCANx._DTYPE_SLICE)
-        udc_object.fit(input_array)
-        assert udc_object.input_array is not input_array
-        npt.assert_array_equal(udc_object.input_array, answer_array)
-        npt.assert_array_equal(udc_object.slices, answer_slices)
+        model.fit(input_array)
+        assert model.input_array is not input_array
+        npt.assert_array_equal(model.input_array, answer_array)
+        npt.assert_array_equal(model.slices, answer_slices)
 
     def test_clusters(self):
         """
         Test for method clusters.
         """
-        udc_object = UDBSCANx(4, 5)
+        model = UDBSCANx(4, 5)
         input_array = np.array([1, 2, 21, 22, 22, 22, 24, 38, 54, 54, 55, 56, 65, 65, 66, 67, 68, 90], dtype=int)
         cluster_arrays = [np.array([21, 22, 22, 22, 24], dtype=int),
                           np.array([54, 54, 55, 56], dtype=int),
                           np.array([65, 65, 66, 67, 68], dtype=int)]
-        udc_object.fit(input_array)
-        for result, answer in zip(udc_object.clusters(), cluster_arrays):
+        model.fit(input_array)
+        for result, answer in zip(model.clusters(), cluster_arrays):
             npt.assert_array_equal(result, answer)
 
     def test_cluster_extremities(self):
         """Test for method cluster_extremities"""
-        udc_object = UDBSCANx(4, 5)
+        model = UDBSCANx(4, 5)
         input_array = np.array([1, 2, 21, 22, 22, 22, 24, 38, 54, 54, 55, 56, 65, 65, 66, 67, 68, 90], dtype=int)
         cluster_extremities = [(21, 24), (54, 56), (65, 68)]
-        udc_object.fit(input_array)
-        for result, answer in zip(udc_object.cluster_extremities(), cluster_extremities):
+        model.fit(input_array)
+        for result, answer in zip(model.cluster_extremities(), cluster_extremities):
             assert result == answer
 
     def test_labels(self):
@@ -148,12 +148,12 @@ def test_labels(self):
 
 class TestHUDC:
     """
-    Tests for class HierarchicalUnivariateDensityCluster.
+    Tests for class UDBSCANxH.
     """
-    def test_point_eps(self):
+    def test_core_distances(self):
         """
-        Test for hidden method _point_eps.
-        Data is derived from real edge case containing "hidden" eps peaks.
+        Test for hidden method _core_distances.
+        Data is derived from real edge case.
         """
         input_array = np.array([   0,    0,   60,   61,   61,   61,   76,   78,  122,  122,  141,
                                  183,  251,  260,  260,  263,  263,  267,  267,  288,  288,  295,
@@ -184,32 +184,26 @@ def test_point_eps_hidden_peak(self):
         point_eps = np.array([3, 3, 1, 1, 1, 2, 2, 1, 1, 1, 3, 3], dtype=int)
         npt.assert_array_equal(UDBSCANxH._core_distances(input_array, 3), point_eps)
 
-    def test_eps_splits(self):
-        """
-        Test for hidden method _eps_splits.
-        Data is derived from real edge case containing "hidden" eps peaks.
-        """
-        input_array = np.array([   0,    0,   60,   61,   61,   61,   76,   78,  122,  122,  141,
-                                 183,  251,  260,  260,  263,  263,  267,  267,  288,  288,  295,
-                                 300,  310,  310,  317,  317,  334,  334,  335,  338,  338,  338,
-                                 338,  340,  342,  342,  344,  344,  358,  367,  370,  370,  377,
-                                 387,  402,  403,  410,  410,  410,  418,  418,  424,  424,  577,
-                                 857,  879,  921,  921, 1007, 1031, 1051, 1051, 1059, 1071, 1071,
-                                1080, 1094, 1094, 1110, 1110, 1113, 1113, 1183, 1189, 1200, 1200,
-                                1217, 1234, 1234, 1591, 1620, 1620, 1662, 1686, 1707, 1755, 1828,
-                                1828, 1848, 1848, 1848, 1848, 1851, 1851, 1852, 1917], dtype=int)
-        split_eps = np.array([122,  42, 453, 436], dtype=int)
-        npt.assert_array_equal(UDBSCANxH._fork_epsilon(input_array, 10), split_eps)
-
-    def test_eps_splits_hidden_peak(self):
+    @pytest.mark.parametrize("array,min_points,answer",
+                             [(np.array([], dtype=int), 3, None),
+                              (np.array([1], dtype=int), 3, None),
+                              (np.array([0, 4, 6, 7, 7, 7, 7, 7, 8, 10, 20], dtype=int), 3, None),
+                              (np.array([0, 0, 3, 4, 4, 6, 26, 28, 28, 29, 32, 32], dtype=int), 3, 22),
+                              (np.array([   0,    0,   60,   61,   61,   61,   76,   78,  122,  122,  141,
+                                          183,  251,  260,  260,  263,  263,  267,  267,  288,  288,  295,
+                                          300,  310,  310,  317,  317,  334,  334,  335,  338,  338,  338,
+                                          338,  340,  342,  342,  344,  344,  358,  367,  370,  370,  377,
+                                          387,  402,  403,  410,  410,  410,  418,  418,  424,  424,  577,
+                                          857,  879,  921,  921, 1007, 1031, 1051, 1051, 1059, 1071, 1071,
+                                         1080, 1094, 1094, 1110, 1110, 1113, 1113, 1183, 1189, 1200, 1200,
+                                         1217, 1234, 1234, 1591, 1620, 1620, 1662, 1686, 1707, 1755, 1828,
+                                         1828, 1848, 1848, 1848, 1848, 1851, 1851, 1852, 1917], dtype=int), 10, 454)],
+                             ids=['empty', 'short', 'none', 'hidden', 'real'])
+    def test_fork_epsilon(self, array, min_points, answer):
         """
-        Test for hidden method _eps_splits.
-        Data contains a single "hidden" eps peak between values 6 and 26
-        (values themselves have low eps but a large gap between them).
+        Test for hidden method _fork_epsilon.
         """
-        input_array = np.array([0, 0, 3, 4, 4, 6, 26, 28, 28, 29, 32, 32], dtype=int)
-        split_eps = np.array([21], dtype=int)
-        npt.assert_array_equal(UDBSCANxH._fork_epsilon(input_array, 3), split_eps)
+        assert UDBSCANxH._fork_epsilon(array, min_points) == answer
 
     @pytest.mark.parametrize("nested,flat",
                              [([(1, 3), [(6, 9), (11, 11)]],
@@ -226,8 +220,8 @@ def test_flatten_list(self, nested, flat):
         """
         assert list(UDBSCANxH._flatten_list(nested)) == flat
 
-    def test_hudc(self):
-        """Test for method hudc"""
+    def test_udbscanxh(self):
+        """Test for method udbscanxh"""
         input_array = np.array([   0,    0,   60,   61,   61,   61,   76,   78,  122,  122,  141,
                                  183,  251,  260,  260,  263,  263,  267,  267,  288,  288,  295,
                                  300,  310,  310,  317,  317,  334,  334,  335,  338,  338,  338,
@@ -251,9 +245,9 @@ def test_hudc(self):
                                np.array([(0, 5)], dtype=UDBSCANx._DTYPE_SLICE))
                               ],
                              ids=['0<n', '1<n', '4<n', '5==n'])
-    def test_hudc_few_reads(self, array, slices):
+    def test_udbscanxh_few_reads(self, array, slices):
         """
-        Test for method hudc with small arrays.
+        Test for method udbscanxh with small arrays.
         Method hudc should correctly handle an array of length 0 or greater.
         If the length of the array is less than 'n' then an empty array will always be returned.
         """

From 5ec9a77ffd69527f9c652d09d0034c69b958a3f1 Mon Sep 17 00:00:00 2001
From: Tim Millar <tim.millar@plantandfood.co.nz>
Date: Tue, 27 Jun 2017 16:03:51 +1200
Subject: [PATCH 6/9] update tests for udbscanxh including negative child-area
 bug

---
 tests/test_cluster.py | 38 ++++++++++++++++++--------------------
 1 file changed, 18 insertions(+), 20 deletions(-)

diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index 7c2aef1..e1a49a9 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -220,7 +220,24 @@ def test_flatten_list(self, nested, flat):
         """
         assert list(UDBSCANxH._flatten_list(nested)) == flat
 
-    def test_udbscanxh(self):
+    @pytest.mark.parametrize("array,min_points,max_eps,answer",
+                             [(np.array([], dtype=int), 5, 5, np.array([], dtype=UDBSCANx._DTYPE_SLICE)),
+                              (np.array([1], dtype=int), 5, 5, np.array([], dtype=UDBSCANx._DTYPE_SLICE)),
+                              (np.array([1, 2, 3, 4], dtype=int), 5, 5, np.array([], dtype=UDBSCANx._DTYPE_SLICE)),
+                              (np.array([1, 2, 3, 4, 5], dtype=int), 5, 5, np.array([(0, 5)], dtype=UDBSCANx._DTYPE_SLICE)),
+                              (np.array([0, 2, 3, 7, 8, 12, 13, 17], dtype=int), 3, 5, np.array([(0, 8)], dtype=UDBSCANx._DTYPE_SLICE)),
+                              (np.array([0, 2, 3, 7, 8, 12, 13, 16], dtype=int), 3, 5, np.array([(0, 3), (5, 8)], dtype=UDBSCANx._DTYPE_SLICE)),
+                              (np.array([0, 1, 2, 7, 8, 13, 14, 16, 21, 23, 28, 30], dtype=int), 3, 8, np.array([(0, 3), (5, 8)], dtype=UDBSCANx._DTYPE_SLICE))],
+                             ids=['empty', 'single', 'p < min-points', 'p == min-points', 'parent', 'children', '-ve child area'])
+    def test_udbscanxh(self, array, min_points, max_eps, answer):
+        """
+        Test for method udbscanxh with small arrays.
+        Method hudc should correctly handle an array of length 0 or greater.
+        If the length of the array is less than 'n' then an empty array will always be returned.
+        """
+        npt.assert_array_equal(UDBSCANxH.udbscanxh(array, min_points, max_eps), answer)
+
+    def test_udbscanxh_real(self):
         """Test for method udbscanxh"""
         input_array = np.array([   0,    0,   60,   61,   61,   61,   76,   78,  122,  122,  141,
                                  183,  251,  260,  260,  263,  263,  267,  267,  288,  288,  295,
@@ -234,25 +251,6 @@ def test_udbscanxh(self):
         answer_slices = np.fromiter([(0, 55), (56, 80), (83, 97)], dtype=UDBSCANxH._DTYPE_SLICE)
         npt.assert_array_equal(UDBSCANxH.udbscanxh(input_array, 10, max_eps=200, min_eps=10), answer_slices)
 
-    @pytest.mark.parametrize("array,slices",
-                             [(np.array([], dtype=int),
-                               np.array([], dtype=UDBSCANx._DTYPE_SLICE)),
-                              (np.array([1], dtype=int),
-                               np.array([], dtype=UDBSCANx._DTYPE_SLICE)),
-                              (np.array([1, 2, 3, 4], dtype=int),
-                               np.array([], dtype=UDBSCANx._DTYPE_SLICE)),
-                              (np.array([1, 2, 3, 4, 5], dtype=int),
-                               np.array([(0, 5)], dtype=UDBSCANx._DTYPE_SLICE))
-                              ],
-                             ids=['0<n', '1<n', '4<n', '5==n'])
-    def test_udbscanxh_few_reads(self, array, slices):
-        """
-        Test for method udbscanxh with small arrays.
-        Method hudc should correctly handle an array of length 0 or greater.
-        If the length of the array is less than 'n' then an empty array will always be returned.
-        """
-        npt.assert_array_equal(UDBSCANxH.udbscanxh(array, 5, 5), slices)
-
     def test_fit(self):
         """
         Test to run class via the method fit.

From e47730f947ed07323dc684cb21b58bcb18a0afe7 Mon Sep 17 00:00:00 2001
From: Tim Millar <tim.millar@plantandfood.co.nz>
Date: Tue, 27 Jun 2017 17:34:52 +1200
Subject: [PATCH 7/9] default selection of starting epsilon value matches
 Campello et al 2015

---
 tefingerprint/cluster.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/tefingerprint/cluster.py b/tefingerprint/cluster.py
index ee0a066..d3e1aef 100644
--- a/tefingerprint/cluster.py
+++ b/tefingerprint/cluster.py
@@ -419,19 +419,18 @@ def udbscanxh(array, min_points, max_eps=None, min_eps=None):
             points['index'] = np.arange(len(array), dtype=int)
             points['core_dist'] = UDBSCANxH._core_distances(array, min_points)
             if not max_eps:
-                # start at highest observed eps
-                max_eps = np.max(points['core_dist'])
+                # start at first fork, i.e. bellow root node as in HDBSCAN* (Campello 2015)
+                max_eps = UDBSCANxH._fork_epsilon(points['value'], min_points) - 1
             if min_eps:
                 # overwrite lower eps
                 points['core_dist'] = np.maximum(min_eps, points['core_dist'])
 
-            # initial splits based on the specified maximum epsilon
-            child_points = (points[left:right] for left, right in UDBSCANxH._cluster(points['value'],
-                                                                                     max_eps,
-                                                                                     min_points))
-
+            # initial splits based on the specified max_eps
+            initial_cluster_bounds = UDBSCANxH._cluster(points['value'], max_eps, min_points)
+            child_points = (points[left:right] for left, right in initial_cluster_bounds)
             # recursively run on all clusters
-            clusters = [UDBSCANxH._traverse_hudc_tree(points, max_eps, min_points) for points in child_points]
+            # initialise with max_eps + 1 to ensure points with core_distance == max_eps are counted
+            clusters = [UDBSCANxH._traverse_hudc_tree(points, max_eps + 1, min_points) for points in child_points]
             return np.fromiter(UDBSCANxH._flatten_list(clusters), dtype=UDBSCANx._DTYPE_SLICE)
 
     def fit(self, array):

From 38f0c79561602ea5bf92f313f685ca1c19574372 Mon Sep 17 00:00:00 2001
From: Tim Millar <tim.millar@plantandfood.co.nz>
Date: Tue, 27 Jun 2017 17:35:14 +1200
Subject: [PATCH 8/9] stand alone tests for clustering edge cases

---
 tests/test_cluster.py | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index e1a49a9..34ef3ae 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -226,9 +226,8 @@ def test_flatten_list(self, nested, flat):
                               (np.array([1, 2, 3, 4], dtype=int), 5, 5, np.array([], dtype=UDBSCANx._DTYPE_SLICE)),
                               (np.array([1, 2, 3, 4, 5], dtype=int), 5, 5, np.array([(0, 5)], dtype=UDBSCANx._DTYPE_SLICE)),
                               (np.array([0, 2, 3, 7, 8, 12, 13, 17], dtype=int), 3, 5, np.array([(0, 8)], dtype=UDBSCANx._DTYPE_SLICE)),
-                              (np.array([0, 2, 3, 7, 8, 12, 13, 16], dtype=int), 3, 5, np.array([(0, 3), (5, 8)], dtype=UDBSCANx._DTYPE_SLICE)),
-                              (np.array([0, 1, 2, 7, 8, 13, 14, 16, 21, 23, 28, 30], dtype=int), 3, 8, np.array([(0, 3), (5, 8)], dtype=UDBSCANx._DTYPE_SLICE))],
-                             ids=['empty', 'single', 'p < min-points', 'p == min-points', 'parent', 'children', '-ve child area'])
+                              (np.array([0, 2, 3, 7, 8, 12, 13, 16], dtype=int), 3, 5, np.array([(0, 3), (5, 8)], dtype=UDBSCANx._DTYPE_SLICE))],
+                             ids=['empty', 'single', 'p < min-points', 'p == min-points', 'parent', 'children'])
     def test_udbscanxh(self, array, min_points, max_eps, answer):
         """
         Test for method udbscanxh with small arrays.
@@ -237,6 +236,33 @@ def test_udbscanxh(self, array, min_points, max_eps, answer):
         """
         npt.assert_array_equal(UDBSCANxH.udbscanxh(array, min_points, max_eps), answer)
 
+    def test_udbscanxh_child_support_calculation(self):
+        """
+        Test for method udbscanxh.
+        Tests that the support for child clusters is calculated correctly.
+
+        If child support is incorrectly calculated as the sum(epsilon_minimum - core_distances) then core_distances
+        larger than epsilon_minimum (of the parent cluster) lead to a negative value which lowers the calculated
+        child support and results in the selection of the parent cluster (0, 12).
+        If child support is correctly calculated as the sum(maximum(0, (epsilon_minimum - core_distances))) then
+        points with core_distance larger than epsilon_minimum (i.e. those not present in the child clusters) don't
+        affect calculated support and the child clusters are selected.
+        """
+        array = np.array([0, 1, 2, 7, 8, 13, 14, 16, 21, 23, 28, 30], dtype=int)
+        answer = np.fromiter([(0, 3), (5, 8)], dtype=UDBSCANxH._DTYPE_SLICE)
+        npt.assert_array_equal(UDBSCANxH.udbscanxh(array, 3), answer)
+
+    def test_udbscanxh_default_max_eps(self):
+        """
+        Test for method udbscanxh.
+        Tests that the correct value of max_eps is selected if none is specified.
+        By default it should exclude the root node of the cluster tree.
+
+        """
+        array = np.array([0, 1, 2, 3, 4, 5, 7, 9, 10, 11, 12, 13, 14], dtype=int)
+        answer = np.fromiter([(0, 6), (7, 13)], dtype=UDBSCANxH._DTYPE_SLICE)
+        npt.assert_array_equal(UDBSCANxH.udbscanxh(array, 3), answer)
+
     def test_udbscanxh_real(self):
         """Test for method udbscanxh"""
         input_array = np.array([   0,    0,   60,   61,   61,   61,   76,   78,  122,  122,  141,

From 1ce01b9f47a77026fb25875d7549c2439f693cc9 Mon Sep 17 00:00:00 2001
From: Tim Millar <tim.millar@plantandfood.co.nz>
Date: Tue, 27 Jun 2017 17:37:28 +1200
Subject: [PATCH 9/9] bump version for minor bug fix

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 56d0df3..99db0a6 100644
--- a/setup.py
+++ b/setup.py
@@ -9,7 +9,7 @@ def read_file(file_name):
 
 
 setup(name='tefingerprint',
-      version='0.1.0',
+      version='0.1.1',
       author='Tim Millar',
       author_email='tim.millar@plantandfood.co.nz',
       url='https://github.com/PlantandFoodResearch/TEFingerprint',