Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Faster sparse reachability #416

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 85 additions & 39 deletions hdbscan/_hdbscan_reachability.pyx
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
# cython: boundscheck=False
# cython: wraparound=False
# cython: nonecheck=False
# cython: initializedcheck=False
# mutual reachability distance compiutations
# mutual reachability distance computations
# Authors: Leland McInnes
# License: 3-clause BSD

import numpy as np
cimport numpy as np

from numpy.math cimport INFINITY, isfinite
from scipy.spatial.distance import pdist, squareform
from scipy.sparse import lil_matrix as sparse_matrix
from sklearn.neighbors import KDTree, BallTree
import gc

Expand Down Expand Up @@ -59,44 +60,89 @@ def mutual_reachability(distance_matrix, min_points=5, alpha=1.0):
return result


cpdef sparse_mutual_reachability(object lil_matrix, np.intp_t min_points=5,
float alpha=1.0, float max_dist=0.):

cdef np.intp_t i
cdef np.intp_t j
cdef np.intp_t n
cdef np.double_t mr_dist
cdef list sorted_row_data
cdef np.ndarray[dtype=np.double_t, ndim=1] core_distance
cdef np.ndarray[dtype=np.int32_t, ndim=1] nz_row_data
cdef np.ndarray[dtype=np.int32_t, ndim=1] nz_col_data

result = sparse_matrix(lil_matrix.shape)
core_distance = np.empty(lil_matrix.shape[0], dtype=np.double)

for i in range(lil_matrix.shape[0]):
sorted_row_data = sorted(lil_matrix.data[i])
if min_points < len(sorted_row_data):
core_distance[i] = sorted_row_data[min_points]
cpdef sparse_mutual_reachability(object distance_matrix, np.intp_t min_points=5,
float alpha=1.0, float max_dist=0.0):
""" Compute mutual reachability for distance matrix. For best performance
pass distance_matrix in CSR form which will modify it in place and return
it without making a copy """

# tocsr() is a fast noop if distance_matrix is already CSR
D = distance_matrix.tocsr()
if D.shape != (D.shape[0], D.shape[0]):
raise Exception("Distance matrix must be 2D square shaped instead of {}".format(D.shape))
# Convert to smallest supported data type if necessary
if D.dtype not in (np.float32, np.float64):
if D.dtype <= np.dtype(np.float32):
D = D.astype(np.float32)
else:
core_distance[i] = np.infty

if alpha != 1.0:
lil_matrix = lil_matrix / alpha

nz_row_data, nz_col_data = lil_matrix.nonzero()

for n in range(nz_row_data.shape[0]):
i = nz_row_data[n]
j = nz_col_data[n]

mr_dist = max(core_distance[i], core_distance[j], lil_matrix[i, j])
if np.isfinite(mr_dist):
result[i, j] = mr_dist
elif max_dist > 0:
result[i, j] = max_dist

return result.tocsr()
D = D.astype(np.float64)

# Call typed function which modifies D in place
t = (D.data.dtype, D.indices.dtype, D.indptr.dtype)
if t == (np.float32, np.int32, np.int32):
sparse_mr_fast[np.float32_t, np.int32_t](D.data, D.indices, D.indptr,
min_points, alpha, max_dist)
elif t == (np.float32, np.int64, np.int64):
sparse_mr_fast[np.float32_t, np.int64_t](D.data, D.indices, D.indptr,
min_points, alpha, max_dist)
elif t == (np.float64, np.int32, np.int32):
sparse_mr_fast[np.float64_t, np.int32_t](D.data, D.indices, D.indptr,
min_points, alpha, max_dist)
elif t == (np.float64, np.int64, np.int64):
sparse_mr_fast[np.float64_t, np.int64_t](D.data, D.indices, D.indptr,
min_points, alpha, max_dist)
else:
raise Exception("Unsupported CSR format {}".format(t))

return D


ctypedef fused mr_indx_t:
np.int32_t
np.int64_t

ctypedef fused mr_data_t:
np.float32_t
np.float64_t

cdef sparse_mr_fast(np.ndarray[dtype=mr_data_t, ndim=1, mode='c'] data,
np.ndarray[dtype=mr_indx_t, ndim=1, mode='c'] indices,
np.ndarray[dtype=mr_indx_t, ndim=1, mode='c'] indptr,
mr_indx_t min_points,
mr_data_t alpha,
mr_data_t max_dist):
cdef mr_indx_t row, col, colptr
cdef mr_data_t mr_dist
cdef np.ndarray[dtype=mr_data_t, ndim=1, mode='c'] row_data
cdef np.ndarray[dtype=mr_data_t, ndim=1, mode='c'] core_distance

core_distance = np.empty(indptr.shape[0]-1, dtype=data.dtype)

for row in range(indptr.shape[0]-1):
row_data = data[indptr[row]:indptr[row+1]].copy()
if min_points < row_data.shape[0]:
# sort is faster for small arrays because of lower startup cost but
# partition has worst case O(n) runtime for larger ones.
# https://stackoverflow.com/questions/43588711/numpys-partition-slower-than-sort-for-small-arrays
if row_data.shape[0] > 200:
row_data.partition(min_points)
else:
row_data.sort()
core_distance[row] = row_data[min_points]
else:
core_distance[row] = INFINITY

if alpha != <mr_data_t>1.0:
data /= alpha

for row in range(indptr.shape[0]-1):
for colptr in range(indptr[row],indptr[row+1]):
col = indices[colptr]
mr_dist = max(core_distance[row], core_distance[col], data[colptr])
if isfinite(mr_dist):
data[colptr] = mr_dist
elif max_dist > 0:
data[colptr] = max_dist


def kdtree_mutual_reachability(X, distance_matrix, metric, p=2, min_points=5,
Expand Down
16 changes: 9 additions & 7 deletions hdbscan/hdbscan_.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,13 +78,16 @@ def _hdbscan_generic(X, min_samples=5, alpha=1.0, metric='minkowski', p=2,
# sklearn.metrics.pairwise_distances handle it,
# enables the usage of numpy.inf in the distance
# matrix to indicate missing distance information.
# TODO: Check if copying is necessary
distance_matrix = X.copy()
# Give the user the option to have the distance matrix
# modified to save memory.
if kwargs.get("overwrite", False):
distance_matrix = X
else:
distance_matrix = X.copy()
else:
distance_matrix = pairwise_distances(X, metric=metric, **kwargs)

if issparse(distance_matrix):
# raise TypeError('Sparse distance matrices not yet supported')
return _hdbscan_sparse_distance_matrix(distance_matrix, min_samples,
alpha, metric, p,
leaf_size, gen_min_span_tree,
Expand Down Expand Up @@ -141,12 +144,11 @@ def _hdbscan_sparse_distance_matrix(X, min_samples=5, alpha=1.0,
'relations connecting them\n'
'Run hdbscan on each component.')

lil_matrix = X.tolil()

# Compute sparse mutual reachability graph
# if max_dist > 0, max distance to use when the reachability is infinite
max_dist = kwargs.get("max_dist", 0.)
mutual_reachability_ = sparse_mutual_reachability(lil_matrix,
# sparse_mutual_reachability() may modify X in place and return it
mutual_reachability_ = sparse_mutual_reachability(X,
min_points=min_samples,
max_dist=max_dist,
alpha=alpha)
Expand All @@ -163,7 +165,7 @@ def _hdbscan_sparse_distance_matrix(X, min_samples=5, alpha=1.0,

# Compute the minimum spanning tree for the sparse graph
sparse_min_spanning_tree = csgraph.minimum_spanning_tree(
mutual_reachability_)
mutual_reachability_, overwrite=True)

# Convert the graph to scipy cluster array format
nonzeros = sparse_min_spanning_tree.nonzero()
Expand Down
6 changes: 6 additions & 0 deletions hdbscan/tests/test_hdbscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,12 @@ def test_hdbscan_sparse_distance_matrix():
n_clusters_2 = len(set(labels)) - int(-1 in labels)
assert_equal(n_clusters_2, n_clusters)

# Verify single and double precision return same results
assert_equal(D.dtype, np.double)
labels_double = hdbscan(D, metric='precomputed')[0]
labels_single = hdbscan(D.astype(np.single), metric='precomputed')[0]
assert_array_equal(labels_single, labels_double)


def test_hdbscan_feature_vector():
labels, p, persist, ctree, ltree, mtree = hdbscan(X)
Expand Down