Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions doc/whats_new/v0.22.rst
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,12 @@ Changelog
match `spectral_clustering`.
:pr:`13726` by :user:`Shuzhe Xiao <fdas3213>`.

- |Fix| :class:`cluster.OPTICS` now converts fractional ``min_samples`` and
``min_cluster_size`` values to integer sample counts before they are used.
Previously a fractional ``min_samples`` was forwarded to ``NearestNeighbors``
as a float, which raised a ``TypeError``.
:issue:`41` by :user:`Casey Brooks <caseybrooks>`.

:mod:`sklearn.feature_selection`
................................
- |Fix| Fixed a bug where :class:`VarianceThreshold` with `threshold=0` did not
Expand Down
79 changes: 55 additions & 24 deletions sklearn/cluster/optics_.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
License: BSD 3 clause
"""

import math
import warnings

import numpy as np

from ..utils import check_array
Expand Down Expand Up @@ -47,9 +49,9 @@ class OPTICS(BaseEstimator, ClusterMixin):
min_samples : int > 1 or float between 0 and 1 (default=None)
The number of samples in a neighborhood for a point to be considered as
a core point. Also, up and down steep regions can't have more than
``min_samples`` consecutive non-steep points. Expressed as an absolute
number or a fraction of the number of samples (rounded to be at least
2).
``min_samples`` consecutive non-steep points. If given as a fraction,
it is converted to ``ceil(min_samples * n_samples)`` and clipped to be
at least 2.

max_eps : float, optional (default=np.inf)
The maximum distance between two samples for one to be considered as
Expand Down Expand Up @@ -115,9 +117,10 @@ class OPTICS(BaseEstimator, ClusterMixin):

min_cluster_size : int > 1 or float between 0 and 1 (default=None)
Minimum number of samples in an OPTICS cluster, expressed as an
absolute number or a fraction of the number of samples (rounded to be
at least 2). If ``None``, the value of ``min_samples`` is used instead.
Used only when ``cluster_method='xi'``.
absolute number or a fraction of the number of samples. Fractions are
converted to ``ceil(min_cluster_size * n_samples)`` with a minimum of
2. If ``None``, the normalized value of ``min_samples`` is used
instead. Used only when ``cluster_method='xi'``.

algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
Algorithm used to compute the nearest neighbors:
Expand Down Expand Up @@ -166,6 +169,13 @@ class OPTICS(BaseEstimator, ClusterMixin):
Point that a sample was reached from, indexed by object order.
Seed points have a predecessor of -1.

min_samples_ : int
Normalized number of samples used when querying nearest neighbors.

min_cluster_size_ : int
Normalized cluster size used by the Xi extraction method. Equal to
``min_samples_`` when ``min_cluster_size`` is ``None``.

cluster_hierarchy_ : array, shape (n_clusters, 2)
The list of clusters in the form of ``[start, end]`` in each row, with
all indices inclusive. The clusters are ordered according to
Expand Down Expand Up @@ -240,9 +250,21 @@ def fit(self, X, y=None):
" 'dbscan' or 'xi' but is %s" %
self.cluster_method)

n_samples = X.shape[0]
min_samples = _normalize_size(self.min_samples, n_samples,
'min_samples')
if self.min_cluster_size is None:
min_cluster_size = min_samples
else:
min_cluster_size = _normalize_size(self.min_cluster_size,
n_samples,
'min_cluster_size')
self.min_samples_ = min_samples
self.min_cluster_size_ = min_cluster_size

(self.ordering_, self.core_distances_, self.reachability_,
self.predecessor_) = compute_optics_graph(
X=X, min_samples=self.min_samples, algorithm=self.algorithm,
X=X, min_samples=min_samples, algorithm=self.algorithm,
leaf_size=self.leaf_size, metric=self.metric,
metric_params=self.metric_params, p=self.p, n_jobs=self.n_jobs,
max_eps=self.max_eps)
Expand All @@ -253,8 +275,8 @@ def fit(self, X, y=None):
self.reachability_,
self.predecessor_,
self.ordering_,
self.min_samples,
self.min_cluster_size,
min_samples,
min_cluster_size,
self.xi,
self.predecessor_correction)
self.cluster_hierarchy_ = clusters_
Expand Down Expand Up @@ -290,6 +312,16 @@ def _validate_size(size, n_samples, param_name):
(param_name, n_samples, size))


def _normalize_size(size, n_samples, param_name):
    """Validate a size parameter and convert it to an integer count >= 2.

    Parameters
    ----------
    size : int or float
        Requested size. Values ``<= 1`` are interpreted as a fraction of
        ``n_samples``; larger values are interpreted as an absolute count.

    n_samples : int
        Number of samples that a fractional ``size`` is relative to.

    param_name : str
        Name of the parameter being normalized; forwarded to
        ``_validate_size``, which is expected to reject invalid values.

    Returns
    -------
    normalized : int
        ``ceil(size * n_samples)`` for fractional input, ``int(size)``
        otherwise, clipped from below at 2.
    """
    _validate_size(size, n_samples, param_name)
    if size <= 1:
        # Binary floating point can overshoot an exactly representable
        # product by one ulp (e.g. ``0.07 * 100 == 7.000000000000001``),
        # which a bare ``ceil`` would turn into 8 instead of 7. Rounding the
        # product to 8 decimals removes that representation noise while
        # leaving any genuine fractional part untouched.
        scaled = round(float(size) * n_samples, 8)
        normalized = int(math.ceil(scaled))
    else:
        normalized = int(size)
    # A neighborhood / cluster of fewer than two samples is not meaningful,
    # so very small fractions are lifted to the minimum of 2.
    return max(2, normalized)


# OPTICS helper functions
def _compute_core_distances_(X, neighbors, min_samples, working_memory):
"""Compute the k-th nearest neighbor of each sample
Expand Down Expand Up @@ -343,8 +375,9 @@ def compute_optics_graph(X, min_samples, max_eps, metric, p, metric_params,

min_samples : int (default=5)
The number of samples in a neighborhood for a point to be considered
as a core point. Expressed as an absolute number or a fraction of the
number of samples (rounded to be at least 2).
as a core point. Expressed as an absolute number or as a fraction of the
number of samples; a fraction is converted to
``ceil(min_samples * n_samples)`` and clipped to be at least 2.

max_eps : float, optional (default=np.inf)
The maximum distance between two samples for one to be considered as
Expand Down Expand Up @@ -435,9 +468,7 @@ def compute_optics_graph(X, min_samples, max_eps, metric, p, metric_params,
structure." ACM SIGMOD Record 28, no. 2 (1999): 49-60.
"""
n_samples = X.shape[0]
_validate_size(min_samples, n_samples, 'min_samples')
if min_samples <= 1:
min_samples = max(2, min_samples * n_samples)
min_samples = _normalize_size(min_samples, n_samples, 'min_samples')

# Start all points as 'unprocessed' ##
reachability_ = np.empty(n_samples)
Expand Down Expand Up @@ -585,13 +616,15 @@ def cluster_optics_xi(reachability, predecessor, ordering, min_samples,
min_samples : int > 1 or float between 0 and 1 (default=None)
The same as the min_samples given to OPTICS. Up and down steep regions
can't have more than ``min_samples`` consecutive non-steep points.
Expressed as an absolute number or a fraction of the number of samples
(rounded to be at least 2).
Fractions are converted to ``ceil(min_samples * n_samples)`` and
clipped to be at least 2.

min_cluster_size : int > 1 or float between 0 and 1 (default=None)
Minimum number of samples in an OPTICS cluster, expressed as an
absolute number or a fraction of the number of samples (rounded to be
at least 2). If ``None``, the value of ``min_samples`` is used instead.
absolute number or a fraction of the number of samples. Fractions are
converted to ``ceil(min_cluster_size * n_samples)`` with a minimum of
2. If ``None``, the normalized value of ``min_samples`` is used
instead.

xi : float, between 0 and 1, optional (default=0.05)
Determines the minimum steepness on the reachability plot that
Expand All @@ -617,14 +650,12 @@ def cluster_optics_xi(reachability, predecessor, ordering, min_samples,
np.unique(labels)``.
"""
n_samples = len(reachability)
_validate_size(min_samples, n_samples, 'min_samples')
if min_samples <= 1:
min_samples = max(2, min_samples * n_samples)
min_samples = _normalize_size(min_samples, n_samples, 'min_samples')
if min_cluster_size is None:
min_cluster_size = min_samples
_validate_size(min_cluster_size, n_samples, 'min_cluster_size')
if min_cluster_size <= 1:
min_cluster_size = max(2, min_cluster_size * n_samples)
else:
min_cluster_size = _normalize_size(min_cluster_size, n_samples,
'min_cluster_size')

clusters = _xi_cluster(reachability[ordering], predecessor[ordering],
ordering, xi,
Expand Down
71 changes: 70 additions & 1 deletion sklearn/cluster/tests/test_optics.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
from sklearn.datasets.samples_generator import make_blobs
from sklearn.cluster.optics_ import (OPTICS,
_extend_region,
_extract_xi_labels)
_extract_xi_labels,
cluster_optics_xi)
from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.cluster.dbscan_ import DBSCAN
Expand Down Expand Up @@ -179,6 +180,74 @@ def test_minimum_number_of_sample_check():
assert_raise_message(ValueError, msg, clust.fit, X)


def test_optics_min_samples_float_no_typeerror():
    # Fitting with a fractional ``min_samples`` must succeed, and the fitted
    # estimator must expose the derived integer sizes.
    data = make_blobs(n_samples=200, centers=3, random_state=0)[0]

    model = OPTICS(min_samples=0.5)
    model.fit(data)

    # Half of the 200 samples; ``min_cluster_size`` was left unset, so it is
    # expected to match ``min_samples_``.
    expected_size = 100
    assert model.min_samples_ == expected_size
    assert model.min_cluster_size_ == expected_size


def test_optics_min_samples_fraction_rounding_ceil():
    # 0.21 * 30 == 6.3 is not a whole number, so the fraction has to be
    # rounded up (ceil) to 7 rather than truncated to 6.
    points = np.arange(30, dtype=float).reshape(-1, 1)

    estimator = OPTICS(min_samples=0.21)
    estimator.fit(points)

    assert estimator.min_samples_ == 7


def test_cluster_optics_xi_accepts_float_min_samples():
    # Fractional sizes handed directly to ``cluster_optics_xi`` must produce
    # the same labels and cluster hierarchy as the equivalent absolute sizes.
    X, _ = make_blobs(n_samples=40, centers=3, random_state=1)
    fitted = OPTICS(min_samples=5, min_cluster_size=6, xi=0.05)
    fitted.fit(X)

    def extract(min_samples, min_cluster_size):
        # Re-run the Xi extraction on the fitted graph, varying only the
        # two size arguments.
        return cluster_optics_xi(
            fitted.reachability_,
            fitted.predecessor_,
            fitted.ordering_,
            min_samples,
            min_cluster_size,
            fitted.xi,
            fitted.predecessor_correction)

    expected_labels, expected_clusters = extract(5, 6)
    # ceil(0.125 * 40) == 5 and ceil(0.15 * 40) == 6
    actual_labels, actual_clusters = extract(0.125, 0.15)

    assert_array_equal(actual_labels, expected_labels)
    assert_array_equal(actual_clusters, expected_clusters)


@pytest.mark.parametrize("min_samples", [0, -1, 1.5])
def test_optics_min_samples_invalid_values(min_samples):
    # Zero, negative values and non-integers above 1 must all be rejected
    # when the estimator is fitted.
    expected_message = ('min_samples must be a positive integer or a float '
                        'between 0 and 1')
    data = make_blobs(n_samples=10, centers=1, random_state=2)[0]

    estimator = OPTICS(min_samples=min_samples)

    assert_raise_message(ValueError, expected_message, estimator.fit, data)


@pytest.mark.parametrize("min_cluster_size", [0, -2, 1.3])
def test_optics_min_cluster_size_invalid_values(min_cluster_size):
    # ``min_samples`` is kept valid so that the failure can only come from
    # the ``min_cluster_size`` value under test.
    expected_message = (
        'min_cluster_size must be a positive integer or a float between 0 '
        'and 1')
    data = make_blobs(n_samples=12, centers=1, random_state=3)[0]

    estimator = OPTICS(min_samples=2, min_cluster_size=min_cluster_size)

    assert_raise_message(ValueError, expected_message, estimator.fit, data)


def test_bad_extract():
# Test an extraction of eps too close to original eps
msg = "Specify an epsilon smaller than 0.15. Got 0.3."
Expand Down