From 981d82436b60eb212c933eb1a0ca0b9b5a7d2380 Mon Sep 17 00:00:00 2001 From: Casey Brooks Date: Fri, 26 Dec 2025 03:14:13 +0000 Subject: [PATCH] fix(cluster): normalize optics min_samples --- doc/whats_new/v0.22.rst | 6 +++ sklearn/cluster/optics_.py | 79 +++++++++++++++++++--------- sklearn/cluster/tests/test_optics.py | 71 ++++++++++++++++++++++++- 3 files changed, 131 insertions(+), 25 deletions(-) diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst index 93635d88069d5..5ade3cef3b276 100644 --- a/doc/whats_new/v0.22.rst +++ b/doc/whats_new/v0.22.rst @@ -216,6 +216,12 @@ Changelog match `spectral_clustering`. :pr:`13726` by :user:`Shuzhe Xiao `. +- |Fix| :class:`cluster.OPTICS` now coerces fractional ``min_samples`` and + ``min_cluster_size`` values to integer neighbor counts, preventing + ``NearestNeighbors`` from receiving floats and raising a ``TypeError`` when + ``min_samples`` is provided as a fraction. + :issue:`41` by :user:`Casey Brooks `. + :mod:`sklearn.feature_selection` ................................ - |Fix| Fixed a bug where :class:`VarianceThreshold` with `threshold=0` did not diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py index 4f7eb11ab2f72..77950381c40c4 100755 --- a/sklearn/cluster/optics_.py +++ b/sklearn/cluster/optics_.py @@ -11,7 +11,9 @@ License: BSD 3 clause """ +import math import warnings + import numpy as np from ..utils import check_array @@ -47,9 +49,9 @@ class OPTICS(BaseEstimator, ClusterMixin): min_samples : int > 1 or float between 0 and 1 (default=None) The number of samples in a neighborhood for a point to be considered as a core point. Also, up and down steep regions can't have more then - ``min_samples`` consecutive non-steep points. Expressed as an absolute - number or a fraction of the number of samples (rounded to be at least - 2). + ``min_samples`` consecutive non-steep points. If given as a fraction, + it is converted to ``ceil(min_samples * n_samples)`` and clipped to be + at least 2. 
max_eps : float, optional (default=np.inf) The maximum distance between two samples for one to be considered as @@ -115,9 +117,10 @@ class OPTICS(BaseEstimator, ClusterMixin): min_cluster_size : int > 1 or float between 0 and 1 (default=None) Minimum number of samples in an OPTICS cluster, expressed as an - absolute number or a fraction of the number of samples (rounded to be - at least 2). If ``None``, the value of ``min_samples`` is used instead. - Used only when ``cluster_method='xi'``. + absolute number or a fraction of the number of samples. Fractions are + converted to ``ceil(min_cluster_size * n_samples)`` with a minimum of + 2. If ``None``, the normalized value of ``min_samples`` is used + instead. Used only when ``cluster_method='xi'``. algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional Algorithm used to compute the nearest neighbors: @@ -166,6 +169,13 @@ class OPTICS(BaseEstimator, ClusterMixin): Point that a sample was reached from, indexed by object order. Seed points have a predecessor of -1. + min_samples_ : int + Normalized number of samples used when querying nearest neighbors. + + min_cluster_size_ : int + Normalized cluster size used by the Xi extraction method. Equal to + ``min_samples_`` when ``min_cluster_size`` is ``None``. + cluster_hierarchy_ : array, shape (n_clusters, 2) The list of clusters in the form of ``[start, end]`` in each row, with all indices inclusive. 
The clusters are ordered according to @@ -240,9 +250,21 @@ def fit(self, X, y=None): " 'dbscan' or 'xi' but is %s" % self.cluster_method) + n_samples = X.shape[0] + min_samples = _normalize_size(self.min_samples, n_samples, + 'min_samples') + if self.min_cluster_size is None: + min_cluster_size = min_samples + else: + min_cluster_size = _normalize_size(self.min_cluster_size, + n_samples, + 'min_cluster_size') + self.min_samples_ = min_samples + self.min_cluster_size_ = min_cluster_size + (self.ordering_, self.core_distances_, self.reachability_, self.predecessor_) = compute_optics_graph( - X=X, min_samples=self.min_samples, algorithm=self.algorithm, + X=X, min_samples=min_samples, algorithm=self.algorithm, leaf_size=self.leaf_size, metric=self.metric, metric_params=self.metric_params, p=self.p, n_jobs=self.n_jobs, max_eps=self.max_eps) @@ -253,8 +275,8 @@ def fit(self, X, y=None): self.reachability_, self.predecessor_, self.ordering_, - self.min_samples, - self.min_cluster_size, + min_samples, + min_cluster_size, self.xi, self.predecessor_correction) self.cluster_hierarchy_ = clusters_ @@ -290,6 +312,16 @@ def _validate_size(size, n_samples, param_name): (param_name, n_samples, size)) +def _normalize_size(size, n_samples, param_name): + """Validate and normalize a size parameter to an integer >= 2.""" + _validate_size(size, n_samples, param_name) + if size <= 1: + normalized = int(math.ceil(size * n_samples)) + else: + normalized = int(size) + return max(2, normalized) + + # OPTICS helper functions def _compute_core_distances_(X, neighbors, min_samples, working_memory): """Compute the k-th nearest neighbor of each sample @@ -343,8 +375,9 @@ def compute_optics_graph(X, min_samples, max_eps, metric, p, metric_params, min_samples : int (default=5) The number of samples in a neighborhood for a point to be considered - as a core point. Expressed as an absolute number or a fraction of the - number of samples (rounded to be at least 2). + as a core point. 
Expressed as an absolute number or as a fraction, + which is converted to ``ceil(min_samples * n_samples)`` and clipped to + be at least 2. max_eps : float, optional (default=np.inf) The maximum distance between two samples for one to be considered as @@ -435,9 +468,7 @@ def compute_optics_graph(X, min_samples, max_eps, metric, p, metric_params, structure." ACM SIGMOD Record 28, no. 2 (1999): 49-60. """ n_samples = X.shape[0] - _validate_size(min_samples, n_samples, 'min_samples') - if min_samples <= 1: - min_samples = max(2, min_samples * n_samples) + min_samples = _normalize_size(min_samples, n_samples, 'min_samples') # Start all points as 'unprocessed' ## reachability_ = np.empty(n_samples) @@ -585,13 +616,15 @@ def cluster_optics_xi(reachability, predecessor, ordering, min_samples, min_samples : int > 1 or float between 0 and 1 (default=None) The same as the min_samples given to OPTICS. Up and down steep regions can't have more then ``min_samples`` consecutive non-steep points. - Expressed as an absolute number or a fraction of the number of samples - (rounded to be at least 2). + Fractions are converted to ``ceil(min_samples * n_samples)`` and + clipped to be at least 2. min_cluster_size : int > 1 or float between 0 and 1 (default=None) Minimum number of samples in an OPTICS cluster, expressed as an - absolute number or a fraction of the number of samples (rounded to be - at least 2). If ``None``, the value of ``min_samples`` is used instead. + absolute number or a fraction of the number of samples. Fractions are + converted to ``ceil(min_cluster_size * n_samples)`` with a minimum of + 2. If ``None``, the normalized value of ``min_samples`` is used + instead. xi : float, between 0 and 1, optional (default=0.05) Determines the minimum steepness on the reachability plot that @@ -617,14 +650,12 @@ def cluster_optics_xi(reachability, predecessor, ordering, min_samples, np.unique(labels)``. 
""" n_samples = len(reachability) - _validate_size(min_samples, n_samples, 'min_samples') - if min_samples <= 1: - min_samples = max(2, min_samples * n_samples) + min_samples = _normalize_size(min_samples, n_samples, 'min_samples') if min_cluster_size is None: min_cluster_size = min_samples - _validate_size(min_cluster_size, n_samples, 'min_cluster_size') - if min_cluster_size <= 1: - min_cluster_size = max(2, min_cluster_size * n_samples) + else: + min_cluster_size = _normalize_size(min_cluster_size, n_samples, + 'min_cluster_size') clusters = _xi_cluster(reachability[ordering], predecessor[ordering], ordering, xi, diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py index d5f4d62ea61b6..333751cfc7ab2 100644 --- a/sklearn/cluster/tests/test_optics.py +++ b/sklearn/cluster/tests/test_optics.py @@ -8,7 +8,8 @@ from sklearn.datasets.samples_generator import make_blobs from sklearn.cluster.optics_ import (OPTICS, _extend_region, - _extract_xi_labels) + _extract_xi_labels, + cluster_optics_xi) from sklearn.metrics.cluster import contingency_matrix from sklearn.metrics.pairwise import pairwise_distances from sklearn.cluster.dbscan_ import DBSCAN @@ -179,6 +180,74 @@ def test_minimum_number_of_sample_check(): assert_raise_message(ValueError, msg, clust.fit, X) +def test_optics_min_samples_float_no_typeerror(): + X, _ = make_blobs(n_samples=200, centers=3, random_state=0) + + clust = OPTICS(min_samples=0.5) + clust.fit(X) + + assert clust.min_samples_ == 100 + assert clust.min_cluster_size_ == 100 + + +def test_optics_min_samples_fraction_rounding_ceil(): + X = np.arange(30, dtype=float).reshape(-1, 1) + + clust = OPTICS(min_samples=0.21) + clust.fit(X) + + # ceil(0.21 * 30) == 7 + assert clust.min_samples_ == 7 + + +def test_cluster_optics_xi_accepts_float_min_samples(): + X, _ = make_blobs(n_samples=40, centers=3, random_state=1) + optics = OPTICS(min_samples=5, min_cluster_size=6, xi=0.05) + optics.fit(X) + + labels_int, clusters_int 
= cluster_optics_xi( + optics.reachability_, + optics.predecessor_, + optics.ordering_, + 5, + 6, + optics.xi, + optics.predecessor_correction) + + labels_float, clusters_float = cluster_optics_xi( + optics.reachability_, + optics.predecessor_, + optics.ordering_, + 0.125, # ceil(0.125 * 40) == 5 + 0.15, # ceil(0.15 * 40) == 6 + optics.xi, + optics.predecessor_correction) + + assert_array_equal(labels_float, labels_int) + assert_array_equal(clusters_float, clusters_int) + + +@pytest.mark.parametrize("min_samples", [0, -1, 1.5]) +def test_optics_min_samples_invalid_values(min_samples): + msg = 'min_samples must be a positive integer or a float between 0 and 1' + + X, _ = make_blobs(n_samples=10, centers=1, random_state=2) + clust = OPTICS(min_samples=min_samples) + + assert_raise_message(ValueError, msg, clust.fit, X) + + +@pytest.mark.parametrize("min_cluster_size", [0, -2, 1.3]) +def test_optics_min_cluster_size_invalid_values(min_cluster_size): + msg = ('min_cluster_size must be a positive integer or a float between ' + '0 and 1') + + X, _ = make_blobs(n_samples=12, centers=1, random_state=3) + clust = OPTICS(min_samples=2, min_cluster_size=min_cluster_size) + + assert_raise_message(ValueError, msg, clust.fit, X) + + def test_bad_extract(): # Test an extraction of eps too close to original eps msg = "Specify an epsilon smaller than 0.15. Got 0.3."