agyn-sandbox · casey-brooks · Dec 26, 2025
diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
@@ -216,6 +216,12 @@ Changelog
   match `spectral_clustering`.
   :pr:`13726` by :user:`Shuzhe Xiao <fdas3213>`.
 
+- |Fix| :class:`cluster.OPTICS` now converts fractional ``min_samples`` and
+  ``min_cluster_size`` values to integer sample counts, so that
+  ``NearestNeighbors`` no longer receives a float and raises a ``TypeError``
+  when ``min_samples`` is given as a fraction.
+  :issue:`41` by :user:`Casey Brooks <caseybrooks>`.
+
 :mod:`sklearn.feature_selection`
 ................................
 - |Fix| Fixed a bug where :class:`VarianceThreshold` with `threshold=0` did not

diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py
@@ -11,7 +11,9 @@
 License: BSD 3 clause
 """
 
+import math
 import warnings
+
 import numpy as np
 
 from ..utils import check_array
@@ -47,9 +49,9 @@ class OPTICS(BaseEstimator, ClusterMixin):
     min_samples : int > 1 or float between 0 and 1 (default=None)
         The number of samples in a neighborhood for a point to be considered as
         a core point. Also, up and down steep regions can't have more then
-        ``min_samples`` consecutive non-steep points. Expressed as an absolute
-        number or a fraction of the number of samples (rounded to be at least
-        2).
+        ``min_samples`` consecutive non-steep points. If given as a fraction,
+        it is converted to ``ceil(min_samples * n_samples)`` and clipped to be
+        at least 2.
 
     max_eps : float, optional (default=np.inf)
         The maximum distance between two samples for one to be considered as
@@ -115,9 +117,10 @@ class OPTICS(BaseEstimator, ClusterMixin):
 
     min_cluster_size : int > 1 or float between 0 and 1 (default=None)
         Minimum number of samples in an OPTICS cluster, expressed as an
-        absolute number or a fraction of the number of samples (rounded to be
-        at least 2). If ``None``, the value of ``min_samples`` is used instead.
-        Used only when ``cluster_method='xi'``.
+        absolute number or a fraction of the number of samples. Fractions are
+        converted to ``ceil(min_cluster_size * n_samples)`` with a minimum of
+        2. If ``None``, the normalized value of ``min_samples`` is used
+        instead. Used only when ``cluster_method='xi'``.
 
     algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional
         Algorithm used to compute the nearest neighbors:
@@ -166,6 +169,13 @@ class OPTICS(BaseEstimator, ClusterMixin):
         Point that a sample was reached from, indexed by object order.
         Seed points have a predecessor of -1.
 
+    min_samples_ : int
+        Normalized number of samples used when querying nearest neighbors.
+
+    min_cluster_size_ : int
+        Normalized cluster size used by the Xi extraction method. Equal to
+        ``min_samples_`` when ``min_cluster_size`` is ``None``.
+
     cluster_hierarchy_ : array, shape (n_clusters, 2)
         The list of clusters in the form of ``[start, end]`` in each row, with
         all indices inclusive. The clusters are ordered according to
@@ -240,9 +250,21 @@ def fit(self, X, y=None):
                              " 'dbscan' or 'xi' but is %s" %
                              self.cluster_method)
 
+        n_samples = X.shape[0]
+        min_samples = _normalize_size(self.min_samples, n_samples,
+                                      'min_samples')
+        if self.min_cluster_size is None:
+            min_cluster_size = min_samples
+        else:
+            min_cluster_size = _normalize_size(self.min_cluster_size,
+                                               n_samples,
+                                               'min_cluster_size')
+        self.min_samples_ = min_samples
+        self.min_cluster_size_ = min_cluster_size
+
         (self.ordering_, self.core_distances_, self.reachability_,
          self.predecessor_) = compute_optics_graph(
-             X=X, min_samples=self.min_samples, algorithm=self.algorithm,
+             X=X, min_samples=min_samples, algorithm=self.algorithm,
              leaf_size=self.leaf_size, metric=self.metric,
              metric_params=self.metric_params, p=self.p, n_jobs=self.n_jobs,
              max_eps=self.max_eps)
@@ -253,8 +275,8 @@ def fit(self, X, y=None):
                 self.reachability_,
                 self.predecessor_,
                 self.ordering_,
-                self.min_samples,
-                self.min_cluster_size,
+                min_samples,
+                min_cluster_size,
                 self.xi,
                 self.predecessor_correction)
             self.cluster_hierarchy_ = clusters_
@@ -290,6 +312,16 @@ def _validate_size(size, n_samples, param_name):
                          (param_name, n_samples, size))
 
 
+def _normalize_size(size, n_samples, param_name):
+    """Validate ``size`` and return it as an integer count that is >= 2."""
+    _validate_size(size, n_samples, param_name)
+    if size > 1:
+        return max(2, int(size))
+    # Round away float noise first: 0.07 * 100 == 7.000000000000001, and a
+    # bare ceil() of that would yield 8 instead of the intended 7.
+    return max(2, int(math.ceil(round(size * n_samples, 8))))
+
+
 # OPTICS helper functions
 def _compute_core_distances_(X, neighbors, min_samples, working_memory):
     """Compute the k-th nearest neighbor of each sample
@@ -343,8 +375,9 @@ def compute_optics_graph(X, min_samples, max_eps, metric, p, metric_params,
 
     min_samples : int (default=5)
         The number of samples in a neighborhood for a point to be considered
-        as a core point. Expressed as an absolute number or a fraction of the
-        number of samples (rounded to be at least 2).
+        as a core point. Expressed as an absolute number or as a fraction of
+        the number of samples; a fraction is converted to
+        ``ceil(min_samples * n_samples)`` and clipped to be at least 2.
 
     max_eps : float, optional (default=np.inf)
         The maximum distance between two samples for one to be considered as
@@ -435,9 +468,7 @@ def compute_optics_graph(X, min_samples, max_eps, metric, p, metric_params,
        structure." ACM SIGMOD Record 28, no. 2 (1999): 49-60.
     """
     n_samples = X.shape[0]
-    _validate_size(min_samples, n_samples, 'min_samples')
-    if min_samples <= 1:
-        min_samples = max(2, min_samples * n_samples)
+    min_samples = _normalize_size(min_samples, n_samples, 'min_samples')
 
     # Start all points as 'unprocessed' ##
     reachability_ = np.empty(n_samples)
@@ -585,13 +616,15 @@ def cluster_optics_xi(reachability, predecessor, ordering, min_samples,
     min_samples : int > 1 or float between 0 and 1 (default=None)
         The same as the min_samples given to OPTICS. Up and down steep regions
         can't have more then ``min_samples`` consecutive non-steep points.
-        Expressed as an absolute number or a fraction of the number of samples
-        (rounded to be at least 2).
+        Fractions are converted to ``ceil(min_samples * n_samples)`` and
+        clipped to be at least 2.
 
     min_cluster_size : int > 1 or float between 0 and 1 (default=None)
         Minimum number of samples in an OPTICS cluster, expressed as an
-        absolute number or a fraction of the number of samples (rounded to be
-        at least 2). If ``None``, the value of ``min_samples`` is used instead.
+        absolute number or a fraction of the number of samples. Fractions are
+        converted to ``ceil(min_cluster_size * n_samples)`` with a minimum of
+        2. If ``None``, the normalized value of ``min_samples`` is used
+        instead.
 
     xi : float, between 0 and 1, optional (default=0.05)
         Determines the minimum steepness on the reachability plot that
@@ -617,14 +650,12 @@ def cluster_optics_xi(reachability, predecessor, ordering, min_samples,
         np.unique(labels)``.
     """
     n_samples = len(reachability)
-    _validate_size(min_samples, n_samples, 'min_samples')
-    if min_samples <= 1:
-        min_samples = max(2, min_samples * n_samples)
+    min_samples = _normalize_size(min_samples, n_samples, 'min_samples')
     if min_cluster_size is None:
         min_cluster_size = min_samples
-    _validate_size(min_cluster_size, n_samples, 'min_cluster_size')
-    if min_cluster_size <= 1:
-        min_cluster_size = max(2, min_cluster_size * n_samples)
+    else:
+        min_cluster_size = _normalize_size(min_cluster_size, n_samples,
+                                           'min_cluster_size')
 
     clusters = _xi_cluster(reachability[ordering], predecessor[ordering],
                            ordering, xi,

diff --git a/sklearn/cluster/tests/test_optics.py b/sklearn/cluster/tests/test_optics.py
@@ -8,7 +8,8 @@
 from sklearn.datasets.samples_generator import make_blobs
 from sklearn.cluster.optics_ import (OPTICS,
                                      _extend_region,
-                                     _extract_xi_labels)
+                                     _extract_xi_labels,
+                                     cluster_optics_xi)
 from sklearn.metrics.cluster import contingency_matrix
 from sklearn.metrics.pairwise import pairwise_distances
 from sklearn.cluster.dbscan_ import DBSCAN
@@ -179,6 +180,74 @@ def test_minimum_number_of_sample_check():
     assert_raise_message(ValueError, msg, clust.fit, X)
 
 
+def test_optics_min_samples_float_no_typeerror():
+    # A fractional min_samples must fit cleanly and be exposed as an integer
+    # count; min_cluster_size is left unset and is asserted to match it.
+    points, _ = make_blobs(n_samples=200, centers=3, random_state=0)
+    model = OPTICS(min_samples=0.5)
+    model.fit(points)
+    want = 100  # ceil(0.5 * 200)
+    assert (model.min_samples_, model.min_cluster_size_) == (want, want)
+
+
+def test_optics_min_samples_fraction_rounding_ceil():
+    # 0.21 * 30 is 6.3, not a whole number, so the fraction must round up.
+    n_points = 30
+    column = np.arange(n_points, dtype=float).reshape(-1, 1)
+    model = OPTICS(min_samples=0.21)
+    model.fit(column)
+
+    assert model.min_samples_ == 7
+
+
+def test_cluster_optics_xi_accepts_float_min_samples():
+    # cluster_optics_xi must return the same labels and clusters whether the
+    # two sizes are given as counts or as equivalent fractions of 40 samples.
+    points, _ = make_blobs(n_samples=40, centers=3, random_state=1)
+    fitted = OPTICS(min_samples=5, min_cluster_size=6, xi=0.05)
+    fitted.fit(points)
+
+    def extract(min_samples, min_cluster_size):
+        # Re-run the Xi extraction on the graph computed by the fit above.
+        return cluster_optics_xi(
+            fitted.reachability_,
+            fitted.predecessor_,
+            fitted.ordering_,
+            min_samples,
+            min_cluster_size,
+            fitted.xi,
+            fitted.predecessor_correction)
+
+    # Absolute counts, then fractions: ceil(0.125 * 40) == 5 and
+    # ceil(0.15 * 40) == 6.
+    labels_int, clusters_int = extract(5, 6)
+    labels_float, clusters_float = extract(0.125, 0.15)
+
+    assert_array_equal(labels_float, labels_int)
+    assert_array_equal(clusters_float, clusters_int)
+
+
+@pytest.mark.parametrize("min_samples", [0, -1, 1.5])
+def test_optics_min_samples_invalid_values(min_samples):
+    # Zero, negative values and non-integers above 1 must make fit raise.
+    want = 'min_samples must be a positive integer or a float between 0 and 1'
+    points, _ = make_blobs(n_samples=10, centers=1, random_state=2)
+
+    model = OPTICS(min_samples=min_samples)
+    assert_raise_message(ValueError, want, model.fit, points)
+
+
+@pytest.mark.parametrize("min_cluster_size", [0, -2, 1.3])
+def test_optics_min_cluster_size_invalid_values(min_cluster_size):
+    # min_samples is valid here, so the error must name min_cluster_size.
+    points, _ = make_blobs(n_samples=12, centers=1, random_state=3)
+    model = OPTICS(min_samples=2, min_cluster_size=min_cluster_size)
+
+    want = ('min_cluster_size must be a positive integer or a float between '
+            '0 and 1')
+    assert_raise_message(ValueError, want, model.fit, points)
+
+
 def test_bad_extract():
     # Test an extraction of eps too close to original eps
     msg = "Specify an epsilon smaller than 0.15. Got 0.3."