From b2abe6d1e7a7a3f39ef3c5c2f516312b9e5b4cbe Mon Sep 17 00:00:00 2001 From: Rowan Stein Date: Sat, 27 Dec 2025 21:02:20 +0000 Subject: [PATCH 1/4] fix(linear_model): allow bool X in huber Coerce boolean feature matrices and sample weights to FLOAT_DTYPES in HuberRegressor.fit and default unweighted runs to float64 weights. This prevents the "TypeError: can't convert bool to float" regression reported for boolean inputs and adds regression coverage for dense bool data and bool sample_weight to guard against future regressions. --- sklearn/linear_model/huber.py | 8 ++- sklearn/linear_model/tests/test_huber.py | 79 ++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 3 deletions(-) diff --git a/sklearn/linear_model/huber.py b/sklearn/linear_model/huber.py index 65c6864007eb2..5a5c3f58bd0ab 100644 --- a/sklearn/linear_model/huber.py +++ b/sklearn/linear_model/huber.py @@ -7,10 +7,11 @@ from ..base import BaseEstimator, RegressorMixin from .base import LinearModel -from ..utils import check_X_y +from ..utils import check_X_y, check_array from ..utils import check_consistent_length from ..utils import axis0_safe_slice from ..utils.extmath import safe_sparse_dot +from ..utils.validation import FLOAT_DTYPES def _huber_loss_and_gradient(w, X, y, epsilon, alpha, sample_weight=None): @@ -253,10 +254,11 @@ def fit(self, X, y, sample_weight=None): X, y = check_X_y( X, y, copy=False, accept_sparse=['csr'], y_numeric=True) if sample_weight is not None: - sample_weight = np.array(sample_weight) + sample_weight = check_array( + sample_weight, ensure_2d=False, dtype=FLOAT_DTYPES) check_consistent_length(y, sample_weight) else: - sample_weight = np.ones_like(y) + sample_weight = np.ones_like(y, dtype=np.float64) if self.epsilon < 1.0: raise ValueError( diff --git a/sklearn/linear_model/tests/test_huber.py b/sklearn/linear_model/tests/test_huber.py index 6a8b26133d5ac..ba58fba252270 100644 --- a/sklearn/linear_model/tests/test_huber.py +++ 
b/sklearn/linear_model/tests/test_huber.py @@ -2,6 +2,8 @@ # License: BSD 3 clause import numpy as np +from numpy.testing import assert_allclose +from numpy.testing import assert_allclose from scipy import optimize, sparse import pytest @@ -199,3 +201,80 @@ def test_huber_better_r2_score(): # The huber model should also fit poorly on the outliers. assert_greater(ridge_outlier_score, huber_outlier_score) + + + +def test_huber_bool_dense_X_equivalence(): + rng = np.random.RandomState(0) + X_bool = rng.rand(30, 5) > 0.5 + y = rng.randn(30) + + huber_bool = HuberRegressor().fit(X_bool, y) + huber_float = HuberRegressor().fit(X_bool.astype(np.float64), y) + + assert_allclose(huber_bool.coef_, huber_float.coef_, rtol=1e-7, + atol=1e-7) + assert_allclose(huber_bool.intercept_, huber_float.intercept_, + rtol=1e-7, atol=1e-7) + assert_allclose(huber_bool.scale_, huber_float.scale_, + rtol=1e-7, atol=1e-7) + assert_array_equal(huber_bool.outliers_, huber_float.outliers_) + + +def test_huber_bool_sample_weight(): + rng = np.random.RandomState(0) + X = rng.randn(40, 4) + y = rng.randn(40) + sample_weight_bool = rng.rand(40) > 0.3 + sample_weight_bool[0] = True + sample_weight_bool[1] = False + + huber_bool = HuberRegressor().fit(X, y, sample_weight=sample_weight_bool) + huber_float = HuberRegressor().fit( + X, y, sample_weight=sample_weight_bool.astype(np.float64)) + + assert_allclose(huber_bool.coef_, huber_float.coef_, rtol=1e-7, + atol=1e-7) + assert_allclose(huber_bool.intercept_, huber_float.intercept_, + rtol=1e-7, atol=1e-7) + assert_allclose(huber_bool.scale_, huber_float.scale_, + rtol=1e-7, atol=1e-7) + assert_array_equal(huber_bool.outliers_, huber_float.outliers_) + + +def test_huber_bool_dense_X_equivalence(): + rng = np.random.RandomState(0) + X_bool = rng.rand(30, 5) > 0.5 + y = rng.randn(30) + + huber_bool = HuberRegressor().fit(X_bool, y) + huber_float = HuberRegressor().fit(X_bool.astype(np.float64), y) + + assert_allclose(huber_bool.coef_, huber_float.coef_, 
rtol=1e-7, + atol=1e-7) + assert_allclose(huber_bool.intercept_, huber_float.intercept_, + rtol=1e-7, atol=1e-7) + assert_allclose(huber_bool.scale_, huber_float.scale_, + rtol=1e-7, atol=1e-7) + assert_array_equal(huber_bool.outliers_, huber_float.outliers_) + + +def test_huber_bool_sample_weight(): + rng = np.random.RandomState(0) + X = rng.randn(40, 4) + y = rng.randn(40) + sample_weight_bool = rng.rand(40) > 0.3 + sample_weight_bool[0] = True + sample_weight_bool[1] = False + + huber_bool = HuberRegressor().fit(X, y, sample_weight=sample_weight_bool) + huber_float = HuberRegressor().fit( + X, y, sample_weight=sample_weight_bool.astype(np.float64)) + + assert_allclose(huber_bool.coef_, huber_float.coef_, rtol=1e-7, + atol=1e-7) + assert_allclose(huber_bool.intercept_, huber_float.intercept_, + rtol=1e-7, atol=1e-7) + assert_allclose(huber_bool.scale_, huber_float.scale_, + rtol=1e-7, atol=1e-7) + assert_array_equal(huber_bool.outliers_, huber_float.outliers_) From a0bef1a91caca0bbe9a074166cb8f5e1ffb8d3a2 Mon Sep 17 00:00:00 2001 From: Rowan Stein Date: Sat, 27 Dec 2025 21:05:33 +0000 Subject: [PATCH 2/4] chore: address review comments - Coercion of X to FLOAT_DTYPES via check_X_y in HuberRegressor.fit is not part of this commit (it only touches the tests); it lands in PATCH 3/4 - Remove duplicate assert_allclose import and duplicate test definitions --- sklearn/linear_model/tests/test_huber.py | 36 ------------------------ 1 file changed, 36 deletions(-) diff --git a/sklearn/linear_model/tests/test_huber.py b/sklearn/linear_model/tests/test_huber.py index ba58fba252270..18be6974053da 100644 --- a/sklearn/linear_model/tests/test_huber.py +++ b/sklearn/linear_model/tests/test_huber.py @@ -3,7 +3,6 @@ import numpy as np from numpy.testing import assert_allclose -from numpy.testing import assert_allclose from scipy import optimize, sparse import pytest @@ -243,38 +242,3 @@ def test_huber_bool_sample_weight(): def test_huber_bool_dense_X_equivalence(): - rng = np.random.RandomState(0) - X_bool = rng.rand(30, 5) > 0.5 - y = rng.randn(30) 
- - huber_bool = HuberRegressor().fit(X_bool, y) - huber_float = HuberRegressor().fit(X_bool.astype(np.float64), y) - - assert_allclose(huber_bool.coef_, huber_float.coef_, rtol=1e-7, - atol=1e-7) - assert_allclose(huber_bool.intercept_, huber_float.intercept_, - rtol=1e-7, atol=1e-7) - assert_allclose(huber_bool.scale_, huber_float.scale_, - rtol=1e-7, atol=1e-7) - assert_array_equal(huber_bool.outliers_, huber_float.outliers_) - - -def test_huber_bool_sample_weight(): - rng = np.random.RandomState(0) - X = rng.randn(40, 4) - y = rng.randn(40) - sample_weight_bool = rng.rand(40) > 0.3 - sample_weight_bool[0] = True - sample_weight_bool[1] = False - - huber_bool = HuberRegressor().fit(X, y, sample_weight=sample_weight_bool) - huber_float = HuberRegressor().fit( - X, y, sample_weight=sample_weight_bool.astype(np.float64)) - - assert_allclose(huber_bool.coef_, huber_float.coef_, rtol=1e-7, - atol=1e-7) - assert_allclose(huber_bool.intercept_, huber_float.intercept_, - rtol=1e-7, atol=1e-7) - assert_allclose(huber_bool.scale_, huber_float.scale_, - rtol=1e-7, atol=1e-7) - assert_array_equal(huber_bool.outliers_, huber_float.outliers_) From 2bfa8d3bd23906aef9486b8bf6f7ca3465b0993e Mon Sep 17 00:00:00 2001 From: Rowan Stein Date: Sat, 27 Dec 2025 21:05:45 +0000 Subject: [PATCH 3/4] fix: coerce X dtype via check_X_y(dtype=FLOAT_DTYPES) --- sklearn/linear_model/huber.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/linear_model/huber.py b/sklearn/linear_model/huber.py index 5a5c3f58bd0ab..9d0981dc85e2e 100644 --- a/sklearn/linear_model/huber.py +++ b/sklearn/linear_model/huber.py @@ -252,7 +252,7 @@ def fit(self, X, y, sample_weight=None): self : object """ X, y = check_X_y( - X, y, copy=False, accept_sparse=['csr'], y_numeric=True) + X, y, copy=False, accept_sparse=['csr'], y_numeric=True, dtype=FLOAT_DTYPES) if sample_weight is not None: sample_weight = check_array( sample_weight, ensure_2d=False, dtype=FLOAT_DTYPES) From 
7e272c49a4d47bbd4013d400218b499ffcdf1abf Mon Sep 17 00:00:00 2001 From: Rowan Stein Date: Sat, 27 Dec 2025 21:07:25 +0000 Subject: [PATCH 4/4] fix(tests): remove stray duplicate test definition at EOF --- sklearn/linear_model/tests/test_huber.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/linear_model/tests/test_huber.py b/sklearn/linear_model/tests/test_huber.py index 18be6974053da..6693ad349f04e 100644 --- a/sklearn/linear_model/tests/test_huber.py +++ b/sklearn/linear_model/tests/test_huber.py @@ -241,4 +241,3 @@ def test_huber_bool_sample_weight(): assert_array_equal(huber_bool.outliers_, huber_float.outliers_) -def test_huber_bool_dense_X_equivalence():