From b6b86121ea528f39ba19216778a39c6dbb992647 Mon Sep 17 00:00:00 2001 From: Sathya Kamesh Date: Thu, 4 Jul 2024 11:35:31 +0200 Subject: [PATCH 1/7] adding data checks function --- tabpfn_client/client.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tabpfn_client/client.py b/tabpfn_client/client.py index bb8a9eb..d31533d 100644 --- a/tabpfn_client/client.py +++ b/tabpfn_client/client.py @@ -10,6 +10,7 @@ import json from typing import Literal +from sklearn.utils import check_consistent_length, check_array from tabpfn_client.tabpfn_common_utils import utils as common_utils @@ -67,6 +68,34 @@ def reset_authorization(self): @property def is_initialized(self): return self.access_token is not None and self.access_token != "" + + def check_training_data(self, X, y): + """ + Check the integrity of the training data. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The training input samples. + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + The target values. + + Returns + ------- + is_valid : bool + True if the data is valid. + message : str + The message returned from the server. + """ + + X = check_array( + X, accept_sparse="csr", dtype=np.float32, force_all_finite=False + ) + y = check_array(y, ensure_2d=False, dtype=np.float32, force_all_finite=False) + + check_consistent_length(X, y) + + return X, y def upload_train_set(self, X, y) -> str: """ @@ -85,6 +114,10 @@ def upload_train_set(self, X, y) -> str: The unique ID of the train set in the server. """ + + #checking the integrity of the data + X, y = self.check_training_data(X, y) + X = common_utils.serialize_to_csv_formatted_bytes(X) y = common_utils.serialize_to_csv_formatted_bytes(y) From 55f3a67be8cbbd94574d59cb336513f098895899 Mon Sep 17 00:00:00 2001 From: Sathya Kamesh Date: Thu, 4 Jul 2024 14:55:13 +0200 Subject: [PATCH 2/7] adding unit tests --- tabpfn_client/client.py | 4 ++-- tabpfn_client/tests/unit/test_client.py | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/tabpfn_client/client.py b/tabpfn_client/client.py index d31533d..68ca445 100644 --- a/tabpfn_client/client.py +++ b/tabpfn_client/client.py @@ -69,7 +69,8 @@ def reset_authorization(self): def is_initialized(self): return self.access_token is not None and self.access_token != "" - def check_training_data(self, X, y): + @staticmethod + def check_training_data(X, y): """ Check the integrity of the training data. 
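For illustration only (not part of the patch series): a minimal sketch of how the check_training_data helper introduced above is expected to behave once PATCH 2/7 makes it a staticmethod. The import path matches the unit tests; the array shapes are hypothetical.

import numpy as np
from tabpfn_client.client import ServiceClient

# Hypothetical training data: 50 samples, 4 features.
X = np.random.rand(50, 4)
y = np.random.randint(0, 2, 50)

# Valid input: both arrays are returned cast to float32 by sklearn's check_array.
X_checked, y_checked = ServiceClient.check_training_data(X, y)
assert X_checked.dtype == np.float32 and y_checked.dtype == np.float32

# Mismatched sample counts are rejected by check_consistent_length.
try:
    ServiceClient.check_training_data(X, y[:-1])
except ValueError as err:
    print(err)  # Found input variables with inconsistent numbers of samples: [50, 49]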
@@ -94,7 +95,6 @@ def check_training_data(self, X, y): y = check_array(y, ensure_2d=False, dtype=np.float32, force_all_finite=False) check_consistent_length(X, y) - return X, y def upload_train_set(self, X, y) -> str: diff --git a/tabpfn_client/tests/unit/test_client.py b/tabpfn_client/tests/unit/test_client.py index 294e51a..11bcb9b 100644 --- a/tabpfn_client/tests/unit/test_client.py +++ b/tabpfn_client/tests/unit/test_client.py @@ -221,3 +221,12 @@ def test_validate_response_only_version_check(self): response.json.return_value = {"detail": "Some other error"} r = self.client._validate_response(response, "test", only_version_check=True) self.assertIsNone(r) + + def test_input_data_check(self): + X, y = load_breast_cancer(return_X_y=True) + + # Test for valid input + ServiceClient.check_training_data(X[:99], y[:99]) + with self.assertRaises(ValueError) as cm: + ServiceClient.check_training_data(X[:99], y[:98]) + self.assertEqual(str(cm.exception), "Found input variables with inconsistent numbers of samples: [99, 98]") From ef589781b20c7f76cf9bfb38319ca775adf64a71 Mon Sep 17 00:00:00 2001 From: Sathya Kamesh Date: Thu, 18 Jul 2024 15:15:50 +0200 Subject: [PATCH 3/7] adding checks for size of input --- tabpfn_client/client.py | 4 ++++ tabpfn_client/tests/unit/test_client.py | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/tabpfn_client/client.py b/tabpfn_client/client.py index 68ca445..9c26a72 100644 --- a/tabpfn_client/client.py +++ b/tabpfn_client/client.py @@ -95,6 +95,10 @@ def check_training_data(X, y): y = check_array(y, ensure_2d=False, dtype=np.float32, force_all_finite=False) check_consistent_length(X, y) + # length and feature assertions + assert X.shape[0] <= 10000, "The number of samples should not be more than 10000." + assert X.shape[1] <= 500, "The number of features should not be more than 500." 
+ return X, y def upload_train_set(self, X, y) -> str: diff --git a/tabpfn_client/tests/unit/test_client.py b/tabpfn_client/tests/unit/test_client.py index 11bcb9b..7ad7317 100644 --- a/tabpfn_client/tests/unit/test_client.py +++ b/tabpfn_client/tests/unit/test_client.py @@ -230,3 +230,15 @@ def test_input_data_check(self): with self.assertRaises(ValueError) as cm: ServiceClient.check_training_data(X[:99], y[:98]) self.assertEqual(str(cm.exception), "Found input variables with inconsistent numbers of samples: [99, 98]") + + # Test for oversized data + X = np.random.randn(10001,501) + y = np.random.randint(0,2,10001) + + with self.assertRaises(AssertionError) as cm: + ServiceClient.check_training_data(X[:10000], y[:10000]) + self.assertEqual(str(cm.exception), "The number of samples should not be more than 10000.") + + with self.assertRaises(AssertionError) as cm: + ServiceClient.check_training_data(X[:, :500], y) + self.assertEqual(str(cm.exception), "The number of features should not be more than 500.") From 8b211a6c895096858c5d7bdd401257fbad81fede Mon Sep 17 00:00:00 2001 From: Sathya Kamesh Date: Thu, 18 Jul 2024 15:22:13 +0200 Subject: [PATCH 4/7] fixing tests --- tabpfn_client/tests/unit/test_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tabpfn_client/tests/unit/test_client.py b/tabpfn_client/tests/unit/test_client.py index 7ad7317..3ee7948 100644 --- a/tabpfn_client/tests/unit/test_client.py +++ b/tabpfn_client/tests/unit/test_client.py @@ -237,8 +237,8 @@ def test_input_data_check(self): with self.assertRaises(AssertionError) as cm: ServiceClient.check_training_data(X[:10000], y[:10000]) - self.assertEqual(str(cm.exception), "The number of samples should not be more than 10000.") + self.assertEqual(str(cm.exception), "The number of features should not be more than 500.") with self.assertRaises(AssertionError) as cm: ServiceClient.check_training_data(X[:, :500], y) - self.assertEqual(str(cm.exception), "The number of features should not be more than 500.") + self.assertEqual(str(cm.exception), "The number of samples should not be more than 10000.") From c93a709d6b6ae9db2b40156cdb73809902889bd8 Mon Sep 17 00:00:00 2001 From: Sathya Kamesh Date: Fri, 23 Aug 2024 15:40:27 +0200 Subject: [PATCH 5/7] reformat commit --- tabpfn_client/client.py | 8 ++++---- tabpfn_client/tests/unit/test_client.py | 17 ++++++++++++----- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/tabpfn_client/client.py b/tabpfn_client/client.py index 9c26a72..f094bae 100644 --- a/tabpfn_client/client.py +++ b/tabpfn_client/client.py @@ -68,7 +68,7 @@ def reset_authorization(self): @property def is_initialized(self): return self.access_token is not None and self.access_token != "" - + @staticmethod def check_training_data(X, y): """ @@ -96,8 +96,8 @@ def check_training_data(X, y): check_consistent_length(X, y) # length and feature assertions - assert X.shape[0] <= 10000, "The number of samples should not be more than 10000." - assert X.shape[1] <= 500, "The number of features should not be more than 500." + assert X.shape[0] <= 10000, "The number of samples cannot be more than 10000." + assert X.shape[1] <= 500, "The number of features cannot be more than 500." 
return X, y @@ -119,7 +119,7 @@ def upload_train_set(self, X, y) -> str: """ - #checking the integrity of the data + # checking the integrity of the data X, y = self.check_training_data(X, y) X = common_utils.serialize_to_csv_formatted_bytes(X) diff --git a/tabpfn_client/tests/unit/test_client.py b/tabpfn_client/tests/unit/test_client.py index 3ee7948..d9fec7c 100644 --- a/tabpfn_client/tests/unit/test_client.py +++ b/tabpfn_client/tests/unit/test_client.py @@ -229,16 +229,23 @@ def test_input_data_check(self): ServiceClient.check_training_data(X[:99], y[:99]) with self.assertRaises(ValueError) as cm: ServiceClient.check_training_data(X[:99], y[:98]) - self.assertEqual(str(cm.exception), "Found input variables with inconsistent numbers of samples: [99, 98]") + self.assertEqual( + str(cm.exception), + "Found input variables with inconsistent numbers of samples: [99, 98]", + ) # Test for oversized data - X = np.random.randn(10001,501) - y = np.random.randint(0,2,10001) + X = np.random.randn(10001, 501) + y = np.random.randint(0, 2, 10001) with self.assertRaises(AssertionError) as cm: ServiceClient.check_training_data(X[:10000], y[:10000]) - self.assertEqual(str(cm.exception), "The number of features should not be more than 500.") + self.assertEqual( + str(cm.exception), "The number of features cannot be more than 500." + ) with self.assertRaises(AssertionError) as cm: ServiceClient.check_training_data(X[:, :500], y) - self.assertEqual(str(cm.exception), "The number of samples should not be more than 10000.") + self.assertEqual( + str(cm.exception), "The number of samples cannot be more than 10000." + ) From 6de4513c17a6cb1d006b157d57115843332f993b Mon Sep 17 00:00:00 2001 From: "Liam, SB Hoo" Date: Sat, 21 Sep 2024 18:57:46 +0200 Subject: [PATCH 6/7] Move data size check to estimator, check on train and predict, add test --- tabpfn_client/client.py | 36 ----------- tabpfn_client/estimator.py | 36 +++++++++++ tabpfn_client/tests/unit/test_client.py | 28 -------- .../tests/unit/test_tabpfn_classifier.py | 64 ++++++++++++++++++- .../tests/unit/test_tabpfn_regressor.py | 64 ++++++++++++++++++- 5 files changed, 162 insertions(+), 66 deletions(-) diff --git a/tabpfn_client/client.py b/tabpfn_client/client.py index 781948b..4ed4339 100644 --- a/tabpfn_client/client.py +++ b/tabpfn_client/client.py @@ -11,7 +11,6 @@ import json from typing import Literal -from sklearn.utils import check_consistent_length, check_array from tabpfn_client.tabpfn_common_utils import utils as common_utils @@ -91,38 +90,6 @@ def reset_authorization(self): def is_initialized(self): return self.access_token is not None and self.access_token != "" - @staticmethod - def check_training_data(X, y): - """ - Check the integrity of the training data. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The training input samples. - y : array-like of shape (n_samples,) or (n_samples, n_outputs) - The target values. - - Returns - ------- - is_valid : bool - True if the data is valid. - message : str - The message returned from the server. - """ - - X = check_array( - X, accept_sparse="csr", dtype=np.float32, force_all_finite=False - ) - y = check_array(y, ensure_2d=False, dtype=np.float32, force_all_finite=False) - - check_consistent_length(X, y) - # length and feature assertions - assert X.shape[0] <= 10000, "The number of samples cannot be more than 10000." - assert X.shape[1] <= 500, "The number of features cannot be more than 500." 
- - return X, y - def upload_train_set(self, X, y) -> str: """ Upload a train set to server and return the train set UID if successful. @@ -141,9 +108,6 @@ def upload_train_set(self, X, y) -> str: """ - # checking the integrity of the data - X, y = self.check_training_data(X, y) - X = common_utils.serialize_to_csv_formatted_bytes(X) y = common_utils.serialize_to_csv_formatted_bytes(y) diff --git a/tabpfn_client/estimator.py b/tabpfn_client/estimator.py index 290600c..8238694 100644 --- a/tabpfn_client/estimator.py +++ b/tabpfn_client/estimator.py @@ -6,11 +6,15 @@ from tabpfn_client import init from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin from sklearn.utils.validation import check_is_fitted +from sklearn.utils import check_consistent_length from tabpfn_client import config logger = logging.getLogger(__name__) +MAX_ROWS = 10000 +MAX_COLS = 500 + @dataclass(eq=True, frozen=True) class PreprocessorConfig: @@ -194,10 +198,16 @@ def _validate_targets_and_classes(self, y) -> np.ndarray: not_nan_mask = ~np.isnan(y) self.classes_ = np.unique(y_[not_nan_mask]) + @staticmethod + def _validate_data_size(X: np.ndarray, y: np.ndarray | None): + if X.shape[0] != y.shape[0]: + raise ValueError("X and y must have the same number of samples") + def fit(self, X, y): # assert init() is called init() + validate_data_size(X, y) self._validate_targets_and_classes(y) if config.g_tabpfn_config.use_server: @@ -207,6 +217,7 @@ def fit(self, X, y): ), "Only 'latest_tabpfn_hosted' model is supported at the moment for init(use_server=True)" except AssertionError as e: print(e) + self.last_train_set_uid = config.g_tabpfn_config.inference_handler.fit(X, y) self.fitted_ = True else: @@ -223,6 +234,8 @@ def predict(self, X): def predict_proba(self, X): check_is_fitted(self) + validate_data_size(X) + return config.g_tabpfn_config.inference_handler.predict( X, task="classification", @@ -344,6 +357,8 @@ def fit(self, X, y): # assert init() is called init() + validate_data_size(X, y) + if config.g_tabpfn_config.use_server: self.last_train_set_uid = config.g_tabpfn_config.inference_handler.fit(X, y) self.fitted_ = True @@ -366,6 +381,7 @@ def predict(self, X): def predict_full(self, X): check_is_fitted(self) + validate_data_size(X) estimator_param = self.get_params() if "model" in estimator_param: @@ -393,3 +409,23 @@ def _model_name_to_path(self, model_name: str) -> str: return f"{base_path}_{model_name}.ckpt" else: raise ValueError(f"Invalid model name: {model_name}") + + +def validate_data_size(X: np.ndarray, y: np.ndarray | None = None): + """ + Check the integrity of the training data. 
+ - check if the number of rows between X and y is consistent + if y is not None (ValueError) + - check if the number of rows is less than MAX_ROWS (ValueError) + - check if the number of columns is less than MAX_COLS (ValueError) + """ + + # check if the number of samples is consistent (ValueError) + if y is not None: + check_consistent_length(X, y) + + # length and feature assertions + if X.shape[0] > MAX_ROWS: + raise ValueError(f"The number of rows cannot be more than {MAX_ROWS}.") + if X.shape[1] > MAX_COLS: + raise ValueError(f"The number of columns cannot be more than {MAX_COLS}.") diff --git a/tabpfn_client/tests/unit/test_client.py b/tabpfn_client/tests/unit/test_client.py index ded7045..adb1740 100644 --- a/tabpfn_client/tests/unit/test_client.py +++ b/tabpfn_client/tests/unit/test_client.py @@ -231,31 +231,3 @@ def test_validate_response_only_version_check(self): response.json.return_value = {"detail": "Some other error"} r = self.client._validate_response(response, "test", only_version_check=True) self.assertIsNone(r) - - def test_input_data_check(self): - X, y = load_breast_cancer(return_X_y=True) - - # Test for valid input - ServiceClient.check_training_data(X[:99], y[:99]) - with self.assertRaises(ValueError) as cm: - ServiceClient.check_training_data(X[:99], y[:98]) - self.assertEqual( - str(cm.exception), - "Found input variables with inconsistent numbers of samples: [99, 98]", - ) - - # Test for oversized data - X = np.random.randn(10001, 501) - y = np.random.randint(0, 2, 10001) - - with self.assertRaises(AssertionError) as cm: - ServiceClient.check_training_data(X[:10000], y[:10000]) - self.assertEqual( - str(cm.exception), "The number of features cannot be more than 500." - ) - - with self.assertRaises(AssertionError) as cm: - ServiceClient.check_training_data(X[:, :500], y) - self.assertEqual( - str(cm.exception), "The number of samples cannot be more than 10000." 
- ) diff --git a/tabpfn_client/tests/unit/test_tabpfn_classifier.py b/tabpfn_client/tests/unit/test_tabpfn_classifier.py index 863d14b..c3419cc 100644 --- a/tabpfn_client/tests/unit/test_tabpfn_classifier.py +++ b/tabpfn_client/tests/unit/test_tabpfn_classifier.py @@ -1,5 +1,5 @@ import unittest -from unittest.mock import patch +from unittest.mock import patch, MagicMock import shutil import numpy as np @@ -14,6 +14,7 @@ from tabpfn_client.client import ServiceClient from tabpfn_client.tests.mock_tabpfn_server import with_mock_server from tabpfn_client.constants import CACHE_DIR +from tabpfn_client import config class TestTabPFNClassifierInit(unittest.TestCase): @@ -160,3 +161,64 @@ def test_decline_terms_and_cond(self, mock_server, mock_prompt_for_terms_and_con self.assertRaises(RuntimeError, init, use_server=True) self.assertTrue(mock_prompt_for_terms_and_cond.called) + + +class TestTabPFNClassifierInference(unittest.TestCase): + def setUp(self): + # skip init + config.g_tabpfn_config.is_initialized = True + + def tearDown(self): + # undo setUp + config.reset() + + def test_data_size_check_on_train_with_inconsistent_number_of_samples_raise_error( + self, + ): + X = np.random.rand(10, 5) + y = np.random.randint(0, 2, 11) + tabpfn = TabPFNClassifier() + + with self.assertRaises(ValueError): + tabpfn.fit(X, y) + + def test_data_size_check_on_train_with_oversized_data_raise_error(self): + X = np.random.randn(10001, 501) + y = np.random.randint(0, 2, 10001) + + tabpfn = TabPFNClassifier() + + # test oversized columns + with self.assertRaises(ValueError): + tabpfn.fit(X[:10], y[:10]) + + # test oversized rows + with self.assertRaises(ValueError): + tabpfn.fit(X[:, :10], y) + + def test_data_size_check_on_predict_with_oversized_data_raise_error(self): + test_X = np.random.randn(10001, 5) + tabpfn = TabPFNClassifier() + + # skip fitting + tabpfn.fitted_ = True + + # test oversized rows + with self.assertRaises(ValueError): + tabpfn.predict(test_X) + + def test_data_check_on_predict_with_valid_data_pass(self): + test_X = np.random.randn(10, 5) + tabpfn = TabPFNClassifier() + + # skip fitting + tabpfn.fitted_ = True + tabpfn.classes_ = np.array([0, 1]) + + # mock prediction + config.g_tabpfn_config.inference_handler = MagicMock() + config.g_tabpfn_config.inference_handler.predict = MagicMock( + return_value={"probas": np.random.rand(10, 2)} + ) + + tabpfn.predict(test_X) diff --git a/tabpfn_client/tests/unit/test_tabpfn_regressor.py b/tabpfn_client/tests/unit/test_tabpfn_regressor.py index 2b5215e..db065a5 100644 --- a/tabpfn_client/tests/unit/test_tabpfn_regressor.py +++ b/tabpfn_client/tests/unit/test_tabpfn_regressor.py @@ -1,5 +1,6 @@ import unittest -from unittest.mock import patch +from unittest.mock import patch, MagicMock + import shutil import numpy as np from sklearn.datasets import load_diabetes @@ -13,6 +14,7 @@ from tabpfn_client.client import ServiceClient from tabpfn_client.tests.mock_tabpfn_server import with_mock_server from tabpfn_client.constants import CACHE_DIR +from tabpfn_client import config class TestTabPFNRegressorInit(unittest.TestCase): @@ -175,3 +177,63 @@ def test_decline_terms_and_cond(self, mock_server, mock_prompt_for_terms_and_con self.assertRaises(RuntimeError, init, use_server=True) self.assertTrue(mock_prompt_for_terms_and_cond.called) + + +class TestTabPFNRegressorInference(unittest.TestCase): + def setUp(self): + # skip init + config.g_tabpfn_config.is_initialized = True + + def tearDown(self): + # undo setUp + config.reset() + + def 
test_data_size_check_on_train_with_inconsistent_number_of_samples_raise_error( + self, + ): + X = np.random.rand(10, 5) + y = np.random.rand(11) + tabpfn = TabPFNRegressor() + + with self.assertRaises(ValueError): + tabpfn.fit(X, y) + + def test_data_size_check_on_train_with_oversized_data_raise_error(self): + X = np.random.randn(10001, 501) + y = np.random.randn(10001) + + tabpfn = TabPFNRegressor() + + # test oversized columns + with self.assertRaises(ValueError): + tabpfn.fit(X[:10], y[:10]) + + # test oversized rows + with self.assertRaises(ValueError): + tabpfn.fit(X[:, :10], y) + + def test_data_size_check_on_predict_with_oversized_data_raise_error(self): + test_X = np.random.randn(10001, 5) + tabpfn = TabPFNRegressor() + + # skip fitting + tabpfn.fitted_ = True + + # test oversized rows + with self.assertRaises(ValueError): + tabpfn.predict(test_X) + + def test_data_check_on_predict_with_valid_data_pass(self): + test_X = np.random.randn(10, 5) + tabpfn = TabPFNRegressor() + + # skip fitting + tabpfn.fitted_ = True + + # mock prediction + config.g_tabpfn_config.inference_handler = MagicMock() + config.g_tabpfn_config.inference_handler.predict = MagicMock( + return_value={"mean": np.random.randn(10)} + ) + + tabpfn.predict(test_X) From 14fa565334c1d5d5f4509b94a434c6b8a91b2199 Mon Sep 17 00:00:00 2001 From: "Liam, SB Hoo" Date: Sun, 22 Sep 2024 10:54:37 +0200 Subject: [PATCH 7/7] Minor change --- tabpfn_client/estimator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tabpfn_client/estimator.py b/tabpfn_client/estimator.py index 8238694..b630dba 100644 --- a/tabpfn_client/estimator.py +++ b/tabpfn_client/estimator.py @@ -6,7 +6,6 @@ from tabpfn_client import init from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin from sklearn.utils.validation import check_is_fitted -from sklearn.utils import check_consistent_length from tabpfn_client import config @@ -422,7 +421,8 @@ def validate_data_size(X: np.ndarray, y: np.ndarray | None = None): # check if the number of samples is consistent (ValueError) if y is not None: - check_consistent_length(X, y) + if X.shape[0] != y.shape[0]: + raise ValueError("X and y must have the same number of samples") # length and feature assertions if X.shape[0] > MAX_ROWS:
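For illustration only (not part of the patch series): a minimal sketch of how the module-level validate_data_size helper behaves once PATCH 6/7 and PATCH 7/7 are applied, assuming MAX_ROWS = 10000 and MAX_COLS = 500 as defined in estimator.py above. The array shapes below are hypothetical.

import numpy as np
from tabpfn_client.estimator import validate_data_size

X_ok = np.random.randn(100, 20)
y_ok = np.random.randint(0, 2, 100)
validate_data_size(X_ok, y_ok)  # passes silently

# Inconsistent sample counts now raise a plain ValueError (PATCH 7/7).
try:
    validate_data_size(X_ok, y_ok[:-1])
except ValueError as err:
    print(err)  # X and y must have the same number of samples

# Oversized inputs are rejected in the estimator, before anything is sent to the server.
try:
    validate_data_size(np.random.randn(10001, 20), np.random.randint(0, 2, 10001))
except ValueError as err:
    print(err)  # The number of rows cannot be more than 10000.

try:
    validate_data_size(np.random.randn(100, 501))
except ValueError as err:
    print(err)  # The number of columns cannot be more than 500.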