From a3d516066fc898356db120671b997761e5a30c11 Mon Sep 17 00:00:00 2001 From: Thierry Moudiki Date: Thu, 18 Apr 2024 05:38:56 +0200 Subject: [PATCH 1/2] do v0.13.0 --- mlsauce.egg-info/PKG-INFO | 36 ++++++++++++++++ mlsauce.egg-info/SOURCES.txt | 59 ++++++++++++++++++++++++++ mlsauce.egg-info/dependency_links.txt | 1 + mlsauce.egg-info/not-zip-safe | 1 + mlsauce.egg-info/requires.txt | 14 ++++++ mlsauce.egg-info/top_level.txt | 1 + mlsauce/adaopt/_adaopt.py | 1 + mlsauce/booster/_booster_classifier.py | 1 + mlsauce/booster/_booster_regressor.py | 2 +- mlsauce/utils/__init__.py | 3 +- mlsauce/utils/misc/__init__.py | 4 +- mlsauce/utils/misc/misc.py | 48 +++++++++++++++++++++ setup.py | 2 +- 13 files changed, 168 insertions(+), 5 deletions(-) create mode 100644 mlsauce.egg-info/PKG-INFO create mode 100644 mlsauce.egg-info/SOURCES.txt create mode 100644 mlsauce.egg-info/dependency_links.txt create mode 100644 mlsauce.egg-info/not-zip-safe create mode 100644 mlsauce.egg-info/requires.txt create mode 100644 mlsauce.egg-info/top_level.txt diff --git a/mlsauce.egg-info/PKG-INFO b/mlsauce.egg-info/PKG-INFO new file mode 100644 index 0000000..92719d9 --- /dev/null +++ b/mlsauce.egg-info/PKG-INFO @@ -0,0 +1,36 @@ +Metadata-Version: 2.1 +Name: mlsauce +Version: 0.12.3 +Summary: Miscellaneous Statistical/Machine Learning tools +Maintainer: T. 
Moudiki +Maintainer-email: thierry.moudiki@gmail.com +License: BSD3 Clause Clear +Platform: linux +Platform: macosx +Platform: windows +Classifier: Development Status :: 2 - Pre-Alpha +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: BSD License +Classifier: Natural Language :: English +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3.5 +Classifier: Programming Language :: Python :: 3.6 +Classifier: Programming Language :: Python :: 3.7 +Classifier: Programming Language :: Python :: 3.8 +Requires-Python: >=3.5 +License-File: LICENSE +Requires-Dist: numpy +Requires-Dist: Cython +Requires-Dist: joblib +Requires-Dist: pandas +Requires-Dist: requests +Requires-Dist: scikit-learn +Requires-Dist: scipy +Requires-Dist: tqdm +Requires-Dist: jax +Requires-Dist: jaxlib +Provides-Extra: alldeps +Requires-Dist: numpy>=1.13.0; extra == "alldeps" +Requires-Dist: scipy>=0.19.0; extra == "alldeps" + +Miscellaneous Statistical/Machine Learning tools diff --git a/mlsauce.egg-info/SOURCES.txt b/mlsauce.egg-info/SOURCES.txt new file mode 100644 index 0000000..30eae25 --- /dev/null +++ b/mlsauce.egg-info/SOURCES.txt @@ -0,0 +1,59 @@ +LICENSE +README.md +setup.cfg +setup.py +mlsauce/__init__.py +mlsauce/_config.py +mlsauce/setup.py +mlsauce.egg-info/PKG-INFO +mlsauce.egg-info/SOURCES.txt +mlsauce.egg-info/dependency_links.txt +mlsauce.egg-info/not-zip-safe +mlsauce.egg-info/requires.txt +mlsauce.egg-info/top_level.txt +mlsauce/adaopt/__init__.py +mlsauce/adaopt/_adaopt.py +mlsauce/adaopt/_adaoptc.c +mlsauce/adaopt/setup.py +mlsauce/booster/__init__.py +mlsauce/booster/_booster_classifier.py +mlsauce/booster/_booster_regressor.py +mlsauce/booster/_boosterc.c +mlsauce/booster/setup.py +mlsauce/datasets/__init__.py +mlsauce/datasets/dowload.py +mlsauce/encoders/__init__.py +mlsauce/encoders/target_encoders.py +mlsauce/lasso/__init__.py +mlsauce/lasso/_lasso.py +mlsauce/lasso/_lassoc.c +mlsauce/lasso/setup.py 
+mlsauce/nonconformist/__init__.py +mlsauce/nonconformist/acp.py +mlsauce/nonconformist/base.py +mlsauce/nonconformist/cp.py +mlsauce/nonconformist/evaluation.py +mlsauce/nonconformist/icp.py +mlsauce/nonconformist/nc.py +mlsauce/nonconformist/util.py +mlsauce/predictioninterval/__init__.py +mlsauce/predictioninterval/predictioninterval.py +mlsauce/ridge/__init__.py +mlsauce/ridge/_ridge.py +mlsauce/ridge/_ridgec.c +mlsauce/ridge/setup.py +mlsauce/stump/__init__.py +mlsauce/stump/_stump_classifier.py +mlsauce/stump/_stumpc.c +mlsauce/stump/setup.py +mlsauce/tests/__init__.py +mlsauce/tests/test_adaopt.py +mlsauce/utils/__init__.py +mlsauce/utils/get_beta.py +mlsauce/utils/progress_bar.py +mlsauce/utils/memoryuse/__init__.py +mlsauce/utils/memoryuse/mem_usage.py +mlsauce/utils/misc/__init__.py +mlsauce/utils/misc/misc.py +mlsauce/utils/sampling/__init__.py +mlsauce/utils/sampling/rowsubsampling.py \ No newline at end of file diff --git a/mlsauce.egg-info/dependency_links.txt b/mlsauce.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/mlsauce.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/mlsauce.egg-info/not-zip-safe b/mlsauce.egg-info/not-zip-safe new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/mlsauce.egg-info/not-zip-safe @@ -0,0 +1 @@ + diff --git a/mlsauce.egg-info/requires.txt b/mlsauce.egg-info/requires.txt new file mode 100644 index 0000000..b3f1862 --- /dev/null +++ b/mlsauce.egg-info/requires.txt @@ -0,0 +1,14 @@ +numpy +Cython +joblib +pandas +requests +scikit-learn +scipy +tqdm +jax +jaxlib + +[alldeps] +numpy>=1.13.0 +scipy>=0.19.0 diff --git a/mlsauce.egg-info/top_level.txt b/mlsauce.egg-info/top_level.txt new file mode 100644 index 0000000..2ccd09c --- /dev/null +++ b/mlsauce.egg-info/top_level.txt @@ -0,0 +1 @@ +mlsauce diff --git a/mlsauce/adaopt/_adaopt.py b/mlsauce/adaopt/_adaopt.py index 2470d00..2d80ba6 100644 --- a/mlsauce/adaopt/_adaopt.py +++ b/mlsauce/adaopt/_adaopt.py 
@@ -7,6 +7,7 @@ from numpy.linalg import norm from tqdm import tqdm from ..utils import subsample +from ..utils import cluster try: from . import _adaoptc as adaoptc diff --git a/mlsauce/booster/_booster_classifier.py b/mlsauce/booster/_booster_classifier.py index 68ac1d0..87ea6a1 100644 --- a/mlsauce/booster/_booster_classifier.py +++ b/mlsauce/booster/_booster_classifier.py @@ -4,6 +4,7 @@ from sklearn.base import BaseEstimator from sklearn.base import ClassifierMixin from . import _boosterc as boosterc +from ..utils import cluster class LSBoostClassifier(BaseEstimator, ClassifierMixin): diff --git a/mlsauce/booster/_booster_regressor.py b/mlsauce/booster/_booster_regressor.py index a1fe6fc..05a645e 100644 --- a/mlsauce/booster/_booster_regressor.py +++ b/mlsauce/booster/_booster_regressor.py @@ -5,7 +5,7 @@ from sklearn.base import RegressorMixin from . import _boosterc as boosterc from ..predictioninterval import PredictionInterval - +from ..utils import cluster class LSBoostRegressor(BaseEstimator, RegressorMixin): """LSBoost regressor. 
diff --git a/mlsauce/utils/__init__.py b/mlsauce/utils/__init__.py index 49792bf..fea8a9a 100644 --- a/mlsauce/utils/__init__.py +++ b/mlsauce/utils/__init__.py @@ -1,9 +1,10 @@ from .sampling.rowsubsampling import subsample -from .misc.misc import merge_two_dicts, flatten, is_float, is_factor +from .misc.misc import cluster, merge_two_dicts, flatten, is_float, is_factor from .progress_bar import Progbar from .get_beta import get_beta __all__ = [ + "cluster", "subsample", "merge_two_dicts", "flatten", diff --git a/mlsauce/utils/misc/__init__.py b/mlsauce/utils/misc/__init__.py index 1ab4b57..ddd6694 100644 --- a/mlsauce/utils/misc/__init__.py +++ b/mlsauce/utils/misc/__init__.py @@ -1,4 +1,4 @@ -from .misc import merge_two_dicts, flatten, is_float, is_factor +from .misc import cluster, merge_two_dicts, flatten, is_float, is_factor -__all__ = ["merge_two_dicts", "flatten", "is_float", "is_factor"] +__all__ = ["cluster", "merge_two_dicts", "flatten", "is_float", "is_factor"] diff --git a/mlsauce/utils/misc/misc.py b/mlsauce/utils/misc/misc.py index 057bc38..b572430 100644 --- a/mlsauce/utils/misc/misc.py +++ b/mlsauce/utils/misc/misc.py @@ -2,6 +2,54 @@ # # License: BSD 3 +def cluster(X, n_clusters=None, + method="kmeans", + type_scaling = "standard", + training=True, + scaler=None, + label_encoder=None, + clusterer=None, + seed=123): + + assert method in ("kmeans", "gmm"), "method must be in ('kmeans', 'gmm')" + assert type_scaling in ("standard", "minmax", "robust"), "type_scaling must be in ('standard', 'minmax', 'robust')" + + if training: + assert n_clusters is not None, "n_clusters must be provided at training time" + if type_scaling == "standard": + scaler = StandardScaler() + elif type_scaling == "minmax": + scaler = MinMaxScaler() + elif type_scaling == "robust": + scaler = RobustScaler() + else: + raise ValueError("type_scaling must be in ('standard', 'minmax', 'robust')") + + scaled_X = scaler.fit_transform(X) + label_encoder = 
OneHotEncoder(handle_unknown='ignore') + + if method == "kmeans": + clusterer = KMeans(n_clusters=n_clusters, + random_state=seed, + n_init="auto").fit(scaled_X) + res = label_encoder.fit_transform(clusterer.labels_.reshape(-1, 1)).toarray() + elif method == "gmm": + clusterer = GaussianMixture(n_components=n_clusters, + random_state=seed).fit(scaled_X) + res = label_encoder.fit_transform(clusterer.predict(scaled_X).reshape(-1, 1)).toarray() + else: + raise ValueError("method must be in ('kmeans', 'gmm')") + + return res, scaler, label_encoder, clusterer + + else: # @ inference time + + assert scaler is not None, "scaler must be provided at inference time" + assert label_encoder is not None, "label_encoder must be provided at inference time" + assert clusterer is not None, "clusterer must be provided at inference time" + scaled_X = scaler.transform(X) + + return label_encoder.transform(clusterer.predict(scaled_X).reshape(-1, 1)).toarray() # merge two dictionaries def merge_two_dicts(x, y): diff --git a/setup.py b/setup.py index dd23fbb..1c18541 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ MAINTAINER_EMAIL = 'thierry.moudiki@gmail.com' LICENSE = 'BSD3 Clause Clear' -__version__ = '0.12.3' +__version__ = '0.13.0' VERSION = __version__ From 69504663bf8ec8fbfdf6c6051c4938220b7a7bc8 Mon Sep 17 00:00:00 2001 From: Thierry Moudiki Date: Thu, 18 Apr 2024 06:39:21 +0000 Subject: [PATCH 2/2] do v0.13.0 --- CHANGES.md | 4 + examples/adaopt_classifier.py | 102 ++++++++++++------------- examples/lsboost_classifier.py | 55 ++++++++++++- examples/lsboost_regressor.py | 29 +++++++ mlsauce.egg-info/PKG-INFO | 2 +- mlsauce/adaopt/_adaopt.py | 45 +++++++++++ mlsauce/booster/_booster_classifier.py | 44 ++++++++++- mlsauce/booster/_booster_regressor.py | 42 ++++++++++ mlsauce/utils/misc/misc.py | 6 ++ 9 files changed, 273 insertions(+), 56 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 10e3c2d..cc98111 100644 --- a/CHANGES.md +++ 
b/CHANGES.md @@ -1,3 +1,7 @@ +# version 0.13.0 + +- add clustering to `LSBoostRegressor`, `LSBoostClassifier`, and `AdaOpt` + # version 0.12.3 - add prediction intervals to `LSBoostRegressor` (split conformal prediction, diff --git a/examples/adaopt_classifier.py b/examples/adaopt_classifier.py index 73d3e9b..f16c96f 100644 --- a/examples/adaopt_classifier.py +++ b/examples/adaopt_classifier.py @@ -35,6 +35,16 @@ print(obj.score(X_test, y_test)) print(time()-start) +obj = ms.AdaOpt(n_jobs=4, type_dist="euclidean", verbose=1, + n_clusters_input=2) +#obj = ms.AdaOpt() +start = time() +obj.fit(X_train, y_train) +print(time()-start) +start = time() +print(obj.score(X_test, y_test)) +print(time()-start) + # data 2 wine = load_wine() @@ -52,6 +62,13 @@ print(obj.score(X_test, y_test)) print(time()-start) +obj = ms.AdaOpt(n_clusters_input=3) +start = time() +obj.fit(X_train, y_train) +print(time()-start) +start = time() +print(obj.score(X_test, y_test)) +print(time()-start) # data 3 iris = load_iris() @@ -70,6 +87,14 @@ print(obj.score(X_test, y_test)) print(time()-start) +obj = ms.AdaOpt(n_clusters_input=3) +start = time() +obj.fit(X_train, y_train) +print(time()-start) +start = time() +print(obj.score(X_test, y_test)) +print(time()-start) + # data 4 digits = load_digits() @@ -105,57 +130,30 @@ print(obj.score(X_test, y_test)) print(time()-start) +obj = ms.AdaOpt(n_iterations=50, + learning_rate=0.3, + reg_lambda=0.1, + reg_alpha=0.5, + eta=0.01, + gamma=0.01, + tolerance=1e-4, + row_sample=1, + k=1, n_clusters_input=5, + n_jobs=3, type_dist="euclidean", verbose=1) +start = time() +obj.fit(X_train, y_train) +print(time()-start) +start = time() +print(obj.score(X_test, y_test)) +print(time()-start) -# # data 5 - -# zip_dir = "/Users/moudiki/Documents/Papers/adaopt/data/zip" -# data_train = pd.read_csv(zip_dir + "/zip_train.csv", -# index_col=0) -# data_test = pd.read_csv(zip_dir + "/zip_test.csv", -# index_col=0) - -# y_train = data_train.y.values -# y_test = 
data_test.y.values -# X_train = np.ascontiguousarray(np.delete(data_train.values, 0, axis=1)) -# X_test = np.ascontiguousarray(np.delete(data_test.values, 0, axis=1)) - -# obj = ms.AdaOpt(type_dist="euclidean-f", -# k=1, row_sample=1) -# start = time() -# obj.fit(X_train, y_train) -# print(time()-start) -# start = time() -# print(obj.score(X_test, y_test)) -# print(time()-start) - - -# # data 6 - -# letter_dir = "/Users/moudiki/Documents/Papers/adaopt/data/letter" -# data_letter = pd.read_csv(letter_dir + "/letter_recognition.csv", -# index_col=0) - - -# y = data_letter.V1.values -# X = np.asarray(np.ascontiguousarray(np.delete(data_letter.values, 0, -# axis=1)), dtype='float64') - -# np.random.seed(1323) -# X_train, X_test, y_train, y_test = train_test_split(X, y, -# test_size=0.3) - - -# obj = ms.AdaOpt(type_dist="euclidean-f", -# k=1, row_sample=1) -# start = time() -# obj.fit(X_train, y_train) -# print(time()-start) -# start = time() -# print(obj.score(X_test, y_test)) -# print(time()-start) - -# start = time() -# preds = obj.predict(X_test) -# print(time() - start) -# print(metrics.classification_report(preds, y_test)) +# with clustering +obj = ms.AdaOpt(n_clusters=25, k=1, + n_clusters_input=3) +start = time() +obj.fit(X_train, y_train) +print(time()-start) +start = time() +print(obj.score(X_test, y_test)) +print(time()-start) diff --git a/examples/lsboost_classifier.py b/examples/lsboost_classifier.py index 0dd0e1f..dbced40 100644 --- a/examples/lsboost_classifier.py +++ b/examples/lsboost_classifier.py @@ -19,9 +19,12 @@ #ridge print("\n") -print("ridge -----") +print("lsboost ridge -----") print("\n") +print("\n") +print("breast_cancer data -----") + # data 1 breast_cancer = load_breast_cancer() X = breast_cancer.data @@ -42,6 +45,17 @@ print(obj.obj['loss']) +obj = ms.LSBoostClassifier(tolerance=1e-2, n_clusters=2) +print(obj.get_params()) +start = time() +obj.fit(X_train, y_train) +print(time()-start) +start = time() +print(obj.score(X_test, y_test)) 
+print(time()-start) + +print(obj.obj['loss']) + # MORE DATA NEEDED # MORE DATA NEEDED # MORE DATA NEEDED obj = ms.LSBoostClassifier(backend="gpu") print(obj.get_params()) @@ -55,6 +69,9 @@ print(obj.obj['loss']) # data 2 +print("\n") +print("wine data -----") + wine = load_wine() Z = wine.data t = wine.target @@ -73,6 +90,17 @@ print(obj.obj['loss']) +obj = ms.LSBoostClassifier(n_clusters=3) +print(obj.get_params()) +start = time() +obj.fit(X_train, y_train) +print(time()-start) +start = time() +print(obj.score(X_test, y_test)) +print(time()-start) + +print(obj.obj['loss']) + # MORE DATA NEEDED # MORE DATA NEEDED # MORE DATA NEEDED obj = ms.LSBoostClassifier(backend="gpu") print(obj.get_params()) @@ -86,6 +114,9 @@ print(obj.obj['loss']) # data 3 +print("\n") +print("iris data -----") + iris = load_iris() Z = iris.data t = iris.target @@ -119,10 +150,13 @@ #lasso print("\n") -print("lasso -----") +print("lsboost lasso -----") print("\n") # data 1 +print("\n") +print("breast_cancer data -----") + breast_cancer = load_breast_cancer() X = breast_cancer.data y = breast_cancer.target @@ -152,6 +186,9 @@ # print(time()-start) # data 2 +print("\n") +print("wine data -----") + wine = load_wine() Z = wine.data t = wine.target @@ -179,6 +216,9 @@ # print(time()-start) # data 3 +print("\n") +print("iris data -----") + iris = load_iris() Z = iris.data t = iris.target @@ -196,6 +236,17 @@ print(obj.score(X_test, y_test)) print(time()-start) +obj = ms.LSBoostClassifier(solver="lasso", + n_clusters=3, + clustering_method="gmm") +print(obj.get_params()) +start = time() +obj.fit(X_train, y_train) +print(time()-start) +start = time() +print(obj.score(X_test, y_test)) +print(time()-start) + # MORE DATA NEEDED # MORE DATA NEEDED # MORE DATA NEEDED # obj = ms.LSBoostClassifier(backend="gpu", solver="lasso") # print(obj.get_params()) diff --git a/examples/lsboost_regressor.py b/examples/lsboost_regressor.py index 02d357c..fbf0fd1 100644 --- a/examples/lsboost_regressor.py +++ 
b/examples/lsboost_regressor.py @@ -23,6 +23,10 @@ print("\n") # data 2 + +print("\n") +print("diabetes data -----") + diabetes = load_diabetes() X = diabetes.data y = diabetes.target @@ -42,6 +46,17 @@ print(obj.obj['loss']) +obj = ms.LSBoostRegressor(col_sample=0.9, row_sample=0.9, n_clusters=2) +print(obj.get_params()) +start = time() +obj.fit(X_train, y_train) +print(time()-start) +start = time() +print(np.sqrt(np.mean(np.square(obj.predict(X_test) - y_test)))) +print(time()-start) + +print(obj.obj['loss']) + # MORE DATA NEEDED # MORE DATA NEEDED # MORE DATA NEEDED obj = ms.LSBoostRegressor(backend="gpu") print(obj.get_params()) @@ -61,6 +76,9 @@ print("\n") # data 2 +print("\n") +print("diabetes data -----") + diabetes = load_diabetes() X = diabetes.data y = diabetes.target @@ -80,6 +98,17 @@ print(obj.obj['loss']) +obj = ms.LSBoostRegressor(solver="lasso", n_clusters=2) +print(obj.get_params()) +start = time() +obj.fit(X_train, y_train) +print(time()-start) +start = time() +print(np.sqrt(np.mean(np.square(obj.predict(X_test) - y_test)))) +print(time()-start) + +print(obj.obj['loss']) + # MORE DATA NEEDED # MORE DATA NEEDED # MORE DATA NEEDED # obj = ms.LSBoostRegressor(backend="gpu", solver="lasso") # print(obj.get_params()) diff --git a/mlsauce.egg-info/PKG-INFO b/mlsauce.egg-info/PKG-INFO index 92719d9..617cd3d 100644 --- a/mlsauce.egg-info/PKG-INFO +++ b/mlsauce.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: mlsauce -Version: 0.12.3 +Version: 0.13.0 Summary: Miscellaneous Statistical/Machine Learning tools Maintainer: T. Moudiki Maintainer-email: thierry.moudiki@gmail.com diff --git a/mlsauce/adaopt/_adaopt.py b/mlsauce/adaopt/_adaopt.py index 2d80ba6..c513788 100644 --- a/mlsauce/adaopt/_adaopt.py +++ b/mlsauce/adaopt/_adaopt.py @@ -72,6 +72,15 @@ class AdaOpt(BaseEstimator, ClassifierMixin): cache: boolean if the nearest neighbors are cached or not, for faster retrieval in subsequent calls. 
+ + n_clusters_input: int + number of clusters (a priori) for clustering the features + + clustering_method: str + clustering method: currently 'kmeans', 'gmm' + + cluster_scaling: str + scaling method for clustering: currently 'standard', 'robust', 'minmax' seed: int reproducibility seed for nodes_sim=='uniform', clustering and dropout. @@ -95,8 +104,22 @@ def __init__( n_jobs=None, verbose=0, cache=True, + n_clusters_input = 0, + clustering_method = "kmeans", + cluster_scaling = "standard", seed=123, ): + if n_clusters_input > 0: + assert clustering_method in ( + "kmeans", + "gmm", + ), "`clustering_method` must be in ('kmeans', 'gmm')" + assert cluster_scaling in ( + "standard", + "robust", + "minmax", + ), "`cluster_scaling` must be in ('standard', 'robust', 'minmax')" + assert type_dist in ( "euclidean", "manhattan", @@ -119,6 +142,10 @@ def __init__( self.n_jobs = n_jobs self.cache = cache self.verbose = verbose + self.n_clusters_input = n_clusters_input + self.clustering_method = clustering_method + self.cluster_scaling = cluster_scaling + self.scaler_, self.label_encoder_, self.clusterer_ = None, None, None self.seed = seed def fit(self, X, y, **kwargs): @@ -141,6 +168,14 @@ def fit(self, X, y, **kwargs): """ + if self.n_clusters_input > 0: + clustered_X, self.scaler_, self.label_encoder_, self.clusterer_ = cluster(X, n_clusters=self.n_clusters_input, + method=self.clustering_method, + type_scaling=self.cluster_scaling, + training=True, + seed=self.seed) + X = np.column_stack((X.copy(), clustered_X)) + if self.row_sample < 1: index_subsample = subsample( y, row_sample=self.row_sample, seed=self.seed @@ -218,6 +253,16 @@ def predict_proba(self, X, **kwargs): """ n_train, p_train = self.scaled_X_train.shape + + if self.n_clusters_input > 0: + X = np.column_stack((X.copy(), cluster( + X, training=False, + scaler=self.scaler_, + label_encoder=self.label_encoder_, + clusterer=self.clusterer_, + seed=self.seed + ))) + n_test = X.shape[0] if self.n_jobs is None: 
diff --git a/mlsauce/booster/_booster_classifier.py b/mlsauce/booster/_booster_classifier.py index 87ea6a1..53746f8 100644 --- a/mlsauce/booster/_booster_classifier.py +++ b/mlsauce/booster/_booster_classifier.py @@ -55,6 +55,15 @@ class LSBoostClassifier(BaseEstimator, ClassifierMixin): activation: str activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh' + + n_clusters: int + number of clusters for clustering the features + + clustering_method: str + clustering method: currently 'kmeans', 'gmm' + + cluster_scaling: str + scaling method for clustering: currently 'standard', 'robust', 'minmax' """ @@ -74,7 +83,21 @@ def __init__( backend="cpu", solver="ridge", activation="relu", + n_clusters = 0, + clustering_method = "kmeans", + cluster_scaling = "standard" ): + if n_clusters > 0: + assert clustering_method in ( + "kmeans", + "gmm", + ), "`clustering_method` must be in ('kmeans', 'gmm')" + assert cluster_scaling in ( + "standard", + "robust", + "minmax", + ), "`cluster_scaling` must be in ('standard', 'robust', 'minmax')" + assert backend in ( "cpu", "gpu", @@ -109,6 +132,10 @@ def __init__( self.obj = None self.solver = solver self.activation = activation + self.n_clusters = n_clusters + self.clustering_method = clustering_method + self.cluster_scaling = cluster_scaling + self.scaler_, self.label_encoder_, self.clusterer_ = None, None, None def fit(self, X, y, **kwargs): """Fit Booster (classifier) to training data (X, y) @@ -129,6 +156,14 @@ def fit(self, X, y, **kwargs): self: object. 
""" + if self.n_clusters > 0: + clustered_X, self.scaler_, self.label_encoder_, self.clusterer_ = cluster(X, n_clusters=self.n_clusters, + method=self.clustering_method, + type_scaling=self.cluster_scaling, + training=True, + seed=self.seed) + X = np.column_stack((X.copy(), clustered_X)) + self.obj = boosterc.fit_booster_classifier( np.asarray(X, order="C"), np.asarray(y, order="C"), @@ -186,7 +221,14 @@ def predict_proba(self, X, **kwargs): probability estimates for test data: {array-like} """ - + if self.n_clusters > 0: + X = np.column_stack((X.copy(), cluster( + X, training=False, + scaler=self.scaler_, + label_encoder=self.label_encoder_, + clusterer=self.clusterer_, + seed=self.seed + ))) return boosterc.predict_proba_booster_classifier( self.obj, np.asarray(X, order="C") ) diff --git a/mlsauce/booster/_booster_regressor.py b/mlsauce/booster/_booster_regressor.py index 05a645e..4e63c26 100644 --- a/mlsauce/booster/_booster_regressor.py +++ b/mlsauce/booster/_booster_regressor.py @@ -66,6 +66,14 @@ class LSBoostRegressor(BaseEstimator, RegressorMixin): Used only in `self.predict`, for `self.kernel` in ('gaussian', 'tophat') and `self.type_pi = 'kde'`. Default is `None`. 
+ n_clusters: int + number of clusters for clustering the features + + clustering_method: str + clustering method: currently 'kmeans', 'gmm' + + cluster_scaling: str + scaling method for clustering: currently 'standard', 'robust', 'minmax' """ @@ -88,7 +96,21 @@ def __init__( type_pi=None, replications=None, kernel=None, + n_clusters = 0, + clustering_method = "kmeans", + cluster_scaling = "standard" ): + if n_clusters > 0: + assert clustering_method in ( + "kmeans", + "gmm", + ), "`clustering_method` must be in ('kmeans', 'gmm')" + assert cluster_scaling in ( + "standard", + "robust", + "minmax", + ), "`cluster_scaling` must be in ('standard', 'robust', 'minmax')" + assert backend in ( "cpu", "gpu", @@ -126,6 +148,10 @@ def __init__( self.type_pi = type_pi self.replications = replications self.kernel = kernel + self.n_clusters = n_clusters + self.clustering_method = clustering_method + self.cluster_scaling = cluster_scaling + self.scaler_, self.label_encoder_, self.clusterer_ = None, None, None def fit(self, X, y, **kwargs): """Fit Booster (regressor) to training data (X, y) @@ -146,6 +172,14 @@ def fit(self, X, y, **kwargs): self: object. 
""" + if self.n_clusters > 0: + clustered_X, self.scaler_, self.label_encoder_, self.clusterer_ = cluster(X, n_clusters=self.n_clusters, + method=self.clustering_method, + type_scaling=self.cluster_scaling, + training=True, + seed=self.seed) + X = np.column_stack((X.copy(), clustered_X)) + self.obj = boosterc.fit_booster_regressor( X=np.asarray(X, order="C"), y=np.asarray(y, order="C"), @@ -197,6 +231,14 @@ def predict(self, X, level=95, method=None, **kwargs): probability estimates for test data: {array-like} """ + if self.n_clusters > 0: + X = np.column_stack((X.copy(), cluster( + X, training=False, + scaler=self.scaler_, + label_encoder=self.label_encoder_, + clusterer=self.clusterer_, + seed=self.seed + ))) if "return_pi" in kwargs: assert method in ( "splitconformal", diff --git a/mlsauce/utils/misc/misc.py b/mlsauce/utils/misc/misc.py index b572430..c753c24 100644 --- a/mlsauce/utils/misc/misc.py +++ b/mlsauce/utils/misc/misc.py @@ -1,6 +1,12 @@ # Authors: Thierry Moudiki # # License: BSD 3 +import numpy as np +from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler +from sklearn.preprocessing import OneHotEncoder +from sklearn.cluster import KMeans +from sklearn.mixture import GaussianMixture + def cluster(X, n_clusters=None, method="kmeans",