Skip to content

Commit dfd97ba

Browse files
Merge pull request #40 from Techtonique/hist-gboost
Hist gboost
2 parents 20df6f6 + 4a530bb commit dfd97ba

12 files changed

+503
-61
lines changed

examples/hist_genboost_classifier.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
# Example script: fits mlsauce's HistGenericBoostingClassifier, with an
# ExtraTreeRegressor as base learner, on scikit-learn's breast cancer data.
# It prints the estimator parameters, the fit and scoring wall-clock times,
# the test score, and two entries of the fitted `obj` mapping.
import os
from os import chdir
from time import time

import numpy as np
from sklearn import metrics
from sklearn.datasets import load_digits, load_breast_cancer, load_wine, load_iris
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor

print(f"\n ----- Running: {os.path.basename(__file__)}... ----- \n")

print(os.path.relpath(os.path.dirname(__file__)))

# Optional: run from a fixed working directory.
# wd = "/workspace/mlsauce/mlsauce/examples"
# chdir(wd)

# mlsauce is imported only after the banner lines above have been printed.
import mlsauce as ms

# Section headers, printed one per line in this exact order.
for header_line in (
    "\n",
    "GenericBoosting Decision tree -----",
    "\n",
    "\n",
    "breast_cancer data -----",
):
    print(header_line)

# data 1
dataset = load_breast_cancer()
X, y = dataset.data, dataset.target

# Split the data into a training set and a test set. No random_state is
# passed to train_test_split, so the seed is set on NumPy's global RNG.
np.random.seed(15029)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = ExtraTreeRegressor()
clf2 = LinearRegression()  # constructed but not used below

obj = ms.HistGenericBoostingClassifier(clf)
print(obj.get_params())

# Time the fit.
fit_started = time()
obj.fit(X_train, y_train)
print(time() - fit_started)

# Time the scoring on the held-out set (the score is printed first).
score_started = time()
print(obj.score(X_test, y_test))
print(time() - score_started)

print(obj.obj['loss'])

print(obj.obj['fit_obj_i'])

examples/hist_genboost_regressor.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
# Example script: fits mlsauce's HistGenericBoostingRegressor, with an
# ExtraTreeRegressor as base learner, on scikit-learn's diabetes data.
# It prints the estimator parameters, the fit and prediction wall-clock
# times, the test RMSE, and the `loss` entry of the fitted `obj` mapping.
import os
import subprocess
import sys

print(f"\n ----- Running: {os.path.basename(__file__)}... ----- \n")

# Install matplotlib with the running interpreter's pip before it is
# imported further down.
subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib"])

import mlsauce as ms
import numpy as np
import matplotlib.pyplot as plt
from os import chdir
from time import time
from sklearn import metrics
from sklearn.datasets import load_diabetes, fetch_california_housing
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import ExtraTreeRegressor

for header_line in ("\n", "diabetes data -----"):
    print(header_line)

regr = ExtraTreeRegressor()

dataset = load_diabetes()
X, y = dataset.data, dataset.target

# Split the data into a training set and a test set. No random_state is
# passed to train_test_split, so the seed is set on NumPy's global RNG.
np.random.seed(15029)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

obj = ms.HistGenericBoostingRegressor(regr)
print(obj.get_params())

# Time the fit.
fit_started = time()
obj.fit(X_train, y_train)
print(time() - fit_started)

# Time prediction plus the root-mean-squared-error computation on the test set.
predict_started = time()
residuals = obj.predict(X_test) - y_test
rmse = np.sqrt(np.mean(np.square(residuals)))
print(rmse)
print(time() - predict_started)

print(obj.obj['loss'])
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
# Example script: runs mlsauce's LazyBoostingClassifier with hist=True on
# several scikit-learn toy datasets and prints, for each dataset, the elapsed
# wall-clock time and the resulting `models` object.
import os
from time import time

import mlsauce as ms
from sklearn.datasets import load_breast_cancer, load_iris, load_wine, load_digits
from sklearn.model_selection import train_test_split

print(f"\n ----- Running: {os.path.basename(__file__)}... ----- \n")

# Dataset loaders to iterate over. Alternatives kept for reference:
#   [load_breast_cancer, load_iris, load_wine, load_digits]
#   [load_digits]
load_models = [load_breast_cancer, load_iris, load_wine]

for load_dataset in load_models:

    dataset = load_dataset()
    X, y = dataset.data, dataset.target

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=13
    )

    clf = ms.LazyBoostingClassifier(
        verbose=0,
        ignore_warnings=True,
        # n_jobs=2,
        custom_metric=None,
        preprocess=False,
    )

    # Time the whole fit on this dataset.
    tic = time()
    models, predictions = clf.fit(X_train, X_test, y_train, y_test, hist=True)
    print(f"\nElapsed: {time() - tic} seconds\n")

    print(models)
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
# Example script: runs mlsauce's LazyBoostingRegressor on three datasets
# (diabetes, the first 1000 rows of California housing, and OpenML "boston")
# and prints the resulting `models` object for each one.
import os

import mlsauce as ms
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.datasets import fetch_california_housing
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

print(f"\n ----- Running: {os.path.basename(__file__)}... ----- \n")


def _fit_and_report(X_train, X_test, y_train, y_test, **fit_kwargs):
    """Fit a LazyBoostingRegressor on the given split and print `models`.

    Extra keyword arguments (e.g. ``hist=True``) are forwarded to ``fit``.
    ``provide_models`` is called after fitting; its result is not used.
    """
    regr = ms.LazyBoostingRegressor(
        verbose=0,
        ignore_warnings=True,
        # n_jobs=2,
        custom_metric=None,
        preprocess=True,
    )
    models, predictions = regr.fit(X_train, X_test, y_train, y_test, **fit_kwargs)
    model_dictionary = regr.provide_models(X_train, X_test, y_train, y_test)
    print(models)


# --- diabetes (fit called without the `hist` argument) ---
dataset = load_diabetes()
X, y = dataset.data, dataset.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)
_fit_and_report(X_train, X_test, y_train, y_test)

# --- California housing, first 1000 rows, hist=True ---
dataset = fetch_california_housing()
X = dataset.data[0:1000, :]
y = dataset.target[0:1000]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)
_fit_and_report(X_train, X_test, y_train, y_test, hist=True)

# --- boston, loaded from OpenML as a data frame, hist=True ---
boston = fetch_openml(name='boston', version=1, as_frame=True)

# Get the features and target
X, y = boston.data, boston.target

# Display the first few rows
print(X.head())
print(y.head())

# No random_state is passed to train_test_split, so the seed is set on
# NumPy's global RNG.
np.random.seed(1509)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Cast every part of the split to float64 before fitting.
X_train, X_test, y_train, y_test = (
    part.astype(np.float64) for part in (X_train, X_test, y_train, y_test)
)
_fit_and_report(X_train, X_test, y_train, y_test, hist=True)

mlsauce/booster/_booster_classifier.py

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from . import _boosterc as boosterc
1212
except ImportError:
1313
import _boosterc as boosterc
14-
from ..utils import cluster, check_and_install
14+
from ..utils import cluster, check_and_install, get_histo_features
1515

1616

1717
class LSBoostClassifier(BaseEstimator, ClassifierMixin):
@@ -83,6 +83,12 @@ class LSBoostClassifier(BaseEstimator, ClassifierMixin):
8383
weights_distr: str
8484
distribution of weights for constructing the model's hidden layer;
8585
currently 'uniform', 'gaussian'
86+
87+
hist: bool
88+
indicates whether histogram features are used or not (default is False)
89+
90+
bins: int or str
91+
number of bins for histogram features (same as numpy.histogram, default is 'auto')
8692
8793
Examples:
8894
@@ -307,9 +313,14 @@ def __init__(
307313
degree=None,
308314
weights_distr="uniform",
309315
base_model=None,
316+
hist=False,
317+
bins="auto",
310318
):
311319

312320
self.base_model = base_model
321+
self.hist = hist
322+
self.bins = bins
323+
self.hist_bins_ = None
313324

314325
if n_clusters > 0:
315326
assert clustering_method in (
@@ -391,6 +402,14 @@ def fit(self, X, y, **kwargs):
391402

392403
if isinstance(X, pd.DataFrame):
393404
X = X.values
405+
406+
if self.hist == True:
407+
X, self.hist_bins_ = get_histo_features(X)
408+
409+
if isinstance(y, pd.Series):
410+
y = y.values.ravel()
411+
else:
412+
y = y.ravel()
394413

395414
if self.degree is not None:
396415
assert isinstance(self.degree, int), "`degree` must be an integer"
@@ -433,7 +452,8 @@ def fit(self, X, y, **kwargs):
433452
obj=self.base_model,
434453
)
435454

436-
self.n_classes_ = len(np.unique(y)) # for compatibility with sklearn
455+
self.classes_ = np.unique(y) # for compatibility with sklearn
456+
self.n_classes_ = len(self.classes_) # for compatibility with sklearn
437457
self.n_estimators = self.obj["n_estimators"]
438458
return self
439459

@@ -476,6 +496,9 @@ def predict_proba(self, X, **kwargs):
476496
if isinstance(X, pd.DataFrame):
477497
X = X.values
478498

499+
if self.hist == True:
500+
X = get_histo_features(X, bins=self.hist_bins_)
501+
479502
if self.degree is not None:
480503
X = self.poly_.transform(X)
481504

@@ -543,7 +566,8 @@ def update(self, X, y, eta=0.9):
543566
)
544567

545568
self.obj = boosterc.update_booster(
546-
self.obj, np.asarray(X, order="C"), np.asarray(y, order="C"), eta
569+
self.obj, np.asarray(X, order="C"),
570+
np.asarray(y, order="C").ravel(), eta
547571
)
548572

549573
return self
@@ -621,6 +645,12 @@ class GenericBoostingClassifier(LSBoostClassifier):
621645
weights_distr: str
622646
distribution of weights for constructing the model's hidden layer;
623647
currently 'uniform', 'gaussian'
648+
649+
hist: bool
650+
indicates whether histogram features are used or not (default is False)
651+
652+
bins: int or str
653+
number of bins for histogram features (same as numpy.histogram, default is 'auto')
624654
625655
"""
626656

@@ -647,8 +677,14 @@ def __init__(
647677
cluster_scaling="standard",
648678
degree=None,
649679
weights_distr="uniform",
680+
hist=False,
681+
bins="auto",
650682
):
651683
self.base_model = base_model
684+
self.hist = hist
685+
self.bins = bins
686+
self.hist_bins_ = None
687+
652688
super().__init__(
653689
n_estimators=n_estimators,
654690
learning_rate=learning_rate,
@@ -671,4 +707,4 @@ def __init__(
671707
degree=degree,
672708
weights_distr=weights_distr,
673709
base_model=self.base_model,
674-
)
710+
)

0 commit comments

Comments
 (0)