From b84d322d5a1a8f39e0a2e9b8218b4a55774b3ccf Mon Sep 17 00:00:00 2001
From: Omer Ronen
Date: Tue, 27 Dec 2022 11:54:27 -0800
Subject: [PATCH] init impl

---
 imodels/tree/figs.py      | 40 +++++++++++++++++++++++++++++------
 imodels/util/arguments.py |  4 ++--
 imodels/util/data_util.py | 28 ++++++++++++++++++++++++
 tests/figs_test.py        | 19 +++++++++++++++++
 4 files changed, 84 insertions(+), 7 deletions(-)

diff --git a/imodels/tree/figs.py b/imodels/tree/figs.py
index fdd0afc5..34798f9a 100644
--- a/imodels/tree/figs.py
+++ b/imodels/tree/figs.py
@@ -15,6 +15,7 @@
 
 from imodels.tree.viz_utils import extract_sklearn_tree_from_figs
 from imodels.util.arguments import check_fit_arguments
+from imodels.util.data_util import encode_categories
 
 plt.rcParams['figure.dpi'] = 300
 
@@ -182,7 +183,14 @@ def _construct_node_with_stump(self, X, y, idxs, tree_num, sample_weight=None,
         node_split.setattrs(left_temp=node_left, right_temp=node_right, )
         return node_split
 
-    def fit(self, X, y=None, feature_names=None, verbose=False, sample_weight=None):
+    def _encode_categories(self, X, categorical_features):
+        # reuse the one-hot encoder fitted in fit(), if one exists
+        encoder = None
+        if hasattr(self, "_encoder"):
+            encoder = self._encoder
+        return encode_categories(X, categorical_features, encoder)
+
+    def fit(self, X, y=None, feature_names=None, verbose=False, sample_weight=None, categorical_features=None):
         """
         Params
         ------
@@ -191,6 +199,9 @@ def fit(self, X, y=None, feature_names=None, verbose=False, sample_weight=None):
             Splits that would create child nodes with net zero or negative weight are ignored while searching for a split in each node.
         """
+        if categorical_features is not None:
+            # fit a fresh encoder on every call; predict() reuses it via _encode_categories
+            X, self._encoder = encode_categories(X, categorical_features)
         X, y, feature_names = check_fit_arguments(self, X, y, feature_names)
         if sample_weight is not None:
             sample_weight = _check_sample_weight(sample_weight, X)
@@ -375,7 +386,9 @@ def print_tree(self, X, y, feature_names=None):
             s = s.replace(f'X_{i}', feature_names[i])
         return s
 
-    def predict(self, X):
+    def predict(self, X, categorical_features=None):
+        if hasattr(self, "_encoder"):
+            X = self._encode_categories(X, categorical_features=categorical_features)
         X = check_array(X)
         preds = np.zeros(X.shape[0])
         for tree in self.trees_:
@@ -385,7 +398,9 @@ def predict(self, X):
         elif isinstance(self, ClassifierMixin):
             return (preds > 0.5).astype(int)
 
-    def predict_proba(self, X):
+    def predict_proba(self, X, categorical_features=None):
+        if hasattr(self, "_encoder"):
+            X = self._encode_categories(X, categorical_features=categorical_features)
         X = check_array(X)
         if isinstance(self, RegressorMixin):
             return NotImplemented
@@ -449,8 +464,8 @@ def plot(self, cols=2, feature_names=None, filename=None, label="all",
             except IndexError:
                 ax.axis('off')
                 continue
-
-        ax.set_title(f"Tree {i}")
+        ttl = f"Tree {i}" if n_plots > 1 else f"Tree {tree_number}"
+        ax.set_title(ttl)
         if filename is not None:
             plt.savefig(filename)
         return
@@ -522,11 +537,26 @@ def __init__(self,
     X_cls, Y_cls = datasets.load_breast_cancer(return_X_y=True)
     X_reg, Y_reg = datasets.make_friedman1(100)
 
+    categories = ['cat', 'dog', 'bird', 'fish']
+    categories_2 = ['bear', 'chicken', 'cow']
+
+    X_cat = pd.DataFrame(X_reg)
+    X_cat['pet1'] = np.random.choice(categories, size=100)
+    X_cat['pet2'] = np.random.choice(categories_2, size=100)
+    Y_cat = Y_reg
+
+    est = FIGSRegressor(max_rules=10)
+    est.fit(X_cat, Y_cat, categorical_features=['pet1', 'pet2'])
+    est.predict(X_cat, categorical_features=['pet1', 'pet2'])
+    est.plot(tree_number=1)
+
     est = FIGSClassifier(max_rules=10)
     # est.fit(X_cls, Y_cls, sample_weight=np.arange(0, X_cls.shape[0]))
     est.fit(X_cls, Y_cls, sample_weight=[1] * X_cls.shape[0])
     est.predict(X_cls)
+
+
     est = FIGSRegressorCV()
     est.fit(X_reg, Y_reg)
     est.predict(X_reg)
 
diff --git a/imodels/util/arguments.py b/imodels/util/arguments.py
index 5261c493..022e4132 100644
--- a/imodels/util/arguments.py
+++ b/imodels/util/arguments.py
@@ -2,9 +2,9 @@
 import pandas as pd
 from sklearn.base import ClassifierMixin
 from sklearn.utils.validation import check_X_y
-from sklearn.utils.validation import _check_sample_weight
 from sklearn.utils.multiclass import check_classification_targets
+
 
 def check_fit_arguments(model, X, y, feature_names):
     """Process arguments for fit and predict methods.
     """
@@ -25,4 +25,4 @@ def check_fit_arguments(model, X, y, feature_names):
         y = y.astype(float)
     return X, y, model.feature_names_
 # if sample_weight is not None:
-#     sample_weight = _check_sample_weight(sample_weight, X)
\ No newline at end of file
+#     sample_weight = _check_sample_weight(sample_weight, X)
diff --git a/imodels/util/data_util.py b/imodels/util/data_util.py
index 0d8ec120..07eecc84 100644
--- a/imodels/util/data_util.py
+++ b/imodels/util/data_util.py
@@ -8,6 +8,7 @@
 import sklearn.datasets
 from scipy.sparse import issparse
 from sklearn.datasets import fetch_openml
+from sklearn.preprocessing import OneHotEncoder
 
 from ..util.tree_interaction_utils import make_rj, make_vp
 
@@ -156,3 +157,30 @@ def _download_imodels_dataset(dataset_fname, data_path: str):
     os.makedirs(oj(data_path, 'imodels_data'), exist_ok=True)
     with open(oj(data_path, 'imodels_data', dataset_fname), 'w') as f:
         f.write(r.text)
+
+
+def encode_categories(X, features, encoder=None):
+    """One-hot encode the `features` columns of X, keeping all other columns as-is.
+
+    When `encoder` is None, fits a new encoder and returns (X_encoded, encoder);
+    otherwise reuses the given encoder and returns only X_encoded.
+    """
+    # preserve the original column order (a set difference would make it
+    # nondeterministic across runs)
+    columns_to_keep = [c for c in X.columns if c not in features]
+    X_encoded = X.loc[:, columns_to_keep]
+    X_cat = pd.DataFrame({f: X.loc[:, f] for f in features})
+
+    if encoder is None:
+        one_hot_encoder = OneHotEncoder(sparse=False, categories="auto")
+        X_one_hot = pd.DataFrame(one_hot_encoder.fit_transform(X_cat))
+    else:
+        one_hot_encoder = encoder
+        X_one_hot = pd.DataFrame(one_hot_encoder.transform(X_cat))
+    X_one_hot.columns = one_hot_encoder.get_feature_names_out(features)
+    # align indices so concat doesn't introduce NaN rows for non-default indexes
+    X_one_hot.index = X_encoded.index
+    X_encoded = pd.concat([X_encoded, X_one_hot], axis=1)
+    if encoder is not None:
+        return X_encoded
+    return X_encoded, one_hot_encoder
diff --git a/tests/figs_test.py b/tests/figs_test.py
index 227574ff..2a135e72 100644
--- a/tests/figs_test.py
+++ b/tests/figs_test.py
@@ -3,6 +3,7 @@
 from functools import partial
 
 import numpy as np
+import pandas as pd
 from sklearn.tree import DecisionTreeRegressor
 
 from imodels import FIGSClassifier, FIGSRegressor, FIGSClassifierCV, FIGSRegressorCV
@@ -36,6 +37,24 @@ def test_recognized_by_sklearn(self):
                                verbose=2)
         comb_model.fit(self.X, self.y_reg)
 
+    def test_categorical(self):
+        """Test FIGS with categorical data"""
+        categories = ['cat', 'dog', 'bird', 'fish']
+        categories_2 = ['bear', 'chicken', 'cow']
+
+        self.X_cat = pd.DataFrame(self.X)
+        self.X_cat['pet1'] = np.random.choice(categories, size=self.n)
+        self.X_cat['pet2'] = np.random.choice(categories_2, size=self.n)
+
+        figs_reg = FIGSRegressor()
+        figs_cls = FIGSClassifier()
+
+        figs_reg.fit(self.X_cat, self.y_reg, categorical_features=['pet1', 'pet2'])
+        figs_reg.predict(self.X_cat, categorical_features=['pet1', 'pet2'])
+
+        figs_cls.fit(self.X_cat, self.y_reg, categorical_features=['pet1', 'pet2'])
+        figs_cls.predict_proba(self.X_cat, categorical_features=['pet1', 'pet2'])
+
     def test_fitting(self):
        '''Test on a real (small) dataset
        '''
@@ -87,3 +106,4 @@ def test_fitting(self):
     t = TestFIGS()
     t.setup()
     t.test_fitting()
+    t.test_categorical()
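
Usage sketch (not part of the patch): a minimal end-to-end example of the
categorical_features API added above, assuming a checkout with this patch
applied. The 'pet1'/'pet2' column names are illustrative and mirror the demo
in figs.py's __main__ block.

    import numpy as np
    import pandas as pd
    from sklearn import datasets
    from imodels import FIGSRegressor

    # Friedman #1 regression data plus two string-valued columns
    X_reg, y_reg = datasets.make_friedman1(100)
    X = pd.DataFrame(X_reg)
    X['pet1'] = np.random.choice(['cat', 'dog', 'bird', 'fish'], size=100)
    X['pet2'] = np.random.choice(['bear', 'chicken', 'cow'], size=100)

    model = FIGSRegressor(max_rules=10)
    # fit() one-hot encodes the listed columns and stores the fitted encoder
    # on the model; predict() must be given the same column names so the
    # stored encoder can be reused on the new data.
    model.fit(X, y_reg, categorical_features=['pet1', 'pet2'])
    preds = model.predict(X, categorical_features=['pet1', 'pet2'])
    print(preds[:5])

Note the design choice this relies on: the encoder is fitted once in fit()
and reused at predict time, so unseen categories at predict time will raise
inside OneHotEncoder.transform rather than being silently dropped.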