Merge pull request #151 from csinva/cat_var_support
init categorical one-hot for FIGS
csinva authored Dec 27, 2022
2 parents 8877e77 + b84d322 commit 9d208dc
Showing 4 changed files with 82 additions and 7 deletions.
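
At a glance, this change lets the FIGS estimators accept a categorical_features argument that one-hot encodes the named columns before fitting. A minimal usage sketch, condensed from the demo added at the bottom of figs.py (the 'pet' column is illustrative):

import numpy as np
import pandas as pd
from sklearn import datasets
from imodels import FIGSRegressor

# Numeric features plus one string-valued column.
X, y = datasets.make_friedman1(100)
X = pd.DataFrame(X)
X['pet'] = np.random.choice(['cat', 'dog', 'bird', 'fish'], size=100)

# fit one-hot encodes the listed columns and caches the fitted encoder;
# predict reuses it, so the same list must be passed again.
est = FIGSRegressor(max_rules=10)
est.fit(X, y, categorical_features=['pet'])
preds = est.predict(X, categorical_features=['pet'])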
43 changes: 38 additions & 5 deletions imodels/tree/figs.py
@@ -15,6 +15,7 @@
 from imodels.tree.viz_utils import extract_sklearn_tree_from_figs
 from imodels.util.arguments import check_fit_arguments
+from imodels.util.data_util import encode_categories

 plt.rcParams['figure.dpi'] = 300

@@ -182,7 +183,14 @@ def _construct_node_with_stump(self, X, y, idxs, tree_num, sample_weight=None,
         node_split.setattrs(left_temp=node_left, right_temp=node_right, )
         return node_split

-    def fit(self, X, y=None, feature_names=None, verbose=False, sample_weight=None):
+    def _encode_categories(self, X, categorical_features):
+        encoder = None
+        if hasattr(self, "_encoder"):
+            encoder = self._encoder
+        return encode_categories(X, categorical_features, encoder)
+
+
+    def fit(self, X, y=None, feature_names=None, verbose=False, sample_weight=None, categorical_features=None):
         """
         Params
         ------
@@ -191,6 +199,8 @@ def fit(self, X, y=None, feature_names=None, verbose=False, sample_weight=None):
         Splits that would create child nodes with net zero or negative weight
         are ignored while searching for a split in each node.
         """
+        if categorical_features is not None:
+            X, self._encoder = self._encode_categories(X, categorical_features)
         X, y, feature_names = check_fit_arguments(self, X, y, feature_names)
         if sample_weight is not None:
             sample_weight = _check_sample_weight(sample_weight, X)
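
The handshake between fit and predict above works by caching: the first call to _encode_categories finds no _encoder attribute, so encode_categories fits a fresh OneHotEncoder and returns it alongside the encoded data, and fit stores it as self._encoder; predict and predict_proba later find the cached encoder and only transform. Note that callers must pass the same categorical_features to predict, since encode_categories uses that list to select the columns. A stripped-down sketch of the same pattern (the class name is illustrative, not part of the diff):

from sklearn.preprocessing import OneHotEncoder

class CachedEncoder:
    """Illustrative stand-in for the fit/predict encoder handshake in FIGS."""

    def encode(self, X_cat):
        if not hasattr(self, "_encoder"):
            # First call (fit time): learn the category set and cache the encoder.
            # The diff itself spells this OneHotEncoder(sparse=False, ...),
            # which scikit-learn 1.2 renamed to sparse_output.
            self._encoder = OneHotEncoder(sparse_output=False).fit(X_cat)
        # Later calls (predict time): reuse the cached category set.
        return self._encoder.transform(X_cat)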
@@ -375,7 +385,9 @@ def print_tree(self, X, y, feature_names=None):
             s = s.replace(f'X_{i}', feature_names[i])
         return s

-    def predict(self, X):
+    def predict(self, X, categorical_features=None):
+        if hasattr(self, "_encoder"):
+            X = self._encode_categories(X, categorical_features=categorical_features)
         X = check_array(X)
         preds = np.zeros(X.shape[0])
         for tree in self.trees_:
@@ -385,7 +397,9 @@ def predict(self, X):
         elif isinstance(self, ClassifierMixin):
             return (preds > 0.5).astype(int)

-    def predict_proba(self, X):
+    def predict_proba(self, X, categorical_features=None):
+        if hasattr(self, "_encoder"):
+            X = self._encode_categories(X, categorical_features=categorical_features)
         X = check_array(X)
         if isinstance(self, RegressorMixin):
             return NotImplemented
@@ -449,8 +463,8 @@ def plot(self, cols=2, feature_names=None, filename=None, label="all",
             except IndexError:
                 ax.axis('off')
                 continue
-
-            ax.set_title(f"Tree {i}")
+            ttl = f"Tree {i}" if n_plots > 1 else f"Tree {tree_number}"
+            ax.set_title(ttl)
         if filename is not None:
             plt.savefig(filename)
         return
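
In effect, plot now titles a single-panel figure after the requested tree_number instead of the subplot index i; multi-panel figures keep the Tree {i} titles.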
@@ -522,11 +536,30 @@ def __init__(self,
     X_cls, Y_cls = datasets.load_breast_cancer(return_X_y=True)
     X_reg, Y_reg = datasets.make_friedman1(100)

+    categories = ['cat', 'dog', 'bird', 'fish']
+    categories_2 = ['bear', 'chicken', 'cow']
+
+    X_cat = pd.DataFrame(X_reg)
+    X_cat['pet1'] = np.random.choice(categories, size=(100, 1))
+    X_cat['pet2'] = np.random.choice(categories_2, size=(100, 1))
+
+    # X_cat.columns[-1] = "pet"
+    Y_cat = Y_reg
+
+    est = FIGSRegressor(max_rules=10)
+    est.fit(X_cat, Y_cat, categorical_features=['pet1', 'pet2'])
+    est.predict(X_cat, categorical_features=['pet1', 'pet2'])
+    est.plot(tree_number=1)
+
+
+
     est = FIGSClassifier(max_rules=10)
     # est.fit(X_cls, Y_cls, sample_weight=np.arange(0, X_cls.shape[0]))
     est.fit(X_cls, Y_cls, sample_weight=[1] * X_cls.shape[0])
     est.predict(X_cls)

+
+
     est = FIGSRegressorCV()
     est.fit(X_reg, Y_reg)
     est.predict(X_reg)
6 changes: 4 additions & 2 deletions imodels/util/arguments.py
@@ -2,9 +2,9 @@
 import pandas as pd
 from sklearn.base import ClassifierMixin
 from sklearn.utils.validation import check_X_y
+from sklearn.utils.validation import _check_sample_weight
 from sklearn.utils.multiclass import check_classification_targets
-

 def check_fit_arguments(model, X, y, feature_names):
     """Process arguments for fit and predict methods.
     """
@@ -25,4 +25,6 @@ def check_fit_arguments(model, X, y, feature_names):
     y = y.astype(float)
     return X, y, model.feature_names_
     # if sample_weight is not None:
-    #     sample_weight = _check_sample_weight(sample_weight, X)
+    #     sample_weight = _check_sample_weight(sample_weight, X)
+
+
19 changes: 19 additions & 0 deletions imodels/util/data_util.py
@@ -8,6 +8,7 @@
 import sklearn.datasets
 from scipy.sparse import issparse
 from sklearn.datasets import fetch_openml
+from sklearn.preprocessing import OneHotEncoder

 from ..util.tree_interaction_utils import make_rj, make_vp

@@ -156,3 +157,21 @@ def _download_imodels_dataset(dataset_fname, data_path: str):
     os.makedirs(oj(data_path, 'imodels_data'), exist_ok=True)
     with open(oj(data_path, 'imodels_data', dataset_fname), 'w') as f:
         f.write(r.text)
+
+
+def encode_categories(X, features, encoder=None):
+    columns_to_keep = list(set(X.columns).difference(features))
+    X_encoded = X.loc[:, columns_to_keep]
+    X_cat = pd.DataFrame({f: X.loc[:, f] for f in features})
+
+    if encoder is None:
+        one_hot_encoder = OneHotEncoder(sparse=False, categories="auto")
+        X_one_hot = pd.DataFrame(one_hot_encoder.fit_transform(X_cat))
+    else:
+        one_hot_encoder = encoder
+        X_one_hot = pd.DataFrame(one_hot_encoder.transform(X_cat))
+    X_one_hot.columns = one_hot_encoder.get_feature_names_out(features)
+    X_encoded = pd.concat([X_encoded, X_one_hot], axis=1)
+    if encoder is not None:
+        return X_encoded
+    return X_encoded, one_hot_encoder
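
For intuition, a small worked example of the round trip through encode_categories (the toy frame is illustrative, and this assumes a scikit-learn version contemporary with the commit, since the sparse= keyword was later removed in favor of sparse_output=):

import pandas as pd
from imodels.util.data_util import encode_categories

X = pd.DataFrame({'age': [3., 7., 5.], 'pet': ['cat', 'dog', 'cat']})

# First call: no encoder yet, so the fitted encoder comes back too.
X_enc, enc = encode_categories(X, ['pet'])
print(list(X_enc.columns))  # ['age', 'pet_cat', 'pet_dog']

# Later calls: passing the encoder back reuses its category set and
# returns only the encoded frame (this is what predict relies on).
X_enc2 = encode_categories(X, ['pet'], enc)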
21 changes: 21 additions & 0 deletions tests/figs_test.py
@@ -3,6 +3,7 @@
 from functools import partial

 import numpy as np
+import pandas as pd
 from sklearn.tree import DecisionTreeRegressor

 from imodels import FIGSClassifier, FIGSRegressor, FIGSClassifierCV, FIGSRegressorCV
@@ -36,6 +37,25 @@ def test_recognized_by_sklearn(self):
                               verbose=2)
         comb_model.fit(self.X, self.y_reg)

+    def test_categorical(self):
+        """Test FIGS with categorical data"""
+        categories = ['cat', 'dog', 'bird', 'fish']
+        categories_2 = ['bear', 'chicken', 'cow']
+
+        self.X_cat = pd.DataFrame(self.X)
+        self.X_cat['pet1'] = np.random.choice(categories, size=(self.n, 1))
+        self.X_cat['pet2'] = np.random.choice(categories_2, size=(self.n, 1))
+
+        figs_reg = FIGSRegressor()
+        figs_cls = FIGSClassifier()
+
+        figs_reg.fit(self.X_cat, self.y_reg, categorical_features=['pet1', 'pet2'])
+        figs_reg.predict(self.X_cat, categorical_features=['pet1', 'pet2'])
+
+        figs_cls.fit(self.X_cat, self.y_reg, categorical_features=['pet1', 'pet2'])
+        figs_cls.predict_proba(self.X_cat, categorical_features=['pet1', 'pet2'])
+
+
     def test_fitting(self):
         '''Test on a real (small) dataset
         '''
@@ -87,3 +107,4 @@ def test_fitting(self):
     t = TestFIGS()
     t.setup()
     t.test_fitting()
+    t.test_categorical()
