diff --git a/imodels/rule_set/rule_fit.py b/imodels/rule_set/rule_fit.py index bf20e4b1..e78b52e6 100644 --- a/imodels/rule_set/rule_fit.py +++ b/imodels/rule_set/rule_fit.py @@ -113,7 +113,8 @@ def fit(self, X, y=None, feature_names=None): self.feature_names = np.array(list(self.feature_dict_.values())) extracted_rules = self._extract_rules(X, y) - self.rules_without_feature_names_, self.coef, self.intercept = self._score_rules(X, y, extracted_rules) + self.rules_without_feature_names_, self.coef, self.intercept = self._score_rules( + X, y, extracted_rules) self.rules_ = [ replace_feature_name(rule, self.feature_dict_) for rule in self.rules_without_feature_names_ ] @@ -160,7 +161,8 @@ def predict_proba(self, X): X = X.toarray() X = check_array(X) continuous_output = self._predict_continuous_output(X) - logits = np.vstack((1 - continuous_output, continuous_output)).transpose() + logits = np.vstack( + (1 - continuous_output, continuous_output)).transpose() return softmax(logits, axis=1) def transform(self, X=None, rules=None): @@ -178,9 +180,9 @@ def transform(self, X=None, rules=None): Transformed data set """ df = pd.DataFrame(X, columns=self.feature_placeholders) X_transformed = np.zeros((X.shape[0], len(rules))) for i, r in enumerate(rules): features_r_uses = [term.split(' ')[0] for term in r.split(' and ')] X_transformed[df[features_r_uses].query(r).index.values, i] = 1 return X_transformed @@ -216,7 +224,8 @@ def _get_rules(self, exclude_zero_coef=False, subregion=None): subregion = np.array(subregion) importance = sum(abs(coef) * abs([x[i] for x in self.winsorizer.trim(subregion)] - self.mean[i])) / len( subregion) - output_rules += [(self.feature_names[i], 'linear', coef, 1, importance)] + output_rules += [(self.feature_names[i], 
'linear', coef, 1, importance)] # Add rules for i in range(0, len(self.rules_)): @@ -224,13 +233,17 @@ def _get_rules(self, exclude_zero_coef=False, subregion=None): coef = self.coef[i + n_features] if subregion is None: - importance = abs(coef) * (rule.support * (1 - rule.support)) ** (1 / 2) + importance = abs(coef) * (rule.support * + (1 - rule.support)) ** (1 / 2) else: rkx = self.transform(subregion, [rule])[:, -1] - importance = sum(abs(coef) * abs(rkx - rule.support)) / len(subregion) + importance = sum( + abs(coef) * abs(rkx - rule.support)) / len(subregion) - output_rules += [(self.rules_[i].rule, 'rule', coef, rule.support, importance)] - rules = pd.DataFrame(output_rules, columns=["rule", "type", "coef", "support", "importance"]) + output_rules += [(self.rules_[i].rule, 'rule', + coef, rule.support, importance)] + rules = pd.DataFrame(output_rules, columns=[ + "rule", "type", "coef", "support", "importance"]) if exclude_zero_coef: rules = rules.ix[rules.coef != 0] return rules @@ -292,7 +305,8 @@ def _score_rules(self, X, y, rules) -> Tuple[List[Rule], List[float], float]: # no rules fit and self.include_linear == False if X_concat.shape[1] == 0: return [], [], 0 - prediction_task = 'regression' if isinstance(self, RegressorMixin) else 'classification' + prediction_task = 'regression' if isinstance( + self, RegressorMixin) else 'classification' return score_linear(X_concat, y, rules, prediction_task=prediction_task, max_rules=self.max_rules, diff --git a/setup.py b/setup.py index cd762093..192eec51 100644 --- a/setup.py +++ b/setup.py @@ -8,12 +8,12 @@ required_pypi = [ 'matplotlib', - 'mlxtend>=0.18.0', # some lower version are missing fpgrowth + 'mlxtend>=0.18.0', # some lower versions are missing fpgrowth 'numpy', - 'pandas', + 'pandas<=2.1.4', # pandas 2.2 introduced some issues with the query function 'requests', # used in c4.5 'scipy', - 'scikit-learn>=1.2.0', # recently updates this + 'scikit-learn>=1.2.0', # recently updated this 'tqdm', # used 
in BART ] diff --git a/tests/classification_continuous_inputs_test.py b/tests/classification_continuous_inputs_test.py index cec8999f..d5b6c343 100644 --- a/tests/classification_continuous_inputs_test.py +++ b/tests/classification_continuous_inputs_test.py @@ -8,18 +8,20 @@ class TestClassClassificationContinuousInputs: '''Tests simple classification for different models. Note: still doesn't test all the models! ''' - def setup(self): + def setup_method(self): np.random.seed(13) random.seed(13) self.n = 40 self.p = 2 self.X_classification_binary = np.random.randn(self.n, self.p) - + # y = x0 > 0 - self.y_classification_binary = (self.X_classification_binary[:, 0] > 0).astype(int) + self.y_classification_binary = ( + self.X_classification_binary[:, 0] > 0).astype(int) # flip labels for last few - self.y_classification_binary[-2:] = 1 - self.y_classification_binary[-2:] + self.y_classification_binary[-2:] = 1 - \ + self.y_classification_binary[-2:] def test_classification_binary(self): '''Test imodels on basic binary classification task @@ -58,7 +60,8 @@ def test_classification_binary(self): preds_proba = m.predict_proba(X) assert len(preds_proba.shape) == 2, 'preds_proba has 2 columns' assert preds_proba.shape[1] == 2, 'preds_proba has 2 columns' - assert np.max(preds_proba) < 1.1, 'preds_proba has no values over 1' + assert np.max( + preds_proba) < 1.1, 'preds_proba has no values over 1' assert (np.argmax(preds_proba, axis=1) == preds).all(), ("predict_proba and " "predict agree") @@ -70,5 +73,5 @@ def test_classification_binary(self): if __name__ == '__main__': t = TestClassClassificationContinuousInputs() - t.setup() + t.setup_method() t.test_classification_binary()