csinva
diff --git a/‎imodels/discretization/discretizer.py
+44-28 b/‎imodels/discretization/discretizer.py
+44-28
diff --git a/‎imodels/util/data_util.py
+7-4 b/‎imodels/util/data_util.py
+7-4
diff --git a/‎imodels/util/extract.py
+36-16 b/‎imodels/util/extract.py
+36-16
diff --git a/‎setup.py
+2-2 b/‎setup.py
+2-2
@@ -100,10 +100,12 @@ def _validate_n_bins(self):
                 )
             self.n_bins = np.full(n_features, orig_bins, dtype=int)
         else:
-            n_bins = check_array(orig_bins, dtype=int, copy=True, ensure_2d=False)
+            n_bins = check_array(orig_bins, dtype=int,
+                                 copy=True, ensure_2d=False)
 
             if n_bins.ndim > 1 or n_bins.shape[0] != n_features:
-                raise ValueError("n_bins must be a scalar or array of shape (n_features,).")
+                raise ValueError(
+                    "n_bins must be a scalar or array of shape (n_features,).")
 
             bad_nbins_value = (n_bins < 2) | (n_bins != orig_bins)
 
@@ -136,12 +138,12 @@ def _validate_args(self):
 
         valid_encode = ('onehot', 'ordinal')
         if self.encode not in valid_encode:
-            raise ValueError("Valid options for 'encode' are {}. Got encode={!r} instead." \
+            raise ValueError("Valid options for 'encode' are {}. Got encode={!r} instead."
                              .format(valid_encode, self.encode))
 
         valid_strategy = ('uniform', 'quantile', 'kmeans')
         if (self.strategy not in valid_strategy):
-            raise ValueError("Valid options for 'strategy' are {}. Got strategy={!r} instead." \
+            raise ValueError("Valid options for 'strategy' are {}. Got strategy={!r} instead."
                              .format(valid_strategy, self.strategy))
 
     def _discretize_to_bins(self, x, bin_edges,
@@ -174,7 +176,8 @@ def _discretize_to_bins(self, x, bin_edges,
 
         if keep_pointwise_bins:
             # note: min and max values are used to define pointwise bins
-            pointwise_bins = np.unique(bin_edges[pd.Series(bin_edges).duplicated()])
+            pointwise_bins = np.unique(
+                bin_edges[pd.Series(bin_edges).duplicated()])
         else:
             pointwise_bins = np.array([])
 
@@ -183,7 +186,8 @@ def _discretize_to_bins(self, x, bin_edges,
         for idx, split in enumerate(unique_edges):
             if idx == (len(unique_edges) - 1):  # uppermost bin
                 if (idx == 0) & (split in pointwise_bins):
-                    indicator = x > split  # two bins total: (-inf, a], (a, inf)
+                    # two bins total: (-inf, a], (a, inf)
+                    indicator = x > split
                 else:
                     indicator = x >= split  # uppermost bin: [a, inf)
             else:
@@ -217,7 +221,8 @@ def _fit_preprocessing(self, X):
 
         # by default, discretize all numeric columns
         if len(self.dcols) == 0:
-            numeric_cols = [col for col in X.columns if is_numeric_dtype(X[col].dtype)]
+            numeric_cols = [
+                col for col in X.columns if is_numeric_dtype(X[col].dtype)]
             self.dcols_ = numeric_cols
 
         # error checking
@@ -255,7 +260,8 @@ def _transform_postprocessing(self, discretized_df, X):
             try:
                 onehot_col_names = self.onehot_.get_feature_names_out(colnames)
             except:
-                onehot_col_names = self.onehot_.get_feature_names(colnames)  # older versions of sklearn
+                onehot_col_names = self.onehot_.get_feature_names(
+                    colnames)  # older versions of sklearn
             discretized_df = self.onehot_.transform(discretized_df.astype(str))
             discretized_df = pd.DataFrame(discretized_df,
                                           columns=onehot_col_names,
@@ -353,7 +359,7 @@ def fit(self, X, y=None):
         disc_ordinal_df = pd.DataFrame(disc_ordinal_np, columns=self.dcols)
         disc_ordinal_df_str = disc_ordinal_df.astype(int).astype(str)
 
-        encoder = OneHotEncoder(drop=self.onehot_drop, sparse=False)
+        encoder = OneHotEncoder(drop=self.onehot_drop)  # , sparse=False)
         encoder.fit(disc_ordinal_df_str)
         self.encoder_ = encoder
 
@@ -382,7 +388,8 @@ def transform(self, X):
 
         # One-hot encode the ordinal DF
         disc_onehot_np = self.encoder_.transform(disc_ordinal_df_str)
-        disc_onehot = pd.DataFrame(disc_onehot_np, columns=self.encoder_.get_feature_names_out())
+        disc_onehot = pd.DataFrame(
+            disc_onehot_np, columns=self.encoder_.get_feature_names_out())
 
         # Name columns after the interval they represent (e.g. 0.1_to_0.5)
         for col, bin_edges in zip(self.dcols, self.discretizer_.bin_edges_):
@@ -525,7 +532,7 @@ def fit(self, X, y=None):
 
         # fit onehot encoded X if specified
         if self.encode == "onehot":
-            onehot = OneHotEncoder(drop=self.onehot_drop, sparse=False)
+            onehot = OneHotEncoder(drop=self.onehot_drop)  # , sparse=False)
             onehot.fit(discretized_df.astype(str))
             self.onehot_ = onehot
 
@@ -550,7 +557,8 @@ def transform(self, X):
         check_is_fitted(self)
 
         # transform using KBinsDiscretizer
-        discretized_df = self.discretizer_.transform(X[self.dcols_]).astype(int)
+        discretized_df = self.discretizer_.transform(
+            X[self.dcols_]).astype(int)
         discretized_df = pd.DataFrame(discretized_df,
                                       columns=self.dcols_,
                                       index=X.index)
@@ -669,7 +677,7 @@ def _validate_args(self):
         super()._validate_args()
         valid_backup_strategy = ('uniform', 'quantile', 'kmeans')
         if (self.backup_strategy not in valid_backup_strategy):
-            raise ValueError("Valid options for 'strategy' are {}. Got strategy={!r} instead." \
+            raise ValueError("Valid options for 'strategy' are {}. Got strategy={!r} instead."
                              .format(valid_backup_strategy, self.backup_strategy))
 
     def _get_rf_splits(self, col_names):
@@ -738,7 +746,8 @@ def _fit_rf(self, X, y=None):
             # provided rf model has not yet been trained
             if not check_is_fitted(self.rf_model):
                 if y is None:
-                    raise ValueError("Must provide y if rf_model has not been trained.")
+                    raise ValueError(
+                        "Must provide y if rf_model has not been trained.")
                 self.rf_model.fit(X, y)
 
         # get all random forest split points
@@ -785,12 +794,13 @@ def reweight_n_bins(self, X, y=None, by="nsplits"):
         if by == "nsplits":
             # each col gets at least 2 bins; remaining bins get
             # reallocated based on number of RF splits using that feature
-            n_rules = np.array([len(self.rf_splits[col]) for col in self.dcols_])
-            self.n_bins = np.round(n_rules / n_rules.sum() * \
+            n_rules = np.array([len(self.rf_splits[col])
+                               for col in self.dcols_])
+            self.n_bins = np.round(n_rules / n_rules.sum() *
                                    (total_bins - 2 * len(self.dcols_))) + 2
         else:
             valid_by = ('nsplits')
-            raise ValueError("Valid options for 'by' are {}. Got by={!r} instead." \
+            raise ValueError("Valid options for 'by' are {}. Got by={!r} instead."
                              .format(valid_by, by))
 
     def fit(self, X, y=None):
@@ -817,12 +827,12 @@ def fit(self, X, y=None):
         self._fit_rf(X=X, y=y)
 
         # features that were not used in the rf but need to be discretized
-        self.missing_rf_cols_ = list(set(self.dcols_) - \
+        self.missing_rf_cols_ = list(set(self.dcols_) -
                                      set(self.rf_splits.keys()))
         if len(self.missing_rf_cols_) > 0:
-            print("{} did not appear in random forest so were discretized via {} discretization" \
+            print("{} did not appear in random forest so were discretized via {} discretization"
                   .format(self.missing_rf_cols_, self.strategy))
-            missing_n_bins = np.array([self.n_bins[np.array(self.dcols_) == col][0] \
+            missing_n_bins = np.array([self.n_bins[np.array(self.dcols_) == col][0]
                                        for col in self.missing_rf_cols_])
 
             backup_discretizer = BasicDiscretizer(n_bins=missing_n_bins,
@@ -836,7 +846,8 @@ def fit(self, X, y=None):
 
         if self.encode == 'onehot':
             if len(self.missing_rf_cols_) > 0:
-                discretized_df = backup_discretizer.transform(X[self.missing_rf_cols_])
+                discretized_df = backup_discretizer.transform(
+                    X[self.missing_rf_cols_])
             else:
                 discretized_df = pd.DataFrame({}, index=X.index)
 
@@ -848,16 +859,19 @@ def fit(self, X, y=None):
                 if self.strategy == "quantile":
                     q_values = np.linspace(0, 1, int(b) + 1)
                     bin_edges = np.quantile(self.rf_splits[col], q_values)
-                elif strategy == "uniform":
-                    width = (max(self.rf_splits[col]) - min(self.rf_splits[col])) / b
-                    bin_edges = width * np.arange(0, b + 1) + min(self.rf_splits[col])
+                elif self.strategy == "uniform":
+                    width = (max(self.rf_splits[col]) -
+                             min(self.rf_splits[col])) / b
+                    bin_edges = width * \
+                        np.arange(0, b + 1) + min(self.rf_splits[col])
                 self.bin_edges_[col] = bin_edges
                 if self.encode == 'onehot':
-                    discretized_df[col] = self._discretize_to_bins(X[col], bin_edges)
+                    discretized_df[col] = self._discretize_to_bins(
+                        X[col], bin_edges)
 
         # fit onehot encoded X if specified
         if self.encode == "onehot":
-            onehot = OneHotEncoder(drop=self.onehot_drop, sparse=False)
+            onehot = OneHotEncoder(drop=self.onehot_drop)  # , sparse=False)
             onehot.fit(discretized_df[self.dcols_].astype(str))
             self.onehot_ = onehot
 
@@ -883,7 +897,8 @@ def transform(self, X):
 
         # transform features that did not appear in RF
         if len(self.missing_rf_cols_) > 0:
-            discretized_df = self.backup_discretizer_.transform(X[self.missing_rf_cols_])
+            discretized_df = self.backup_discretizer_.transform(
+                X[self.missing_rf_cols_])
             discretized_df = pd.DataFrame(discretized_df,
                                           columns=self.missing_rf_cols_,
                                           index=X.index)
@@ -892,7 +907,8 @@ def transform(self, X):
 
         # do discretization based on rf split thresholds
         for col in self.bin_edges_.keys():
-            discretized_df[col] = self._discretize_to_bins(X[col], self.bin_edges_[col])
+            discretized_df[col] = self._discretize_to_bins(
+                X[col], self.bin_edges_[col])
 
         # return onehot encoded data if specified and
         # join discretized columns with rest of X
 
@@ -35,7 +35,8 @@
         "dataset_name": "readmission_clean",
         "data_source": "imodels",
     },  # big, 100k points
-    "adult": {"dataset_name": "1182", "data_source": "openml"},  # big, 1e6 points
+    # big, 1e6 points
+    "adult": {"dataset_name": "1182", "data_source": "openml"},
     # CDI classification
     "csi_pecarn": {"dataset_name": "csi_pecarn_pred", "data_source": "imodels"},
     "iai_pecarn": {"dataset_name": "iai_pecarn_pred", "data_source": "imodels"},
@@ -221,7 +222,8 @@ def _split(X, y, feature_names):
         return _split(_clean_features(X), y, _clean_feat_names(feature_names))
     elif data_source == "synthetic":
         if dataset_name == "friedman1":
-            X, y = sklearn.datasets.make_friedman1(n_samples=200, n_features=10)
+            X, y = sklearn.datasets.make_friedman1(
+                n_samples=200, n_features=10)
         elif dataset_name == "friedman2":
             X, y = sklearn.datasets.make_friedman2(n_samples=200)
         elif dataset_name == "friedman3":
@@ -234,7 +236,8 @@ def _split(X, y, feature_names):
 
 
 def _download_imodels_dataset(dataset_fname, data_path: str):
-    dataset_fname = dataset_fname.split("/")[-1]  # remove anything about the path
+    dataset_fname = dataset_fname.split(
+        "/")[-1]  # remove anything about the path
     download_path = f"https://raw.githubusercontent.com/csinva/imodels-data/master/data_cleaned/{dataset_fname}"
     r = requests.get(download_path)
     if r.status_code == 404:
@@ -253,7 +256,7 @@ def encode_categories(X, features, encoder=None):
     X_cat = pd.DataFrame({f: X.loc[:, f] for f in features})
 
     if encoder is None:
-        one_hot_encoder = OneHotEncoder(sparse=False, categories="auto")
+        one_hot_encoder = OneHotEncoder(categories="auto")
         X_one_hot = pd.DataFrame(one_hot_encoder.fit_transform(X_cat))
     else:
         one_hot_encoder = encoder
 
@@ -7,16 +7,17 @@
     GradientBoostingClassifier, RandomForestClassifier
 from sklearn.tree import DecisionTreeRegressor
 from sklearn.utils.validation import check_array
-
+import inspect
 from imodels.util import rule, convert
 
 
 def extract_fpgrowth(X,
                      minsupport=0.1,
                      maxcardinality=2,
                      verbose=False) -> List[Tuple]:
-    
-    itemsets_df = mlx.fpgrowth(X, min_support=minsupport, max_len=maxcardinality)
+
+    itemsets_df = mlx.fpgrowth(
+        X, min_support=minsupport, max_len=maxcardinality)
     itemsets_indices = [tuple(s[1]) for s in itemsets_df.values]
     itemsets = [np.array(X.columns)[list(inds)] for inds in itemsets_indices]
     itemsets = list(map(tuple, itemsets))
@@ -49,13 +50,15 @@ def extract_rulefit(X, y, feature_names,
             "RuleFit only works with GradientBoostingClassifier(), GradientBoostingRegressor(), "
             "RandomForestRegressor() or RandomForestClassifier()")
 
-    ## fit tree generator
+    # fit tree generator
     if not exp_rand_tree_size:  # simply fit with constant tree size
         tree_generator.fit(X, y)
     else:  # randomise tree size as per Friedman 2005 Sec 3.3
         np.random.seed(random_state)
-        tree_sizes = np.random.exponential(scale=tree_size - 2, size=n_estimators)
-        tree_sizes = np.asarray([2 + np.floor(tree_sizes[i_]) for i_ in np.arange(len(tree_sizes))], dtype=int)
+        tree_sizes = np.random.exponential(
+            scale=tree_size - 2, size=n_estimators)
+        tree_sizes = np.asarray([2 + np.floor(tree_sizes[i_])
+                                for i_ in np.arange(len(tree_sizes))], dtype=int)
         tree_generator.set_params(warm_start=True)
         curr_est_ = 0
         for i_size in np.arange(len(tree_sizes)):
@@ -76,7 +79,7 @@ def extract_rulefit(X, y, feature_names,
 
     seen_rules = set()
     extracted_rules = []
-    for estimator in estimators_: 
+    for estimator in estimators_:
         for rule_value_pair in convert.tree_to_rules(estimator[0], np.array(feature_names), prediction_values=True):
 
             rule_obj = rule.Rule(rule_value_pair[0])
@@ -108,12 +111,21 @@ def extract_skope(X, y, feature_names,
         max_depths = [max_depths]
 
     for max_depth in max_depths:
+
+        # pass different key based on sklearn version
+        estimator = DecisionTreeRegressor(
+            max_depth=max_depth,
+            max_features=max_features,
+            min_samples_split=min_samples_split,
+
+        )
+        init_signature = inspect.signature(BaggingRegressor.__init__)
+        estimator_key = 'estimator' if 'estimator' in init_signature.parameters.keys(
+        ) else 'base_estimator'
+        kwargs = {
+            estimator_key: estimator,
+        }
         bagging_clf = BaggingRegressor(
-            estimator=DecisionTreeRegressor(
-                max_depth=max_depth,
-                max_features=max_features,
-                min_samples_split=min_samples_split
-            ),
             n_estimators=n_estimators,
             max_samples=max_samples,
             max_features=max_samples_features,
@@ -124,7 +136,8 @@ def extract_skope(X, y, feature_names,
             # warm_start=... XXX may be added to increase computation perf.
             n_jobs=n_jobs,
             random_state=random_state,
-            verbose=verbose
+            verbose=verbose,
+            **kwargs
         )
         ensembles.append(bagging_clf)
 
@@ -134,8 +147,8 @@ def extract_skope(X, y, feature_names,
         weights = sample_weight - sample_weight.min()
         contamination = float(sum(y)) / len(y)
         y_reg = (
-                pow(weights, 0.5) * 0.5 / contamination * (y > 0) -
-                pow((weights).mean(), 0.5) * (y == 0)
+            pow(weights, 0.5) * 0.5 / contamination * (y > 0) -
+            pow((weights).mean(), 0.5) * (y == 0)
         )
         y_reg = 1. / (1 + np.exp(-y_reg))  # sigmoid
 
@@ -153,10 +166,12 @@ def extract_skope(X, y, feature_names,
 
     extracted_rules = []
     for estimator, features in zip(estimators_, estimators_features_):
-        extracted_rules.append(convert.tree_to_rules(estimator, np.array(feature_names)[features]))
+        extracted_rules.append(convert.tree_to_rules(
+            estimator, np.array(feature_names)[features]))
 
     return extracted_rules, estimators_samples_, estimators_features_
 
+
 def extract_marginal_curves(clf, X, max_evals=100):
     """Uses predict_proba to compute marginal curves.
     Assumes clf is a classifier with a predict_proba method and that classifier is additive across features
@@ -193,3 +208,8 @@ def extract_marginal_curves(clf, X, max_evals=100):
         feature_vals_list.append(feature_vals)
         shape_function_vals_list.append(shape_function_vals.tolist())
     return feature_vals_list, shape_function_vals_list
+
+
+if __name__ == '__main__':
+    init_signature = inspect.signature(BaggingRegressor.__init__)
+    print('estimator' in init_signature.parameters.keys())
@@ -13,7 +13,7 @@
     'pandas',
     'requests',  # used in c4.5
     'scipy',
-    'scikit-learn',  # 0.23+ only works on py3.6+
+    'scikit-learn>=1.2.0',  # recently updates this
     'tqdm',  # used in BART
 ]
 
@@ -26,7 +26,7 @@
 
 setuptools.setup(
     name="imodels",
-    version="1.4.1",
+    version="1.4.2",
     author="Chandan Singh, Keyan Nasseri, Matthew Epland, Yan Shuo Tan, Omer Ronen, Tiffany Tang, Abhineet Agarwal, Theo Saarinen, Bin Yu, and others",
     author_email="chandan_singh@berkeley.edu",
     description="Implementations of various interpretable models",