@@ -100,10 +100,12 @@ def _validate_n_bins(self):
100
100
)
101
101
self .n_bins = np .full (n_features , orig_bins , dtype = int )
102
102
else :
103
- n_bins = check_array (orig_bins , dtype = int , copy = True , ensure_2d = False )
103
+ n_bins = check_array (orig_bins , dtype = int ,
104
+ copy = True , ensure_2d = False )
104
105
105
106
if n_bins .ndim > 1 or n_bins .shape [0 ] != n_features :
106
- raise ValueError ("n_bins must be a scalar or array of shape (n_features,)." )
107
+ raise ValueError (
108
+ "n_bins must be a scalar or array of shape (n_features,)." )
107
109
108
110
bad_nbins_value = (n_bins < 2 ) | (n_bins != orig_bins )
109
111
@@ -136,12 +138,12 @@ def _validate_args(self):
136
138
137
139
valid_encode = ('onehot' , 'ordinal' )
138
140
if self .encode not in valid_encode :
139
- raise ValueError ("Valid options for 'encode' are {}. Got encode={!r} instead." \
141
+ raise ValueError ("Valid options for 'encode' are {}. Got encode={!r} instead."
140
142
.format (valid_encode , self .encode ))
141
143
142
144
valid_strategy = ('uniform' , 'quantile' , 'kmeans' )
143
145
if (self .strategy not in valid_strategy ):
144
- raise ValueError ("Valid options for 'strategy' are {}. Got strategy={!r} instead." \
146
+ raise ValueError ("Valid options for 'strategy' are {}. Got strategy={!r} instead."
145
147
.format (valid_strategy , self .strategy ))
146
148
147
149
def _discretize_to_bins (self , x , bin_edges ,
@@ -174,7 +176,8 @@ def _discretize_to_bins(self, x, bin_edges,
174
176
175
177
if keep_pointwise_bins :
176
178
# note: min and max values are used to define pointwise bins
177
- pointwise_bins = np .unique (bin_edges [pd .Series (bin_edges ).duplicated ()])
179
+ pointwise_bins = np .unique (
180
+ bin_edges [pd .Series (bin_edges ).duplicated ()])
178
181
else :
179
182
pointwise_bins = np .array ([])
180
183
@@ -183,7 +186,8 @@ def _discretize_to_bins(self, x, bin_edges,
183
186
for idx , split in enumerate (unique_edges ):
184
187
if idx == (len (unique_edges ) - 1 ): # uppermost bin
185
188
if (idx == 0 ) & (split in pointwise_bins ):
186
- indicator = x > split # two bins total: (-inf, a], (a, inf)
189
+ # two bins total: (-inf, a], (a, inf)
190
+ indicator = x > split
187
191
else :
188
192
indicator = x >= split # uppermost bin: [a, inf)
189
193
else :
@@ -217,7 +221,8 @@ def _fit_preprocessing(self, X):
217
221
218
222
# by default, discretize all numeric columns
219
223
if len (self .dcols ) == 0 :
220
- numeric_cols = [col for col in X .columns if is_numeric_dtype (X [col ].dtype )]
224
+ numeric_cols = [
225
+ col for col in X .columns if is_numeric_dtype (X [col ].dtype )]
221
226
self .dcols_ = numeric_cols
222
227
223
228
# error checking
@@ -255,7 +260,8 @@ def _transform_postprocessing(self, discretized_df, X):
255
260
try :
256
261
onehot_col_names = self .onehot_ .get_feature_names_out (colnames )
257
262
except :
258
- onehot_col_names = self .onehot_ .get_feature_names (colnames ) # older versions of sklearn
263
+ onehot_col_names = self .onehot_ .get_feature_names (
264
+ colnames ) # older versions of sklearn
259
265
discretized_df = self .onehot_ .transform (discretized_df .astype (str ))
260
266
discretized_df = pd .DataFrame (discretized_df ,
261
267
columns = onehot_col_names ,
@@ -353,7 +359,7 @@ def fit(self, X, y=None):
353
359
disc_ordinal_df = pd .DataFrame (disc_ordinal_np , columns = self .dcols )
354
360
disc_ordinal_df_str = disc_ordinal_df .astype (int ).astype (str )
355
361
356
- encoder = OneHotEncoder (drop = self .onehot_drop , sparse = False )
362
+ encoder = OneHotEncoder (drop = self .onehot_drop ) # , sparse=False)
357
363
encoder .fit (disc_ordinal_df_str )
358
364
self .encoder_ = encoder
359
365
@@ -382,7 +388,8 @@ def transform(self, X):
382
388
383
389
# One-hot encode the ordinal DF
384
390
disc_onehot_np = self .encoder_ .transform (disc_ordinal_df_str )
385
- disc_onehot = pd .DataFrame (disc_onehot_np , columns = self .encoder_ .get_feature_names_out ())
391
+ disc_onehot = pd .DataFrame (
392
+ disc_onehot_np , columns = self .encoder_ .get_feature_names_out ())
386
393
387
394
# Name columns after the interval they represent (e.g. 0.1_to_0.5)
388
395
for col , bin_edges in zip (self .dcols , self .discretizer_ .bin_edges_ ):
@@ -525,7 +532,7 @@ def fit(self, X, y=None):
525
532
526
533
# fit onehot encoded X if specified
527
534
if self .encode == "onehot" :
528
- onehot = OneHotEncoder (drop = self .onehot_drop , sparse = False )
535
+ onehot = OneHotEncoder (drop = self .onehot_drop ) # , sparse=False)
529
536
onehot .fit (discretized_df .astype (str ))
530
537
self .onehot_ = onehot
531
538
@@ -550,7 +557,8 @@ def transform(self, X):
550
557
check_is_fitted (self )
551
558
552
559
# transform using KBinsDiscretizer
553
- discretized_df = self .discretizer_ .transform (X [self .dcols_ ]).astype (int )
560
+ discretized_df = self .discretizer_ .transform (
561
+ X [self .dcols_ ]).astype (int )
554
562
discretized_df = pd .DataFrame (discretized_df ,
555
563
columns = self .dcols_ ,
556
564
index = X .index )
@@ -669,7 +677,7 @@ def _validate_args(self):
669
677
super ()._validate_args ()
670
678
valid_backup_strategy = ('uniform' , 'quantile' , 'kmeans' )
671
679
if (self .backup_strategy not in valid_backup_strategy ):
672
- raise ValueError ("Valid options for 'strategy' are {}. Got strategy={!r} instead." \
680
+ raise ValueError ("Valid options for 'strategy' are {}. Got strategy={!r} instead."
673
681
.format (valid_backup_strategy , self .backup_strategy ))
674
682
675
683
def _get_rf_splits (self , col_names ):
@@ -738,7 +746,8 @@ def _fit_rf(self, X, y=None):
738
746
# provided rf model has not yet been trained
739
747
if not check_is_fitted (self .rf_model ):
740
748
if y is None :
741
- raise ValueError ("Must provide y if rf_model has not been trained." )
749
+ raise ValueError (
750
+ "Must provide y if rf_model has not been trained." )
742
751
self .rf_model .fit (X , y )
743
752
744
753
# get all random forest split points
@@ -785,12 +794,13 @@ def reweight_n_bins(self, X, y=None, by="nsplits"):
785
794
if by == "nsplits" :
786
795
# each col gets at least 2 bins; remaining bins get
787
796
# reallocated based on number of RF splits using that feature
788
- n_rules = np .array ([len (self .rf_splits [col ]) for col in self .dcols_ ])
789
- self .n_bins = np .round (n_rules / n_rules .sum () * \
797
+ n_rules = np .array ([len (self .rf_splits [col ])
798
+ for col in self .dcols_ ])
799
+ self .n_bins = np .round (n_rules / n_rules .sum () *
790
800
(total_bins - 2 * len (self .dcols_ ))) + 2
791
801
else :
792
802
valid_by = ('nsplits' )
793
- raise ValueError ("Valid options for 'by' are {}. Got by={!r} instead." \
803
+ raise ValueError ("Valid options for 'by' are {}. Got by={!r} instead."
794
804
.format (valid_by , by ))
795
805
796
806
def fit (self , X , y = None ):
@@ -817,12 +827,12 @@ def fit(self, X, y=None):
817
827
self ._fit_rf (X = X , y = y )
818
828
819
829
# features that were not used in the rf but need to be discretized
820
- self .missing_rf_cols_ = list (set (self .dcols_ ) - \
830
+ self .missing_rf_cols_ = list (set (self .dcols_ ) -
821
831
set (self .rf_splits .keys ()))
822
832
if len (self .missing_rf_cols_ ) > 0 :
823
- print ("{} did not appear in random forest so were discretized via {} discretization" \
833
+ print ("{} did not appear in random forest so were discretized via {} discretization"
824
834
.format (self .missing_rf_cols_ , self .strategy ))
825
- missing_n_bins = np .array ([self .n_bins [np .array (self .dcols_ ) == col ][0 ] \
835
+ missing_n_bins = np .array ([self .n_bins [np .array (self .dcols_ ) == col ][0 ]
826
836
for col in self .missing_rf_cols_ ])
827
837
828
838
backup_discretizer = BasicDiscretizer (n_bins = missing_n_bins ,
@@ -836,7 +846,8 @@ def fit(self, X, y=None):
836
846
837
847
if self .encode == 'onehot' :
838
848
if len (self .missing_rf_cols_ ) > 0 :
839
- discretized_df = backup_discretizer .transform (X [self .missing_rf_cols_ ])
849
+ discretized_df = backup_discretizer .transform (
850
+ X [self .missing_rf_cols_ ])
840
851
else :
841
852
discretized_df = pd .DataFrame ({}, index = X .index )
842
853
@@ -848,16 +859,19 @@ def fit(self, X, y=None):
848
859
if self .strategy == "quantile" :
849
860
q_values = np .linspace (0 , 1 , int (b ) + 1 )
850
861
bin_edges = np .quantile (self .rf_splits [col ], q_values )
851
- elif strategy == "uniform" :
852
- width = (max (self .rf_splits [col ]) - min (self .rf_splits [col ])) / b
853
- bin_edges = width * np .arange (0 , b + 1 ) + min (self .rf_splits [col ])
862
+ elif self .strategy == "uniform" :
863
+ width = (max (self .rf_splits [col ]) -
864
+ min (self .rf_splits [col ])) / b
865
+ bin_edges = width * \
866
+ np .arange (0 , b + 1 ) + min (self .rf_splits [col ])
854
867
self .bin_edges_ [col ] = bin_edges
855
868
if self .encode == 'onehot' :
856
- discretized_df [col ] = self ._discretize_to_bins (X [col ], bin_edges )
869
+ discretized_df [col ] = self ._discretize_to_bins (
870
+ X [col ], bin_edges )
857
871
858
872
# fit onehot encoded X if specified
859
873
if self .encode == "onehot" :
860
- onehot = OneHotEncoder (drop = self .onehot_drop , sparse = False )
874
+ onehot = OneHotEncoder (drop = self .onehot_drop ) # , sparse=False)
861
875
onehot .fit (discretized_df [self .dcols_ ].astype (str ))
862
876
self .onehot_ = onehot
863
877
@@ -883,7 +897,8 @@ def transform(self, X):
883
897
884
898
# transform features that did not appear in RF
885
899
if len (self .missing_rf_cols_ ) > 0 :
886
- discretized_df = self .backup_discretizer_ .transform (X [self .missing_rf_cols_ ])
900
+ discretized_df = self .backup_discretizer_ .transform (
901
+ X [self .missing_rf_cols_ ])
887
902
discretized_df = pd .DataFrame (discretized_df ,
888
903
columns = self .missing_rf_cols_ ,
889
904
index = X .index )
@@ -892,7 +907,8 @@ def transform(self, X):
892
907
893
908
# do discretization based on rf split thresholds
894
909
for col in self .bin_edges_ .keys ():
895
- discretized_df [col ] = self ._discretize_to_bins (X [col ], self .bin_edges_ [col ])
910
+ discretized_df [col ] = self ._discretize_to_bins (
911
+ X [col ], self .bin_edges_ [col ])
896
912
897
913
# return onehot encoded data if specified and
898
914
# join discretized columns with rest of X
0 commit comments