Skip to content

Commit

Permalink
Merge pull request #764 from a-szulc/master
Browse files: browse the repository at this point in the history
fix future warning from pandas in label binarizer
  • Loading branch information
pplonski authored Aug 27, 2024
2 parents 55e48d7 + 3139963 commit a5740db
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 6 deletions.
6 changes: 5 additions & 1 deletion supervised/preprocessing/label_binarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@ def __init__(self):
self._new_columns = []
self._uniq_values = None
self._old_column = None
self._old_column_dtype = None

def fit(self, X, column):
self._old_column = column
self._old_column_dtype = str(X[column].dtype)
self._uniq_values = np.unique(X[column].values)
# self._uniq_values = [str(u) for u in self._uniq_values]

Expand All @@ -34,7 +36,7 @@ def inverse_transform(self, X):
if self._old_column is None:
return X

old_col = X[self._new_columns[0]] * 0
old_col = (X[self._new_columns[0]] * 0).astype(self._old_column_dtype)

for unique_value in self._uniq_values:
new_col = f"{self._old_column}_{unique_value}"
Expand All @@ -53,6 +55,7 @@ def to_json(self):
"new_columns": list(self._new_columns),
"unique_values": self._uniq_values,
"old_column": self._old_column,
"old_column_dtype": self._old_column_dtype,
}

if (
Expand All @@ -68,6 +71,7 @@ def from_json(self, data_json):
self._new_columns = data_json.get("new_columns", None)
self._uniq_values = data_json.get("unique_values", None)
self._old_column = data_json.get("old_column", None)
self._old_column_dtype = data_json.get("old_column_dtype", None)

if (
"True" in self._uniq_values
Expand Down
1 change: 0 additions & 1 deletion tests/tests_preprocessing/test_categorical_integers.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,6 @@ def test_future_warning_pandas_inverse_transform(self):
categorical.fit(df)

df = categorical.transform(df).astype(int)
warnings.filterwarnings("error")
df = categorical.inverse_transform(df)

def test_fit_transform_inverse_transform_integers(self):
Expand Down
32 changes: 28 additions & 4 deletions tests/tests_preprocessing/test_label_binarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,18 +169,20 @@ def test_to_and_from_json_booleans(self):
# do not touch continuous attribute
self.assertTrue("col3" in df_test.columns)

def test_inverse_transform(self):
d = {"col1": ["a", "a", "c"], "col2": ["w", "e", "d"]}
def test_inverse_transform_2_unique_strings(self):
    """Check transform/inverse_transform on a string column with two unique values.

    The column holds "a", "a", "c": after ``transform`` a ``col1_c``
    indicator column must exist and sum to 1, and after
    ``inverse_transform`` that indicator column must be gone again.
    """
    frame = pd.DataFrame(data={"col1": ["a", "a", "c"]})
    binarizer = LabelBinarizer()
    binarizer.fit(frame, "col1")

    encoded = binarizer.transform(frame, "col1")
    self.assertTrue("col1_c" in encoded.columns)
    # Only the single "c" row should be flagged.
    self.assertTrue(np.sum(encoded["col1_c"]) == 1)

    restored = binarizer.inverse_transform(encoded)
    self.assertTrue("col1_c" not in restored.columns)
# check second column

def test_inverse_transform_strings(self):
d = {"col2": ["w", "e", "d"]}
df = pd.DataFrame(data=d)
lb = LabelBinarizer()
lb.fit(df, "col2")
bb = lb.transform(df, "col2")
Expand All @@ -191,6 +193,28 @@ def test_inverse_transform(self):
bb = lb.inverse_transform(bb)
self.assertTrue("col2_w" not in bb.columns)

def test_inverse_transform_booleans(self):
    """Round-trip a boolean column through transform and inverse_transform.

    After ``transform`` the ``col1_True`` indicator must be an int64
    column of 1/0 flags; after ``inverse_transform`` the indicator must be
    removed and ``col1`` must be a bool column with the original values.
    """
    frame = pd.DataFrame(data={"col1": [True, False, True, True]})
    binarizer = LabelBinarizer()
    binarizer.fit(frame, "col1")

    encoded = binarizer.transform(frame, "col1")
    self.assertTrue("col1_True" in encoded.columns)
    self.assertEqual(encoded["col1_True"].dtype, "int64")
    # Row-by-row check of the indicator values, in row order.
    for row, expected_flag in enumerate((1, 0, 1, 1)):
        self.assertEqual(encoded["col1_True"][row], expected_flag)

    restored = binarizer.inverse_transform(encoded)
    self.assertTrue("col1_True" not in restored.columns)
    # The restored column must carry the original dtype, not int/object.
    self.assertEqual(restored["col1"].dtype, "bool")
    for row, expected_value in enumerate((True, False, True, True)):
        self.assertEqual(restored["col1"][row], expected_value)


if __name__ == "__main__":
unittest.main()

0 comments on commit a5740db

Please sign in to comment.