# sklego/preprocessing.py -- ColumnDropper (introduced by "Feature/column dropper (#166)")
class ColumnDropper(BaseEstimator, TransformerMixin):
    """
    Allows dropping specific columns from a pandas DataFrame by name. Can be useful in a sklearn Pipeline.

    :param columns: column name ``str`` or list of column names to be dropped

    .. note::
        Raises a ``TypeError`` if input provided is not a DataFrame

        Raises a ``KeyError`` if columns provided are not in the input DataFrame

    :Example:

    >>> # Dropping a single column from a pandas DataFrame
    >>> import pandas as pd
    >>> df = pd.DataFrame({
    ...     'name': ['Swen', 'Victor', 'Alex'],
    ...     'length': [1.82, 1.85, 1.80],
    ...     'shoesize': [42, 44, 45]
    ... })
    >>> ColumnDropper(['name']).fit_transform(df)
       length  shoesize
    0    1.82        42
    1    1.85        44
    2    1.80        45

    >>> # Dropping multiple columns from a pandas DataFrame
    >>> ColumnDropper(['length', 'shoesize']).fit_transform(df)
         name
    0    Swen
    1  Victor
    2    Alex


    >>> # Dropping non-existent columns results in a KeyError
    >>> ColumnDropper(['weight']).fit_transform(df)
    Traceback (most recent call last):
        ...
    KeyError: "['weight'] column(s) not in DataFrame"

    >>> # How to use the ColumnDropper in a sklearn Pipeline
    >>> from sklearn.pipeline import Pipeline
    >>> from sklearn.preprocessing import StandardScaler
    >>> pipe = Pipeline([
    ...     ('drop', ColumnDropper(['name', 'shoesize'])),
    ...     ('scale', StandardScaler()),
    ... ])
    >>> pipe.fit_transform(df)
    array([[-0.16222142],
           [ 1.29777137],
           [-1.13554995]])
    """

    def __init__(self, columns: list):
        # A single column name (anything that is not a list) is wrapped so the
        # rest of the class can always treat ``self.columns`` as a list.
        if not isinstance(columns, list):
            columns = [columns]

        self.columns = columns

    def fit(self, X, y=None):
        """
        Checks 1) if input is a DataFrame, and 2) if column names are in this DataFrame

        :param X: ``pd.DataFrame`` from which the columns will be dropped
        :param y: ``pd.Series`` labels for X. unused for column dropping
        :returns: ``ColumnDropper`` object.
        :raises TypeError: if ``X`` is not a ``pd.DataFrame``
        :raises KeyError: if any of the configured columns is missing from ``X``
        """
        self._check_X_for_type(X)
        self._check_column_names(X)
        return self

    def transform(self, X):
        """Returns a pandas DataFrame without the specified columns

        :param X: ``pd.DataFrame`` from which the columns are dropped
        :returns: ``pd.DataFrame`` without the dropped columns; when no columns
            are configured, ``X`` itself is returned unchanged
        :raises TypeError: if ``X`` is not a ``pd.DataFrame``
        """
        # Same type contract as ``fit``: fail with the documented TypeError
        # instead of an AttributeError (or a silent pass-through) on non-DataFrames.
        self._check_X_for_type(X)
        if self.columns:
            return X.drop(self.columns, axis=1)
        return X

    def _check_column_names(self, X):
        """Check if one or more of the columns provided doesn't exist in the input DataFrame"""
        # ``dict.fromkeys`` de-duplicates while keeping the caller's order, so the
        # error message is deterministic (a plain ``set`` has arbitrary order).
        non_existent_columns = [
            column for column in dict.fromkeys(self.columns)
            if column not in X.columns
        ]
        if non_existent_columns:
            raise KeyError(f'{non_existent_columns} column(s) not in DataFrame')

    @staticmethod
    def _check_X_for_type(X):
        """Checks if input of the Dropper is a pandas DataFrame"""
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Provided variable X is not of type pandas.DataFrame")
# tests/test_preprocessing/test_columndropper.py
"""Tests for ``sklego.preprocessing.ColumnDropper``."""
import pandas as pd
from pandas.testing import assert_frame_equal
from sklearn.pipeline import make_pipeline
import pytest
from sklego.preprocessing import ColumnDropper


@pytest.fixture()
def df():
    """Small mixed-dtype frame with columns ``a`` .. ``e`` shared by all tests."""
    return pd.DataFrame({"a": [1, 2, 3, 4, 5, 6],
                         "b": [10, 9, 8, 7, 6, 5],
                         "c": ["a", "b", "a", "b", "c", "c"],
                         "d": ["b", "a", "a", "b", "a", "b"],
                         "e": [0, 1, 0, 1, 0, 1]})


def _frame_without_e():
    """Expected result of dropping column ``e`` from the ``df`` fixture."""
    return pd.DataFrame({
        "a": [1, 2, 3, 4, 5, 6],
        "b": [10, 9, 8, 7, 6, 5],
        "c": ["a", "b", "a", "b", "c", "c"],
        "d": ["b", "a", "a", "b", "a", "b"]})


def test_drop_two(df):
    result_df = ColumnDropper(['a', 'b']).fit_transform(df)
    expected_df = pd.DataFrame({
        "c": ["a", "b", "a", "b", "c", "c"],
        "d": ["b", "a", "a", "b", "a", "b"],
        "e": [0, 1, 0, 1, 0, 1]})

    assert_frame_equal(result_df, expected_df)


def test_drop_one(df):
    result_df = ColumnDropper(['e']).fit_transform(df)

    assert_frame_equal(result_df, _frame_without_e())


def test_drop_one_as_string(df):
    # A bare column name (not wrapped in a list) is part of the documented API.
    result_df = ColumnDropper('e').fit_transform(df)

    assert_frame_equal(result_df, _frame_without_e())


def test_drop_none(df):
    result_df = ColumnDropper([]).fit_transform(df)
    assert_frame_equal(result_df, df)


def test_drop_not_in_frame(df):
    with pytest.raises(KeyError):
        ColumnDropper(['f']).fit_transform(df)


def test_fit_rejects_non_dataframe(df):
    # ``fit`` must refuse anything that is not a pandas DataFrame.
    with pytest.raises(TypeError):
        ColumnDropper(['a']).fit(df.values)


def test_drop_one_in_pipeline(df):
    pipe = make_pipeline(ColumnDropper(['e']))
    result_df = pipe.fit_transform(df)

    assert_frame_equal(result_df, _frame_without_e())