From 9c677c381518a03bf83d0325c5ba91058e7394e7 Mon Sep 17 00:00:00 2001
From: Kay Hoogland <Kay.Hoogland@klm.com>
Date: Thu, 25 Jul 2019 14:48:52 +0200
Subject: [PATCH] Feature/column dropper (#166)

* Implementation of the ColumnDropper class

* Added one more test for pipeline use

* Fixed the failing tests
---
 sklego/preprocessing.py                       | 96 +++++++++++++++++++
 .../test_preprocessing/test_columndropper.py  | 57 +++++++++++
 2 files changed, 153 insertions(+)
 create mode 100644 tests/test_preprocessing/test_columndropper.py

diff --git a/sklego/preprocessing.py b/sklego/preprocessing.py
index 3081002ba..223e78b0f 100644
--- a/sklego/preprocessing.py
+++ b/sklego/preprocessing.py
@@ -513,3 +513,99 @@ def transform(self, X):
         # apply the projection and remove the column we won't need
         X_fair = X @ self.projection_
         return np.atleast_2d(np.delete(X_fair, self.col_ids_, axis=1))
+
+
+class ColumnDropper(BaseEstimator, TransformerMixin):
+    """
+    Allows dropping specific columns from a pandas DataFrame by name. Can be useful in a sklearn Pipeline.
+
+    :param columns: column name ``str`` or list of column names to be dropped
+
+    .. note::
+        ``fit`` raises a ``TypeError`` if the input provided is not a DataFrame
+
+        ``fit`` raises a ``KeyError`` if columns provided are not in the input DataFrame
+
+    :Example:
+
+    >>> # Dropping a single column from a pandas DataFrame
+    >>> import pandas as pd
+    >>> df = pd.DataFrame({
+    ...     'name': ['Swen', 'Victor', 'Alex'],
+    ...     'length': [1.82, 1.85, 1.80],
+    ...     'shoesize': [42, 44, 45]
+    ... })
+    >>> ColumnDropper(['name']).fit_transform(df)
+       length  shoesize
+    0    1.82        42
+    1    1.85        44
+    2    1.80        45
+
+    >>> # Dropping multiple columns from a pandas DataFrame
+    >>> ColumnDropper(['length', 'shoesize']).fit_transform(df)
+         name
+    0    Swen
+    1  Victor
+    2    Alex
+
+
+    >>> # Dropping non-existent columns results in a KeyError
+    >>> ColumnDropper(['weight']).fit_transform(df)
+    Traceback (most recent call last):
+        ...
+    KeyError: "['weight'] column(s) not in DataFrame"
+
+    >>> # How to use the ColumnDropper in a sklearn Pipeline
+    >>> from sklearn.pipeline import Pipeline
+    >>> from sklearn.preprocessing import StandardScaler
+    >>> pipe = Pipeline([
+    ...     ('select', ColumnDropper(['name', 'shoesize'])),
+    ...     ('scale', StandardScaler()),
+    ... ])
+    >>> pipe.fit_transform(df)
+    array([[-0.16222142],
+           [ 1.29777137],
+           [-1.13554995]])
+    """
+
+    def __init__(self, columns: list):
+        # a single column name (or any other non-list value) is wrapped in a one-element list
+        if not isinstance(columns, list):
+            columns = [columns]
+
+        self.columns = columns
+
+    def fit(self, X, y=None):
+        """
+        Checks 1) if input is a DataFrame, and 2) if column names are in this DataFrame
+
+        :param X: ``pd.DataFrame`` whose type and column names are validated
+        :param y: labels for X. Not used by this transformer
+        :returns: ``ColumnDropper`` object (``self``).
+        """
+
+        self._check_X_for_type(X)
+        self._check_column_names(X)
+        return self
+
+    def transform(self, X):
+        """Returns a pandas DataFrame without the specified columns
+
+        :param X: ``pd.DataFrame`` from which the columns are dropped
+        :returns: ``pd.DataFrame`` without the dropped columns; ``X`` itself if no columns were specified
+        """
+        if self.columns:
+            return X.drop(self.columns, axis=1)
+        return X
+
+    def _check_column_names(self, X):
+        """Raise a ``KeyError`` if one or more of the columns provided doesn't exist in the input DataFrame"""
+        non_existent_columns = set(self.columns).difference(X.columns)
+        if len(non_existent_columns) > 0:
+            raise KeyError(f'{list(non_existent_columns)} column(s) not in DataFrame')
+
+    @staticmethod
+    def _check_X_for_type(X):
+        """Raise a ``TypeError`` if the input of the dropper is not a pandas DataFrame"""
+        if not isinstance(X, pd.DataFrame):
+            raise TypeError("Provided variable X is not of type pandas.DataFrame")
diff --git a/tests/test_preprocessing/test_columndropper.py b/tests/test_preprocessing/test_columndropper.py
new file mode 100644
index 000000000..0f885a864
--- /dev/null
+++ b/tests/test_preprocessing/test_columndropper.py
@@ -0,0 +1,57 @@
+import pandas as pd
+from pandas.testing import assert_frame_equal
+from sklearn.pipeline import make_pipeline
+import pytest
+from sklego.preprocessing import ColumnDropper
+
+
+@pytest.fixture()
+def df():  # shared input frame: six rows, int columns a, b, e and string columns c, d
+    return pd.DataFrame({"a": [1, 2, 3, 4, 5, 6],
+                         "b": [10, 9, 8, 7, 6, 5],
+                         "c": ["a", "b", "a", "b", "c", "c"],
+                         "d": ["b", "a", "a", "b", "a", "b"],
+                         "e": [0, 1, 0, 1, 0, 1]})
+
+
+def test_drop_two(df):
+    result_df = ColumnDropper(['a', 'b']).fit_transform(df)
+    expected_df = pd.DataFrame({
+        "c": ["a", "b", "a", "b", "c", "c"],
+        "d": ["b", "a", "a", "b", "a", "b"],
+        "e": [0, 1, 0, 1, 0, 1]})
+    # dropping 'a' and 'b' must leave exactly 'c', 'd' and 'e'
+    assert_frame_equal(result_df, expected_df)
+
+
+def test_drop_one(df):
+    result_df = ColumnDropper(['e']).fit_transform(df)
+    expected_df = pd.DataFrame({
+        "a": [1, 2, 3, 4, 5, 6],
+        "b": [10, 9, 8, 7, 6, 5],
+        "c": ["a", "b", "a", "b", "c", "c"],
+        "d": ["b", "a", "a", "b", "a", "b"]})
+    # dropping 'e' must leave exactly 'a', 'b', 'c' and 'd'
+    assert_frame_equal(result_df, expected_df)
+
+
+def test_drop_none(df):
+    result_df = ColumnDropper([]).fit_transform(df)  # empty list: nothing to drop
+    assert_frame_equal(result_df, df)
+
+
+def test_drop_not_in_frame(df):
+    with pytest.raises(KeyError):  # 'f' is not a column of the fixture frame
+        ColumnDropper(['f']).fit_transform(df)
+
+
+def test_drop_one_in_pipeline(df):
+    pipe = make_pipeline(ColumnDropper(['e']))  # pipeline whose only step is the dropper
+    result_df = pipe.fit_transform(df)
+    expected_df = pd.DataFrame({
+        "a": [1, 2, 3, 4, 5, 6],
+        "b": [10, 9, 8, 7, 6, 5],
+        "c": ["a", "b", "a", "b", "c", "c"],
+        "d": ["b", "a", "a", "b", "a", "b"]})
+    # dropping 'e' through the pipeline must leave exactly 'a', 'b', 'c' and 'd'
+    assert_frame_equal(result_df, expected_df)