Skip to content

Commit

Permalink
Feature/column dropper (#166)
Browse files Browse the repository at this point in the history
* Implementation of the ColumnDropper class

* Added one more test for pipeline use

* Fixed the failing tests
  • Loading branch information
Kay Hoogland authored and koaning committed Jul 25, 2019
1 parent aaaf485 commit 9c677c3
Show file tree
Hide file tree
Showing 2 changed files with 153 additions and 0 deletions.
96 changes: 96 additions & 0 deletions sklego/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -513,3 +513,99 @@ def transform(self, X):
# apply the projection and remove the column we won't need
X_fair = X @ self.projection_
return np.atleast_2d(np.delete(X_fair, self.col_ids_, axis=1))


class ColumnDropper(BaseEstimator, TransformerMixin):
    """
    Allows dropping specific columns from a pandas DataFrame by name. Can be useful in a sklearn Pipeline.

    :param columns: column name ``str`` or list of column names to be dropped

    .. note::
        Raises a ``TypeError`` if input provided is not a DataFrame

        Raises a ``KeyError`` if columns provided are not in the input DataFrame

    :Example:

    >>> # Dropping a single column from a pandas DataFrame
    >>> import pandas as pd
    >>> df = pd.DataFrame({
    ...     'name': ['Swen', 'Victor', 'Alex'],
    ...     'length': [1.82, 1.85, 1.80],
    ...     'shoesize': [42, 44, 45]
    ... })
    >>> ColumnDropper(['name']).fit_transform(df)
       length  shoesize
    0    1.82        42
    1    1.85        44
    2    1.80        45

    >>> # Dropping multiple columns from a pandas DataFrame
    >>> ColumnDropper(['length', 'shoesize']).fit_transform(df)
         name
    0    Swen
    1  Victor
    2    Alex

    >>> # Dropping non-existent columns results in a KeyError
    >>> ColumnDropper(['weight']).fit_transform(df)
    Traceback (most recent call last):
        ...
    KeyError: "['weight'] column(s) not in DataFrame"

    >>> # How to use the ColumnDropper in a sklearn Pipeline
    >>> from sklearn.pipeline import Pipeline
    >>> from sklearn.preprocessing import StandardScaler
    >>> pipe = Pipeline([
    ...     ('select', ColumnDropper(['name', 'shoesize'])),
    ...     ('scale', StandardScaler()),
    ... ])
    >>> pipe.fit_transform(df)
    array([[-0.16222142],
           [ 1.29777137],
           [-1.13554995]])
    """

    def __init__(self, columns: list):
        # A single column name may be passed directly; wrap anything that is
        # not already a list so the rest of the class can assume a list.
        if not isinstance(columns, list):
            columns = [columns]

        self.columns = columns

    def fit(self, X, y=None):
        """
        Checks 1) if input is a DataFrame, and 2) if column names are in this DataFrame

        :param X: ``pd.DataFrame`` from which the columns will be dropped
        :param y: ``pd.Series`` labels for X. unused for column dropping
        :returns: ``ColumnDropper`` object.
        :raises TypeError: if ``X`` is not a ``pd.DataFrame``
        :raises KeyError: if any of the configured columns is missing from ``X``
        """
        self._check_X_for_type(X)
        self._check_column_names(X)
        return self

    def transform(self, X):
        """Returns a pandas DataFrame without the specified columns

        :param X: ``pd.DataFrame`` from which the columns are dropped
        :returns: ``pd.DataFrame`` without the dropped columns; when no columns
            are configured ``X`` itself is returned unchanged
        """
        if self.columns:
            return X.drop(self.columns, axis=1)
        return X

    def _check_column_names(self, X):
        """Check if one or more of the columns provided doesn't exist in the input DataFrame

        :param X: ``pd.DataFrame`` whose column labels are checked
        :raises KeyError: listing every configured column that ``X`` lacks, in
            the order the columns were given (duplicates reported once)
        """
        existing_columns = set(X.columns)
        # dict.fromkeys de-duplicates while keeping the user-supplied order, so
        # the error message is reproducible (a plain set has arbitrary order).
        non_existent_columns = [
            column
            for column in dict.fromkeys(self.columns)
            if column not in existing_columns
        ]
        if non_existent_columns:
            raise KeyError(f'{non_existent_columns} column(s) not in DataFrame')

    @staticmethod
    def _check_X_for_type(X):
        """Checks if input of the Dropper is of the required dtype

        :param X: object to validate
        :raises TypeError: if ``X`` is not a ``pd.DataFrame``
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Provided variable X is not of type pandas.DataFrame")
57 changes: 57 additions & 0 deletions tests/test_preprocessing/test_columndropper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import pandas as pd
from pandas.testing import assert_frame_equal
from sklearn.pipeline import make_pipeline
import pytest
from sklego.preprocessing import ColumnDropper


@pytest.fixture()
def df():
    """Six-row frame with two integer columns, two string columns and a 0/1 flag."""
    data = {
        "a": list(range(1, 7)),
        "b": list(range(10, 4, -1)),
        "c": list("ababcc"),
        "d": list("baabab"),
        "e": [0, 1] * 3,
    }
    return pd.DataFrame(data)


def test_drop_two(df):
    """Dropping 'a' and 'b' leaves 'c', 'd' and 'e' untouched."""
    remaining = {
        "c": ["a", "b", "a", "b", "c", "c"],
        "d": ["b", "a", "a", "b", "a", "b"],
        "e": [0, 1, 0, 1, 0, 1],
    }
    dropper = ColumnDropper(['a', 'b'])

    assert_frame_equal(dropper.fit_transform(df), pd.DataFrame(remaining))


def test_drop_one(df):
    """Dropping only 'e' leaves the other four columns untouched."""
    remaining = {
        "a": [1, 2, 3, 4, 5, 6],
        "b": [10, 9, 8, 7, 6, 5],
        "c": ["a", "b", "a", "b", "c", "c"],
        "d": ["b", "a", "a", "b", "a", "b"],
    }
    dropper = ColumnDropper(['e'])

    assert_frame_equal(dropper.fit_transform(df), pd.DataFrame(remaining))


def test_drop_none(df):
    """An empty column list must hand the frame back unchanged."""
    dropper = ColumnDropper([])
    untouched = dropper.fit_transform(df)
    assert_frame_equal(untouched, df)


def test_drop_not_in_frame(df):
    """Asking to drop a column the frame does not have raises KeyError."""
    dropper = ColumnDropper(['f'])
    with pytest.raises(KeyError):
        dropper.fit_transform(df)


def test_drop_one_in_pipeline(df):
    """ColumnDropper behaves the same when wrapped in a sklearn pipeline."""
    remaining = {
        "a": [1, 2, 3, 4, 5, 6],
        "b": [10, 9, 8, 7, 6, 5],
        "c": ["a", "b", "a", "b", "c", "c"],
        "d": ["b", "a", "a", "b", "a", "b"],
    }
    pipeline = make_pipeline(ColumnDropper(['e']))

    assert_frame_equal(pipeline.fit_transform(df), pd.DataFrame(remaining))

0 comments on commit 9c677c3

Please sign in to comment.