Skip to content

Commit

Permalink
New preprocessing method: Massaging (#151)
Browse files Browse the repository at this point in the history
* Added massaging method

* Added documentation

* Added option for user to define used classifier

* Replaced classifier instantiation with util function
  • Loading branch information
reluzita authored Feb 8, 2024
1 parent 7ed7cc4 commit af9742a
Showing 1 changed file with 108 additions and 0 deletions.
108 changes: 108 additions & 0 deletions src/aequitas/flow/methods/preprocessing/massaging.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
from typing import Optional, Union, Callable

import pandas as pd
import math

from ...utils import create_logger
from ...utils.imports import instantiate_object
from .preprocessing import PreProcessing


class Massaging(PreProcessing):
    """Preprocessing method that flips labels to reduce prevalence disparity.

    A classifier is fitted on the features (plus the protected attribute) and
    its scores are used to rank the candidates whose labels are flipped.

    Attributes set by ``fit``:
        pr  -- index labels of promotion candidates (label 0, group below the
               overall prevalence), ordered by descending score.
        dem -- index labels of demotion candidates (label 1, group above the
               overall prevalence), ordered by ascending score.
        m   -- number of labels flipped in each direction by ``transform``.
    """

    def __init__(
        self,
        classifier: Union[str, Callable] = "sklearn.naive_bayes.GaussianNB",
        **classifier_args,
    ):
        """
        Instantiates a Massaging preprocessing method.
        Flips selected labels to reduce disparity between groups.

        Parameters
        ----------
        classifier : Union[str, Callable], optional
            Classifier used to score the instances. Passed, together with
            ``classifier_args``, to ``instantiate_object``.
            NOTE(review): presumably a dotted import path or a class/factory;
            confirm against ``instantiate_object``.
        **classifier_args
            Keyword arguments forwarded to ``instantiate_object``.
        """
        self.logger = create_logger("methods.preprocessing.Massaging")
        self.logger.info("Instantiating a Massaging preprocessing method.")

        self.classifier = instantiate_object(classifier, **classifier_args)
        self.logger.info(f"Created base estimator {self.classifier}")

    def _rank(
        self, X: pd.DataFrame, y: pd.Series, s: Optional[pd.Series]
    ) -> tuple[pd.Index, pd.Index]:
        """Scores every instance and returns the ordered flip candidates.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix.
        y : pandas.Series
            Label vector.
        s : pandas.Series
            Protected attribute vector.

        Returns
        -------
        tuple[pandas.Index, pandas.Index]
            Promotion candidates (highest score first) and demotion
            candidates (lowest score first), as index labels of ``X``.
        """
        # The protected attribute is used as a feature; non-numeric columns
        # are one-hot encoded before fitting.
        features = pd.get_dummies(pd.concat([X, s], axis=1))
        ranker = self.classifier.fit(features, y)
        # Column 1 of predict_proba is used as the ranking score.
        scores = pd.Series(ranker.predict_proba(features)[:, 1], index=X.index)

        # Loop-invariant, so computed once.
        overall_prevalence = y.mean()

        pr = []
        dem = []
        for g in s.unique():
            in_group = s == g
            prevalence = y[in_group].mean()
            if prevalence < overall_prevalence:
                pr += list(X.loc[in_group & (y == 0)].index)
            elif prevalence > overall_prevalence:
                dem += list(X.loc[in_group & (y == 1)].index)
            # Groups exactly at the overall prevalence contribute nothing.

        pr = scores.loc[pr].sort_values(ascending=False).index
        dem = scores.loc[dem].sort_values(ascending=True).index

        return pr, dem

    def fit(self, X: pd.DataFrame, y: pd.Series, s: Optional[pd.Series]) -> None:
        """Fits a classifier to the data and orders the instances by the predictions.

        Promotion candidates are the instances with negative label in the groups
        with prevalence below the overall prevalence and demotion candidates are
        the instances with positive label in the groups with prevalence above it.
        The number of instances to be flipped is calculated to equalize the
        prevalences of the two sets of groups.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix.
        y : pandas.Series
            Label vector.
        s : pandas.Series
            Protected attribute vector.

        Raises
        ------
        ValueError
            If ``s`` is None.
        """
        self.logger.info("Fitting Massaging preprocessing method.")
        if s is None:
            raise ValueError(
                "Massaging requires the protected attribute vector `s`."
            )

        self.pr, self.dem = self._rank(X, y, s)

        if len(self.pr) == 0 or len(self.dem) == 0:
            # No group deviates from the overall prevalence. The group means
            # below would be NaN and math.ceil(NaN) raises, so exit early.
            self.m = 0
            self.logger.info(
                "No prevalence disparity between groups; no labels will be flipped."
            )
            self.logger.info("Massaging preprocessing method fitted.")
            return

        in_pr_groups = s.isin(s.loc[self.pr].unique())
        in_dem_groups = s.isin(s.loc[self.dem].unique())

        n_pr = int(in_pr_groups.sum())
        n_dem = int(in_dem_groups.sum())

        d_b = y.loc[in_pr_groups].mean()
        d_w = y.loc[in_dem_groups].mean()
        d = d_w - d_b

        # Flipping m labels in each pool moves the prevalences to
        # d_b + m / n_pr and d_w - m / n_dem; they meet at
        # m = d * n_pr * n_dem / (n_pr + n_dem). The denominator equals the
        # dataset size unless some group sits exactly at the overall
        # prevalence and therefore belongs to neither pool.
        self.m = math.ceil((d * n_pr * n_dem) / (n_pr + n_dem))
        self.logger.info("Massaging preprocessing method fitted.")

    def transform(
        self, X: pd.DataFrame, y: pd.Series, s: Optional[pd.Series] = None
    ) -> tuple[pd.DataFrame, pd.Series, pd.Series]:
        """Transforms the data by flipping the calculated number of labels of the
        top candidates in the promotion and the demotion groups.

        NOTE(review): the labels to flip are addressed by the index labels
        stored during ``fit``, so ``y`` must contain those labels -- presumably
        the same data that was passed to ``fit``; confirm against callers.

        Parameters
        ----------
        X : pd.DataFrame
            Feature matrix.
        y : pd.Series
            Label vector.
        s : pd.Series, optional
            Protected attribute vector.

        Returns
        -------
        tuple[pd.DataFrame, pd.Series, pd.Series]
            The transformed input, X, y, and s. Only ``y`` is modified, and
            it is returned as a copy.
        """
        self.logger.info("Transforming data with Massaging preprocessing method.")
        y_corrected = y.copy()
        y_corrected.loc[self.pr[: self.m]] = 1
        y_corrected.loc[self.dem[: self.m]] = 0
        self.logger.info("Data transformed with Massaging preprocessing method.")
        return X, y_corrected, s

0 comments on commit af9742a

Please sign in to comment.