def _feature_suppression(
    self, X: pd.DataFrame, s: Optional[pd.Series]
) -> pd.DataFrame:
    """
    Returns a copy of ``X`` with the configured features suppressed.

    The behaviour is driven by ``self.unawareness_features``:

    - ``None`` or ``False``: the sensitive attribute is kept. If ``X`` has no
      column named ``s.name``, ``s`` is added under that name.
    - ``True``: the column named ``s.name`` is dropped when it is present.
    - ``str`` or list: the named column(s) are dropped; ``s`` is not used.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix. It is copied and never modified in place.
    s : pd.Series, optional
        Sensitive attribute. When ``None``, no sensitive column can be
        identified, so none is added or dropped.

    Returns
    -------
    pd.DataFrame
        The transformed copy of ``X``.

    Raises
    ------
    KeyError
        Propagated from ``DataFrame.drop`` when an explicitly listed feature
        is not a column of ``X``.
    """
    X_transformed = X.copy()
    unawareness = self.unawareness_features

    # An explicit feature name (or list of names) does not depend on `s`.
    if unawareness is not None and not isinstance(unawareness, bool):
        to_drop = [unawareness] if isinstance(unawareness, str) else unawareness
        return X_transformed.drop(columns=to_drop)

    # `fit` accepts `s=None`; without a sensitive attribute there is no column
    # name to add or remove, so the copy is returned untouched instead of
    # failing on `s.name`.
    if s is None:
        return X_transformed

    has_sensitive = s.name in X_transformed.columns
    if unawareness:
        # Unawareness requested: hide the sensitive attribute from the model.
        if has_sensitive:
            X_transformed = X_transformed.drop(columns=s.name)
    elif not has_sensitive:
        # None / False: make sure the sensitive attribute is available.
        X_transformed[s.name] = s

    return X_transformed
The estimators' can then be used to @@ -135,10 +162,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series, s: Optional[pd.Series]) -> None: self.logger.info("Fitting LabelFlipping.") - X_transformed = X.copy() - if self.unawareness_features is not None: - X_transformed = X_transformed.drop(columns=self.unawareness_features) - + X_transformed = self._feature_suppression(X, s) X_transformed = pd.get_dummies(X_transformed) self.ensemble = BaggingClassifier( @@ -202,9 +226,12 @@ def _calculate_group_flips(self, y: pd.Series, s: pd.Series): max_prevalence = prevalence + self.disparity_target * prevalence group_flips = { - group: math.ceil(min_prevalence * len(y[s == group])) - y[s == group].sum() - if group_prevalences[group] < min_prevalence - else math.floor(max_prevalence * len(y[s == group])) - y[s == group].sum() + group: ( + math.ceil(min_prevalence * len(y[s == group])) - y[s == group].sum() + if group_prevalences[group] < min_prevalence + else math.floor(max_prevalence * len(y[s == group])) + - y[s == group].sum() + ) for group in group_prevalences.index } @@ -307,10 +334,7 @@ def transform( "is True." ) - X_transformed = X.copy() - if self.unawareness_features is not None: - X_transformed = X_transformed.drop(columns=self.unawareness_features) - + X_transformed = self._feature_suppression(X, s) X_transformed = pd.get_dummies(X_transformed) scores = self._score_instances(X_transformed, y)