Skip to content

Commit

Permalink
Removed Categorical dtype from constraint handling to improve stability
Browse files Browse the repository at this point in the history
  • Loading branch information
gherka committed Aug 26, 2024
1 parent b92d400 commit 793b29b
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 9 deletions.
14 changes: 8 additions & 6 deletions exhibit/core/constraints.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,12 @@ def process_custom_constraints(self, custom_constraints):

source = self.input if self.output is None else self.output
output_df = source.copy()
# preserve category dtypes at the end of each custom action
output_dtypes = output_df.dtypes[output_df.dtypes == "category"]
# change categorical dtype to normal object to simplify the logic, particularly for
# assigning new values to series affected by custom constraints. You can still set
# the dtype to Categorical on a case by case basis where it improves performance.
cat_dtype_cols = output_df.select_dtypes(include=["category"]).columns
for col in cat_dtype_cols:
output_df[col] = output_df[col].astype("object")

dispatch_dict = {
"make_outlier" : self.make_outlier,
Expand Down Expand Up @@ -209,7 +213,8 @@ def process_custom_constraints(self, custom_constraints):
# overwrite the original DF row IDs with the adjusted ones
output_df.loc[cc_filter_idx] = action_func(
output_df, cc_filter_idx, target_str,
cc_partitions, **_kwargs).astype(output_dtypes)
cc_partitions, **_kwargs)

return output_df

def adjust_dataframe_to_fit_constraint(self, anon_df, basic_constraint):
Expand Down Expand Up @@ -636,9 +641,6 @@ def _make_distinct_within_group(group):
if not group.duplicated().any():
return group

# make sure blank string is available as a category before replacing
if group.dtype.name == "category": #pragma: no cover
group = group.cat.add_categories([""])
new_group = group.where(~group.duplicated(), "").tolist()

return pd.Series(new_group, index=group.index, name=target_col)
Expand Down
8 changes: 6 additions & 2 deletions exhibit/core/exhibit.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,13 +379,16 @@ def execute_spec(self):
miss_gen = MissingDataGenerator(self.spec_dict, anon_df)
anon_df = miss_gen.add_missing_data()

#6.5) GENERATE DERIVED COLUMNS IF ANY ARE SPECIFIED
#6.1) GENERATE DERIVED COLUMNS IF ANY ARE SPECIFIED
if self.derived_columns_first: #pragma: no cover
for name, calc in self.spec_dict["derived_columns"].items():
if "Example" not in name:
anon_df[name] = generate_derived_column(anon_df, calc)

#7) PROCESS BASIC AND CUSTOM CONSTRAINTS (IF ANY)
# note that the ConstraintHandler changes the dtype of categorical columns from
# Categorical to object to reduce dtype-related bugs / unexpected behaviour, like
# when users supply a filter expression or want to add a new value to a column.
ch = ConstraintHandler(self.spec_dict, anon_df)
anon_df = ch.process_constraints()
# if there are any constraints that affect categorical columns, we need to
Expand All @@ -406,6 +409,7 @@ def execute_spec(self):
# check if there are common columns between constraint targets and cat_cols
if cat_cols_set & set(constraint_targets):

# fill NAs with a placeholder - so that correct weights could be used
anon_df.loc[:, cat_cols] = (
anon_df.loc[:, cat_cols].fillna(MISSING_DATA_STR))

Expand Down Expand Up @@ -439,7 +443,7 @@ def execute_spec(self):
if num_col in derived_def:
anon_df[derived_col] = generate_derived_column(anon_df, derived_def)
break
# change the missing data placeholder back to NAs
anon_df.loc[:, cat_cols] = anon_df.loc[:, cat_cols].applymap(
lambda x: np.nan if x == MISSING_DATA_STR else x)

Expand Down
2 changes: 1 addition & 1 deletion exhibit/core/generate/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ def add_missing_data(self):
set(self.spec_dict.get("derived_columns", {}).keys()))

if not (any(self.nan_data[cat_cols].isna()) and num_cols):
return self.nan_data
return self.nan_data.astype(self.dtypes)

cat_mask = self.nan_data[cat_cols].isna().any(axis=1)
self.nan_data[cat_cols] = self.nan_data[cat_cols].fillna(MISSING_DATA_STR)
Expand Down
40 changes: 40 additions & 0 deletions exhibit/core/generate/tests/test_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,46 @@ def test_user_linked_columns_having_missing_data(self):
test_spec_dict=test_dict, return_spec=False)

self.assertTrue(df.query("A == 'eggs'")["B"].isna().any())

def test_categorical_numerical_missing_data_with_make_null_cc(self):
'''
Regression test for dtype issues (Categorical vs object) in constraint handling.

The failing combination was: categorical columns, a make_null custom
constraint, a filter expression that casts a categorical column to int,
AND a numerical column present in the spec. Before the fix, removing the
numerical column made this scenario pass and adding it back made it fail,
which indicated a dtype-handling bug rather than a genuine data problem.

Why the cast fails on Categorical: if AGE has dtype="category", its
categories include the "Missing data" placeholder even when no row holds
it, so AGE.astype('int') raises unless unused categories are removed
first. Converting categorical columns to plain object dtype (the fix this
commit introduces) avoids that class of error.
'''

# Minimal frame: two categorical columns plus one numerical column —
# the numerical column is required to reproduce the original bug.
test_df = pd.DataFrame(data={
"AGE": ["1", "2", "3", "4", "4"],
"NULLED" : list("ABCAB"),
"NUMS": range(5)
})

test_dict = {
"metadata" : {
"number_of_rows" : 10,
"categorical_columns": ["AGE", "NULLED"],
"numerical_columns" : ["NUMS"]
},
"constraints" : {
"custom_constraints" : {
"test_nulls" : {
# the filter casts AGE to int — this is what used to blow up
# when AGE arrived as dtype="category"
"filter" : "AGE.astype('int') > 1",
"targets" : {"NULLED" : "make_null"}
}
}
}
}

_, df = temp_exhibit(filename=test_df, test_spec_dict=test_dict, return_spec=False)

# the constraint fired: at least one NULLED value was set to NaN
self.assertTrue(df.NULLED.isna().any())

if __name__ == "__main__" and __package__ is None:
#overwrite __package__ builtin as per PEP 366
Expand Down
4 changes: 4 additions & 0 deletions exhibit/core/tests/test_constraints.py
Original file line number Diff line number Diff line change
Expand Up @@ -1717,6 +1717,10 @@ def test_custom_constraints_assign_value(self):
"C" : ["spam", "spam", "ham", "ham", "ham"] * 4,
})

# remember that all categorical data comes in as "Category" dtype which imposes
# limits on what you can and can't do with those columns, like adding new values.
test_data["A"] = test_data["A"].astype("category")

test_gen = tm.ConstraintHandler(test_dict, test_data)
result = test_gen.process_constraints()

Expand Down

0 comments on commit 793b29b

Please sign in to comment.