Skip to content

Commit

Permalink
Added option to use custom functions in anonymising sets
Browse files Browse the repository at this point in the history
  • Loading branch information
gherka committed Nov 29, 2023
1 parent 7a9fda0 commit 0484997
Show file tree
Hide file tree
Showing 7 changed files with 729 additions and 26 deletions.
45 changes: 44 additions & 1 deletion exhibit/core/generate/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,10 @@ def _generate_anon_series(self, col_name):
# ignoring the standard date generation parameters, like from / to.
anon_set = col_attrs.get("anonymising_set", None)

# Users can pass custom functions to generate categorical / date columns
if callable(anon_set):
return self._generate_using_custom_function(col_name, anon_set)

# check if the anonymising set is a SQL statement starting with SELECT
# note that for dates, all other parameters, like from / to will be ignored
if anon_set is not None and anon_set.strip().upper()[:6] == "SELECT":
Expand Down Expand Up @@ -501,8 +505,14 @@ def _generate_using_external_table(self, col_name, anon_set):
# duplicates in case the user didn't specify DISTINCT in their SQL query;
# the anon_df would typically be from UUIDs that are generated before
# categorical columns.

# self.anon_df is what is generated BEFORE categorical columns, e.g UUID columns
if self.anon_df is None or self.anon_df.empty:
existing_data = pd.concat(self.generated_dfs, axis=1)
# self.generated_dfs has cat. columns generated BEFORE this particular column
if not self.generated_dfs: #pragma: no cover
existing_data = pd.DataFrame()
else:
existing_data = pd.concat(self.generated_dfs, axis=1)
else:
existing_data = pd.concat(self.generated_dfs + [self.anon_df], axis=1)

Expand Down Expand Up @@ -600,3 +610,36 @@ def _generate_using_external_table(self, col_name, anon_set):
final_result = final_result.astype("datetime64[ns]")

return final_result

def _generate_using_custom_function(self, col_name, anon_set):
    '''
    Generate a column by calling a user-supplied function for each row.

    Parameters
    ----------
    col_name : str
        name of the column being generated; used as the name of the
        returned Series.
    anon_set : callable
        user-provided function given as the column's anonymising_set.
        It is called once per row with the row of the data generated so
        far (a pd.Series) and must return a single scalar value.

    Returns
    -------
    pd.Series
        generated values, one per row, named after col_name.
    '''
    # self.anon_df is what is generated BEFORE categorical columns, e.g UUID columns
    if self.anon_df is None or self.anon_df.empty:
        # self.generated_dfs has cat. columns generated BEFORE this particular column
        if not self.generated_dfs:
            existing_data = pd.DataFrame()
        else:
            existing_data = pd.concat(self.generated_dfs, axis=1)
    else: #pragma: no cover
        existing_data = pd.concat(self.generated_dfs + [self.anon_df], axis=1)

    if existing_data.empty:
        # No prior columns exist, so there is no row to give the function.
        # NOTE(review): the argument passed here is the pd.Series class
        # itself, not a Series instance - functions that ignore their
        # argument (as documented) are unaffected, but consider passing
        # an empty pd.Series() instead; confirm no user code relies on this.
        result = pd.Series(
            data=[anon_set(pd.Series) for _ in range(self.num_rows)],
            name=col_name
        )
        return result

    # With prior columns available, give the function access to each
    # already-generated row so its output can depend on other columns.
    result = existing_data.apply(anon_set, axis=1)
    result.name = col_name

    return result
119 changes: 119 additions & 0 deletions exhibit/core/generate/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
'''

# Standard library imports
import datetime
import unittest
import tempfile
from unittest.mock import Mock, patch
Expand Down Expand Up @@ -532,6 +533,124 @@ def test_date_column_with_impossible_combination_of_from_to_and_period(self):

self.assertWarns(RuntimeWarning, gen.generate)

def test_generate_column_with_custom_function_in_anonymised_set(self):
    '''
    Custom callables in anonymising_set are only valid when Exhibit is
    used as a script. For specification-based generation, use custom ML
    models. Note that while numerical weights are respected, probability
    vectors are not.
    '''

    def _generate_spam(_):
        '''
        Basic function to generate menu items in a fictitious bistro.

        Parameters
        ----------
        _ : pd.Series
            the anonymising_set function returns one value at a time
            and has access to the current row in the DF generated so far.
            This argument is mandatory to include, even if it's unused.

        Returns
        ----------
        Scalar value
        '''

        rng = np.random.default_rng()
        val = rng.choice([
            "Egg and bacon", "Egg, sausage, and bacon", "Egg and Spam",
            "Egg, bacon, and Spam", "Egg, bacon, sausage, and Spam",
            "Spam, bacon, sausage, and Spam", "Lobster Thermidor",
        ])

        return val

    # minimal spec: one categorical column whose values come entirely
    # from the custom function above.
    test_dict = {
        "_rng" : np.random.default_rng(seed=0),
        "metadata": {
            "categorical_columns" : ["menu"],
            "inline_limit" : 5,
            "id" : "main"
        },
        "columns": {
            "menu": {
                "type": "categorical",
                "uniques" : 7,
                "original_values" : pd.DataFrame(),
                "paired_columns": None,
                "anonymising_set" : _generate_spam,
                "cross_join_all_unique_values" : False,
            },
        }
    }

    gen = tm.CategoricalDataGenerator(spec_dict=test_dict, core_rows=50)
    result = gen.generate()

    # generation should succeed and produce exactly core_rows rows
    self.assertIsInstance(result, pd.DataFrame)
    self.assertEqual(result.shape[0], 50)

def test_generate_column_with_custom_date_function_in_anonymised_set(self):
    '''
    Custom callables in anonymising_set are only valid when Exhibit is
    used as a script. For specification-based generation, use custom ML
    models. Note that while numerical weights are respected, probability
    vectors are not.
    '''

    def _increment_date(row):
        '''
        Basic function to derive a future date by adding a random
        number of days (1 to 9) to the already generated "date" column.

        Parameters
        ----------
        row : pd.Series
            the anonymising_set function returns one value at a time
            and has access to the current row in the DF generated so far.
            This argument is mandatory to include, even if it's unused.

        Returns
        ----------
        Scalar value
        '''
        rng = np.random.default_rng()
        cur_date = row["date"]
        new_date = cur_date + datetime.timedelta(days=int(rng.integers(1, 10)))

        return new_date

    # two date columns: "date" is generated normally, "future_date" is
    # derived from it via the custom function above.
    test_dict = {
        "_rng" : np.random.default_rng(seed=0),
        "metadata": {
            "date_columns" : ["date", "future_date"],
            "inline_limit" : 5,
            "id" : "main"
        },
        "columns": {
            "date": {
                "type": "date",
                "from": "2023-01-01",
                "to" : "2024-01-01",
                "uniques" : 50,
                "frequency" : "D",
                "cross_join_all_unique_values" : False,
            },
            "future_date": {
                "type": "date",
                "from": "2023-01-01",
                "to" : "2024-01-01",
                "uniques" : 50,
                "frequency" : "D",
                "cross_join_all_unique_values" : False,
                "anonymising_set" : _increment_date
            }
        }
    }

    gen = tm.CategoricalDataGenerator(spec_dict=test_dict, core_rows=50)
    result = gen.generate()

    # every derived date must be strictly after its base date
    self.assertTrue((result["future_date"] > result["date"]).all())

if __name__ == "__main__" and __package__ is None:
#overwrite __package__ builtin as per PEP 366
__package__ = "exhibit"
Expand Down
12 changes: 11 additions & 1 deletion exhibit/core/generate/weights.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,9 +180,19 @@ def target_columns_for_weights_table(spec_dict):
cat_cols = spec_dict["metadata"]["categorical_columns"] #includes linked
cat_cols_set = set(cat_cols)

#drop paired columns and regex columns
#drop columns (like paired / regex columns) that we don't expect to have num. weights
for cat_col in cat_cols:
anon_set = spec_dict["columns"][cat_col]["anonymising_set"]

# if we're missing original_values, there can be no weights
orig_vals = spec_dict["columns"][cat_col]["original_values"]
if orig_vals is None or (isinstance(orig_vals, pd.DataFrame) and orig_vals.empty): #pragma: no cover
cat_cols_set.remove(cat_col)
continue

# skip the checks for custom functions
if callable(anon_set):
continue
if (
is_paired(spec_dict, cat_col) or
# we keep the columns if they are in fixed sets or have custom SQL;
Expand Down
64 changes: 64 additions & 0 deletions exhibit/core/tests/test_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,70 @@ def test_categorical_column_initialised_from_list(self):
anon_df = exhibit_data.generate()

self.assertEqual(anon_df.shape, (100, 1))

def test_mix_of_categorical_and_numerical_columns_with_incomplete_weights(self):
    '''
    This test covers both categorical and continuous column generation.
    Remember that weights are relative to each other, meaning that if we provide
    weights for just one value, it doesn't matter because it has no reference point.
    If we provide weights for two values, they will be rescaled to sum to 1, while
    other values without weights will be treated as 1, meaning providing incomplete
    weights will lead to smaller values relative to missing values.
    '''

    def _generate_spam(_):
        '''
        Basic function to generate menu items in a fictitious bistro.

        Parameters
        ----------
        _ : pd.Series
            the anonymising_set function returns one value at a time
            and has access to the current row in the DF generated so far.
            This argument is mandatory to include, even if it's unused.

        Returns
        ----------
        Scalar value
        '''

        rng = np.random.default_rng()
        val = rng.choice([
            "Egg and bacon", "Egg, sausage, and bacon", "Egg and Spam",
            "Egg, bacon, and Spam", "Egg, bacon, sausage, and Spam",
            "Spam, bacon, sausage, and Spam", "Lobster Thermidor",
        ])

        return val

    spec = tm.Spec()
    spec_dict = spec.generate()

    spec_dict["metadata"]["number_of_rows"] = 50
    spec_dict["metadata"]["categorical_columns"] = ["menu"]
    spec_dict["metadata"]["numerical_columns"] = ["price"]
    spec_dict["metadata"]["id"] = "main"

    # weights are provided for only two of the seven menu items; the
    # other five are treated as weight 1, so the weighted items end up
    # with roughly half the price of the unweighted ones.
    menu_df = pd.DataFrame(data={
        "menu" : ["Egg and bacon", "Lobster Thermidor", "Missing Data"],
        "price": [0.5, 0.5, 0.0]
    })

    spec_dict["columns"]["menu"] = tm.CategoricalColumn("menu", uniques=7, original_values=menu_df, anon_set=_generate_spam)
    spec_dict["columns"]["price"] = tm.NumericalColumn(distribution_parameters={"target_sum" : 1000, "dispersion": 0.2})

    exhibit_data = xbt.Exhibit(command="fromspec", source=spec_dict, output="dataframe")
    anon_df = exhibit_data.generate()

    test_items = ["Egg and bacon", "Lobster Thermidor"]

    # check that the average price of the two test items is about half the rest
    self.assertAlmostEqual(
        anon_df[anon_df["menu"].isin(test_items)]["price"].mean() * 2,
        anon_df[~anon_df["menu"].isin(test_items)]["price"].mean(),
        delta=3
    )

if __name__ == "__main__" and __package__ is None:
#overwrite __package__ builtin as per PEP 366
Expand Down
3 changes: 3 additions & 0 deletions exhibit/core/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,9 @@ def validate_anonymising_set_length(self, spec_dict=None):
attr="anonymising_set",
col_names=True,
types=["categorical"]):
# ignore anonymising_sets that have custom functions
if callable(v):
return True

if v.split(".")[0] in self.fixed_sql_sets:
col_uniques = spec_dict["columns"][c]["uniques"]
Expand Down
Loading

0 comments on commit 0484997

Please sign in to comment.