diff --git a/exhibit/core/generate/categorical.py b/exhibit/core/generate/categorical.py index bebb74d..2bdd7ab 100644 --- a/exhibit/core/generate/categorical.py +++ b/exhibit/core/generate/categorical.py @@ -219,6 +219,10 @@ def _generate_anon_series(self, col_name): # ignoring the standard date genderation parameters, like from / to. anon_set = col_attrs.get("anonymising_set", None) + # Users can pass custom functions to generate categorical / date columns + if callable(anon_set): + return self._generate_using_custom_function(col_name, anon_set) + # check if the anonymising set is a SQL statement starting with SELECT # note that for dates, all other parameters, like from / to will be ignored if anon_set is not None and anon_set.strip().upper()[:6] == "SELECT": @@ -501,8 +505,14 @@ def _generate_using_external_table(self, col_name, anon_set): # duplicates in case user didn't specify DISTINC in his SQL query; # the anon_df would typically be from UUIDs that are generated before # categorical columns. + + # self.anon_df is what is generated BEFORE categorical columns, e.g UUID columns if self.anon_df is None or self.anon_df.empty: - existing_data = pd.concat(self.generated_dfs, axis=1) + # self.generated_dfs has cat. 
columns generated BEFORE this particular column + if not self.generated_dfs: #pragma: no cover + existing_data = pd.DataFrame() + else: + existing_data = pd.concat(self.generated_dfs, axis=1) else: existing_data = pd.concat(self.generated_dfs + [self.anon_df], axis=1) @@ -600,3 +610,36 @@ def _generate_using_external_table(self, col_name, anon_set): final_result = final_result.astype("datetime64[ns]") return final_result + + def _generate_using_custom_function(self, col_name, anon_set): + ''' + Generate a column by calling a user-supplied function once per row. + + Parameters + ---------- + col_name : str + name given to the returned Series + anon_set : callable + function applied to each row of the data generated so far; returns one value + ''' + # self.anon_df is what is generated BEFORE categorical columns, e.g. UUID columns + if self.anon_df is None or self.anon_df.empty: + # self.generated_dfs has cat. columns generated BEFORE this particular column + if not self.generated_dfs: + existing_data = pd.DataFrame() + else: + existing_data = pd.concat(self.generated_dfs, axis=1) + else: #pragma: no cover + existing_data = pd.concat(self.generated_dfs + [self.anon_df], axis=1) + + if existing_data.empty: + result = pd.Series( + data=[anon_set(pd.Series) for _ in range(self.num_rows)], + name=col_name + ) + return result + + result = existing_data.apply(anon_set, axis=1) + result.name = col_name + + return result diff --git a/exhibit/core/generate/tests/test_categorical.py b/exhibit/core/generate/tests/test_categorical.py index e838d1f..c3e26d5 100644 --- a/exhibit/core/generate/tests/test_categorical.py +++ b/exhibit/core/generate/tests/test_categorical.py @@ -3,6 +3,7 @@ ''' # Standard library imports +import datetime import unittest import tempfile from unittest.mock import Mock, patch @@ -532,6 +533,124 @@ def test_date_column_with_impossible_combination_of_from_to_and_period(self): self.assertWarns(RuntimeWarning, gen.generate) + def test_generate_column_with_custom_function_in_anonymised_set(self): + ''' + This option is only valid for when Exhibit is used as a script. 
For + specification-based generation, use custom ML models. Note that + while numerical weights are respected, probability vectors are not. + ''' + + def _generate_spam(_): + ''' + Basic function to generate menu items in a fictitious bistro. + + Parameters + ---------- + _ : None + the anonymising_set function returns one value at a time + and has access to the current row in the DF generated so far. + This argument is mandatory to include, even if it's unused. + + Returns + ---------- + Scalar value + ''' + + rng = np.random.default_rng() + val = rng.choice([ + "Egg and bacon", "Egg, sausage, and bacon", "Egg and Spam", + "Egg, bacon, and Spam", "Egg, bacon, sausage, and Spam", + "Spam, bacon, sausage, and Spam", "Lobster Thermidor", + ]) + + return val + + test_dict = { + "_rng" : np.random.default_rng(seed=0), + "metadata": { + "categorical_columns" : ["menu"], + "inline_limit" : 5, + "id" : "main" + }, + "columns": { + "menu": { + "type": "categorical", + "uniques" : 7, + "original_values" : pd.DataFrame(), + "paired_columns": None, + "anonymising_set" : _generate_spam, + "cross_join_all_unique_values" : False, + }, + } + } + + gen = tm.CategoricalDataGenerator(spec_dict=test_dict, core_rows=50) + result = gen.generate() + + self.assertIsInstance(result, pd.DataFrame) + self.assertEqual(result.shape[0], 50) + + def test_generate_column_with_custom_date_function_in_anonymised_set(self): + ''' + This option is only valid for when Exhibit is used as a script. For + specification-based generation, use custom ML models. Note that + while numerical weights are respected, probability vectors are not. + ''' + + def _increment_date(row): + ''' + Basic function to return a date a few days after the current row's date. + + Parameters + ---------- + row : pd.Series + the anonymising_set function returns one value at a time + and has access to the current row in the DF generated so far. + This argument is mandatory to include, even if it's unused. 
+ + Returns + ---------- + Scalar value + ''' + rng = np.random.default_rng() + cur_date = row["date"] + new_date = cur_date + datetime.timedelta(days=int(rng.integers(1, 10))) + + return new_date + + test_dict = { + "_rng" : np.random.default_rng(seed=0), + "metadata": { + "date_columns" : ["date", "future_date"], + "inline_limit" : 5, + "id" : "main" + }, + "columns": { + "date": { + "type": "date", + "from": "2023-01-01", + "to" : "2024-01-01", + "uniques" : 50, + "frequency" : "D", + "cross_join_all_unique_values" : False, + }, + "future_date": { + "type": "date", + "from": "2023-01-01", + "to" : "2024-01-01", + "uniques" : 50, + "frequency" : "D", + "cross_join_all_unique_values" : False, + "anonymising_set" : _increment_date + } + } + } + + gen = tm.CategoricalDataGenerator(spec_dict=test_dict, core_rows=50) + result = gen.generate() + + self.assertTrue((result["future_date"] > result["date"]).all()) + if __name__ == "__main__" and __package__ is None: #overwrite __package__ builtin as per PEP 366 __package__ = "exhibit" diff --git a/exhibit/core/generate/weights.py b/exhibit/core/generate/weights.py index 416ff5e..c888a1f 100644 --- a/exhibit/core/generate/weights.py +++ b/exhibit/core/generate/weights.py @@ -180,9 +180,19 @@ def target_columns_for_weights_table(spec_dict): cat_cols = spec_dict["metadata"]["categorical_columns"] #includes linked cat_cols_set = set(cat_cols) - #drop paired columns and regex columns + #drop columns, like(paired / regex columns) that we don't expect to have num. 
weights for cat_col in cat_cols: anon_set = spec_dict["columns"][cat_col]["anonymising_set"] + + # if we're missing original_values, there can be no weights + orig_vals = spec_dict["columns"][cat_col]["original_values"] + if orig_vals is None or (isinstance(orig_vals, pd.DataFrame) and orig_vals.empty): #pragma: no cover + cat_cols_set.remove(cat_col) + continue + + # skip the checks for custom functions + if callable(anon_set): + continue if ( is_paired(spec_dict, cat_col) or # we keep the columns if they are in fixed sets or have custom SQL; diff --git a/exhibit/core/tests/test_spec.py b/exhibit/core/tests/test_spec.py index e716a8e..94b8ad3 100644 --- a/exhibit/core/tests/test_spec.py +++ b/exhibit/core/tests/test_spec.py @@ -147,6 +147,70 @@ def test_categorical_column_initialised_from_list(self): anon_df = exhibit_data.generate() self.assertEqual(anon_df.shape, (100, 1)) + + def test_mix_of_categorical_and_numerical_columns_with_incomplete_weights(self): + ''' + This test covers both categorical and continuous column generation. + + Remember that weights are relative to each other, meaning that if we provide + weights for just one value, it doesn't matter because it has no reference point. + If we provide weights for two values, they will be rescaled to sum to 1, while + other values without weights, will be treated as 1, meaning providing incomplete + weights will lead to smaller values relative to missing values. + ''' + + def _generate_spam(_): + ''' + Basic function to generate menu items in a fictitious bistro. + + Parameters + ---------- + _ : None + the anonymising_set function return one value at a time + and has access to the current row in the DF generated so far. + This argument is mandatory to include, even if it's unused. 
+ + Returns + ---------- + Scalar value + ''' + + rng = np.random.default_rng() + val = rng.choice([ + "Egg and bacon", "Egg, sausage, and bacon", "Egg and Spam", + "Egg, bacon, and Spam", "Egg, bacon, sausage, and Spam", + "Spam, bacon, sausage, and Spam", "Lobster Thermidor", + ]) + + return val + + spec = tm.Spec() + spec_dict = spec.generate() + + spec_dict["metadata"]["number_of_rows"] = 50 + spec_dict["metadata"]["categorical_columns"] = ["menu"] + spec_dict["metadata"]["numerical_columns"] = ["price"] + spec_dict["metadata"]["id"] = "main" + + menu_df = pd.DataFrame(data={ + "menu" : ["Egg and bacon", "Lobster Thermidor", "Missing Data"], + "price": [0.5, 0.5, 0.0] + }) + + spec_dict["columns"]["menu"] = tm.CategoricalColumn("menu", uniques=7, original_values=menu_df, anon_set=_generate_spam) + spec_dict["columns"]["price"] = tm.NumericalColumn(distribution_parameters={"target_sum" : 1000, "dispersion": 0.2}) + + exhibit_data = xbt.Exhibit(command="fromspec", source=spec_dict, output="dataframe") + anon_df = exhibit_data.generate() + + test_items = ["Egg and bacon", "Lobster Thermidor"] + + # check that the average price of the two test items is about half the rest + self.assertAlmostEqual( + anon_df[anon_df["menu"].isin(test_items)]["price"].mean() * 2, + anon_df[~anon_df["menu"].isin(test_items)]["price"].mean(), + delta=3 + ) if __name__ == "__main__" and __package__ is None: #overwrite __package__ builtin as per PEP 366 diff --git a/exhibit/core/validator.py b/exhibit/core/validator.py index 9cbf36b..24a13ad 100644 --- a/exhibit/core/validator.py +++ b/exhibit/core/validator.py @@ -206,6 +206,9 @@ def validate_anonymising_set_length(self, spec_dict=None): attr="anonymising_set", col_names=True, types=["categorical"]): + # ignore anonymising_sets that have custom functions + if callable(v): + return True if v.split(".")[0] in self.fixed_sql_sets: col_uniques = spec_dict["columns"][c]["uniques"] diff --git a/recipes/Create peer groups.ipynb 
b/recipes/Create peer groups.ipynb index 36ab791..faf7ab5 100644 --- a/recipes/Create peer groups.ipynb +++ b/recipes/Create peer groups.ipynb @@ -59,7 +59,7 @@ " \n", "
\n", "\n", + " | date | \n", + "future_date | \n", + "
---|---|---|
0 | \n", + "2023-02-20 | \n", + "2023-02-28 | \n", + "
1 | \n", + "2023-01-17 | \n", + "2023-01-20 | \n", + "
2 | \n", + "2023-02-25 | \n", + "2023-03-04 | \n", + "
3 | \n", + "2023-01-09 | \n", + "2023-01-14 | \n", + "
4 | \n", + "2023-02-15 | \n", + "2023-02-21 | \n", + "
... | \n", + "... | \n", + "... | \n", + "
95 | \n", + "2023-01-06 | \n", + "2023-01-13 | \n", + "
96 | \n", + "2023-03-17 | \n", + "2023-03-21 | \n", + "
97 | \n", + "2023-06-22 | \n", + "2023-06-27 | \n", + "
98 | \n", + "2023-03-13 | \n", + "2023-03-20 | \n", + "
99 | \n", + "2023-05-05 | \n", + "2023-05-09 | \n", + "
100 rows × 2 columns
\n", + "\n", + " | id | \n", + "name | \n", + "address | \n", + "smoker | \n", + "
---|---|---|---|---|
0 | \n", + "22 | \n", + "Mckenzie Cruz | \n", + "762 Baker Point\\nPort Kevin, MN 42282 | \n", + "N | \n", + "
1 | \n", + "64 | \n", + "Michael Williams | \n", + "481 Madison Fords\\nNew Donnaview, CO 27959 | \n", + "N | \n", + "
2 | \n", + "11 | \n", + "Jeanne Smith | \n", + "NaN | \n", + "Y | \n", + "
3 | \n", + "46 | \n", + "Joanna Franklin | \n", + "NaN | \n", + "No Answer | \n", + "
4 | \n", + "82 | \n", + "Daniel Martinez | \n", + "479 Jean Falls Suite 185\\nDeanbury, WV 72875 | \n", + "N | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
95 | \n", + "5 | \n", + "Jack Harrison | \n", + "Unit 0802 Box 5382\\nDPO AP 08329 | \n", + "N | \n", + "
96 | \n", + "40 | \n", + "Courtney Sanchez | \n", + "04326 Wallace Circles\\nNorth Anthonybury, IN 8... | \n", + "N | \n", + "
97 | \n", + "56 | \n", + "Thomas Anderson | \n", + "28821 Clark Drive Apt. 170\\nPort John, CO 44092 | \n", + "N | \n", + "
98 | \n", + "76 | \n", + "Julie Flowers | \n", + "74248 Ball Land Apt. 027\\nPowersfurt, RI 70556 | \n", + "N | \n", + "
99 | \n", + "15 | \n", + "Jennifer King | \n", + "805 Richard Port\\nEast Mario, VT 18338 | \n", + "No Answer | \n", + "
100 rows × 4 columns
\n", + "