Skip to content

Commit

Permalink
Added option to use custom functions in anonymising sets
Browse files Browse the repository at this point in the history
  • Loading branch information
gherka committed Nov 29, 2023
1 parent 7a9fda0 commit 0484997
Show file tree
Hide file tree
Showing 7 changed files with 729 additions and 26 deletions.
45 changes: 44 additions & 1 deletion exhibit/core/generate/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,10 @@ def _generate_anon_series(self, col_name):
# ignoring the standard date generation parameters, like from / to.
anon_set = col_attrs.get("anonymising_set", None)

# Users can pass custom functions to generate categorical / date columns
if callable(anon_set):
return self._generate_using_custom_function(col_name, anon_set)

# check if the anonymising set is a SQL statement starting with SELECT
# note that for dates, all other parameters, like from / to will be ignored
if anon_set is not None and anon_set.strip().upper()[:6] == "SELECT":
Expand Down Expand Up @@ -501,8 +505,14 @@ def _generate_using_external_table(self, col_name, anon_set):
# duplicates in case the user didn't specify DISTINCT in their SQL query;
# the anon_df would typically be from UUIDs that are generated before
# categorical columns.

# self.anon_df is what is generated BEFORE categorical columns, e.g UUID columns
if self.anon_df is None or self.anon_df.empty:
existing_data = pd.concat(self.generated_dfs, axis=1)
# self.generated_dfs has cat. columns generated BEFORE this particular column
if not self.generated_dfs: #pragma: no cover
existing_data = pd.DataFrame()
else:
existing_data = pd.concat(self.generated_dfs, axis=1)
else:
existing_data = pd.concat(self.generated_dfs + [self.anon_df], axis=1)

Expand Down Expand Up @@ -600,3 +610,36 @@ def _generate_using_external_table(self, col_name, anon_set):
final_result = final_result.astype("datetime64[ns]")

return final_result

def _generate_using_custom_function(self, col_name, anon_set):
    '''
    Generate a column by calling a user-supplied function for each row.

    Parameters
    ----------
    col_name : str
        name of the column being generated; used as the name of the
        returned Series.
    anon_set : callable
        user-provided function given as the column's anonymising_set.
        It is called once per row with the row of the data generated so
        far (a pd.Series) and must return a single scalar value.

    Returns
    -------
    pd.Series
        generated values, one per row, named after col_name.
    '''
    # self.anon_df is what is generated BEFORE categorical columns, e.g UUID columns
    if self.anon_df is None or self.anon_df.empty:
        # self.generated_dfs has cat. columns generated BEFORE this particular column
        if not self.generated_dfs:
            existing_data = pd.DataFrame()
        else:
            existing_data = pd.concat(self.generated_dfs, axis=1)
    else: #pragma: no cover
        existing_data = pd.concat(self.generated_dfs + [self.anon_df], axis=1)

    if existing_data.empty:
        # No prior columns exist, so there is no row to give the function.
        # NOTE(review): the argument passed here is the pd.Series class
        # itself, not a Series instance - functions that ignore their
        # argument (as documented) are unaffected, but consider passing
        # an empty pd.Series() instead; confirm no user code relies on this.
        result = pd.Series(
            data=[anon_set(pd.Series) for _ in range(self.num_rows)],
            name=col_name
        )
        return result

    # With prior columns available, give the function access to each
    # already-generated row so its output can depend on other columns.
    result = existing_data.apply(anon_set, axis=1)
    result.name = col_name

    return result
119 changes: 119 additions & 0 deletions exhibit/core/generate/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
'''

# Standard library imports
import datetime
import unittest
import tempfile
from unittest.mock import Mock, patch
Expand Down Expand Up @@ -532,6 +533,124 @@ def test_date_column_with_impossible_combination_of_from_to_and_period(self):

self.assertWarns(RuntimeWarning, gen.generate)

def test_generate_column_with_custom_function_in_anonymised_set(self):
    '''
    Custom callables in anonymising_set are only valid when Exhibit is
    used as a script. For specification-based generation, use custom ML
    models. Note that while numerical weights are respected, probability
    vectors are not.
    '''

    def _generate_spam(_):
        '''
        Basic function to generate menu items in a fictitious bistro.

        Parameters
        ----------
        _ : pd.Series
            the anonymising_set function returns one value at a time
            and has access to the current row in the DF generated so far.
            This argument is mandatory to include, even if it's unused.

        Returns
        ----------
        Scalar value
        '''

        rng = np.random.default_rng()
        val = rng.choice([
            "Egg and bacon", "Egg, sausage, and bacon", "Egg and Spam",
            "Egg, bacon, and Spam", "Egg, bacon, sausage, and Spam",
            "Spam, bacon, sausage, and Spam", "Lobster Thermidor",
        ])

        return val

    # minimal spec: one categorical column whose values come entirely
    # from the custom function above.
    test_dict = {
        "_rng" : np.random.default_rng(seed=0),
        "metadata": {
            "categorical_columns" : ["menu"],
            "inline_limit" : 5,
            "id" : "main"
        },
        "columns": {
            "menu": {
                "type": "categorical",
                "uniques" : 7,
                "original_values" : pd.DataFrame(),
                "paired_columns": None,
                "anonymising_set" : _generate_spam,
                "cross_join_all_unique_values" : False,
            },
        }
    }

    gen = tm.CategoricalDataGenerator(spec_dict=test_dict, core_rows=50)
    result = gen.generate()

    # generation should succeed and produce exactly core_rows rows
    self.assertIsInstance(result, pd.DataFrame)
    self.assertEqual(result.shape[0], 50)

def test_generate_column_with_custom_date_function_in_anonymised_set(self):
    '''
    Custom callables in anonymising_set are only valid when Exhibit is
    used as a script. For specification-based generation, use custom ML
    models. Note that while numerical weights are respected, probability
    vectors are not.
    '''

    def _increment_date(row):
        '''
        Basic function to derive a future date by adding a random
        number of days (1 to 9) to the already generated "date" column.

        Parameters
        ----------
        row : pd.Series
            the anonymising_set function returns one value at a time
            and has access to the current row in the DF generated so far.
            This argument is mandatory to include, even if it's unused.

        Returns
        ----------
        Scalar value
        '''
        rng = np.random.default_rng()
        cur_date = row["date"]
        new_date = cur_date + datetime.timedelta(days=int(rng.integers(1, 10)))

        return new_date

    # two date columns: "date" is generated normally, "future_date" is
    # derived from it via the custom function above.
    test_dict = {
        "_rng" : np.random.default_rng(seed=0),
        "metadata": {
            "date_columns" : ["date", "future_date"],
            "inline_limit" : 5,
            "id" : "main"
        },
        "columns": {
            "date": {
                "type": "date",
                "from": "2023-01-01",
                "to" : "2024-01-01",
                "uniques" : 50,
                "frequency" : "D",
                "cross_join_all_unique_values" : False,
            },
            "future_date": {
                "type": "date",
                "from": "2023-01-01",
                "to" : "2024-01-01",
                "uniques" : 50,
                "frequency" : "D",
                "cross_join_all_unique_values" : False,
                "anonymising_set" : _increment_date
            }
        }
    }

    gen = tm.CategoricalDataGenerator(spec_dict=test_dict, core_rows=50)
    result = gen.generate()

    # every derived date must be strictly after its base date
    self.assertTrue((result["future_date"] > result["date"]).all())

if __name__ == "__main__" and __package__ is None:
#overwrite __package__ builtin as per PEP 366
__package__ = "exhibit"
Expand Down
12 changes: 11 additions & 1 deletion exhibit/core/generate/weights.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,9 +180,19 @@ def target_columns_for_weights_table(spec_dict):
cat_cols = spec_dict["metadata"]["categorical_columns"] #includes linked
cat_cols_set = set(cat_cols)

#drop paired columns and regex columns
#drop columns (like paired / regex columns) that we don't expect to have num. weights
for cat_col in cat_cols:
anon_set = spec_dict["columns"][cat_col]["anonymising_set"]

# if we're missing original_values, there can be no weights
orig_vals = spec_dict["columns"][cat_col]["original_values"]
if orig_vals is None or (isinstance(orig_vals, pd.DataFrame) and orig_vals.empty): #pragma: no cover
cat_cols_set.remove(cat_col)
continue

# skip the checks for custom functions
if callable(anon_set):
continue
if (
is_paired(spec_dict, cat_col) or
# we keep the columns if they are in fixed sets or have custom SQL;
Expand Down
64 changes: 64 additions & 0 deletions exhibit/core/tests/test_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,70 @@ def test_categorical_column_initialised_from_list(self):
anon_df = exhibit_data.generate()

self.assertEqual(anon_df.shape, (100, 1))

def test_mix_of_categorical_and_numerical_columns_with_incomplete_weights(self):
    '''
    This test covers both categorical and continuous column generation.
    Remember that weights are relative to each other, meaning that if we provide
    weights for just one value, it doesn't matter because it has no reference point.
    If we provide weights for two values, they will be rescaled to sum to 1, while
    other values without weights will be treated as 1, meaning providing incomplete
    weights will lead to smaller values relative to missing values.
    '''

    def _generate_spam(_):
        '''
        Basic function to generate menu items in a fictitious bistro.

        Parameters
        ----------
        _ : pd.Series
            the anonymising_set function returns one value at a time
            and has access to the current row in the DF generated so far.
            This argument is mandatory to include, even if it's unused.

        Returns
        ----------
        Scalar value
        '''

        rng = np.random.default_rng()
        val = rng.choice([
            "Egg and bacon", "Egg, sausage, and bacon", "Egg and Spam",
            "Egg, bacon, and Spam", "Egg, bacon, sausage, and Spam",
            "Spam, bacon, sausage, and Spam", "Lobster Thermidor",
        ])

        return val

    spec = tm.Spec()
    spec_dict = spec.generate()

    spec_dict["metadata"]["number_of_rows"] = 50
    spec_dict["metadata"]["categorical_columns"] = ["menu"]
    spec_dict["metadata"]["numerical_columns"] = ["price"]
    spec_dict["metadata"]["id"] = "main"

    # weights are provided for only two of the seven menu items; the
    # other five are treated as weight 1, so the weighted items end up
    # with roughly half the price of the unweighted ones.
    menu_df = pd.DataFrame(data={
        "menu" : ["Egg and bacon", "Lobster Thermidor", "Missing Data"],
        "price": [0.5, 0.5, 0.0]
    })

    spec_dict["columns"]["menu"] = tm.CategoricalColumn("menu", uniques=7, original_values=menu_df, anon_set=_generate_spam)
    spec_dict["columns"]["price"] = tm.NumericalColumn(distribution_parameters={"target_sum" : 1000, "dispersion": 0.2})

    exhibit_data = xbt.Exhibit(command="fromspec", source=spec_dict, output="dataframe")
    anon_df = exhibit_data.generate()

    test_items = ["Egg and bacon", "Lobster Thermidor"]

    # check that the average price of the two test items is about half the rest
    self.assertAlmostEqual(
        anon_df[anon_df["menu"].isin(test_items)]["price"].mean() * 2,
        anon_df[~anon_df["menu"].isin(test_items)]["price"].mean(),
        delta=3
    )

if __name__ == "__main__" and __package__ is None:
#overwrite __package__ builtin as per PEP 366
Expand Down
3 changes: 3 additions & 0 deletions exhibit/core/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,9 @@ def validate_anonymising_set_length(self, spec_dict=None):
attr="anonymising_set",
col_names=True,
types=["categorical"]):
# ignore anonymising_sets that have custom functions
if callable(v):
return True

if v.split(".")[0] in self.fixed_sql_sets:
col_uniques = spec_dict["columns"][c]["uniques"]
Expand Down
Loading

0 comments on commit 0484997

Please sign in to comment.