Fixed a few bugs related to SQL-based anonymised sets and added a new recipe
gherka · Nov 27, 2023 · 7a9fda0 · 7a9fda0
1 parent 7513305
commit 7a9fda0
Show file tree

Hide file tree

Showing 5 changed files with 426 additions and 9 deletions.
diff --git a/exhibit/core/generate/categorical.py b/exhibit/core/generate/categorical.py
@@ -501,10 +501,10 @@ def _generate_using_external_table(self, col_name, anon_set):
         # duplicates in case user didn't specify DISTINCT in his SQL query;
         # the anon_df would typically be from UUIDs that are generated before
         # categorical columns.
-        if self.anon_df is not None:
-            existing_data = pd.concat(self.generated_dfs + [self.anon_df], axis=1)
-        else:
+        if self.anon_df is None or self.anon_df.empty:
             existing_data = pd.concat(self.generated_dfs, axis=1)
+        else:
+            existing_data = pd.concat(self.generated_dfs + [self.anon_df], axis=1)
 
         # ensure the data going into DB is processed identically for join keys
         for col in join_columns:

diff --git a/exhibit/core/generate/weights.py b/exhibit/core/generate/weights.py
@@ -185,7 +185,12 @@ def target_columns_for_weights_table(spec_dict):
         anon_set = spec_dict["columns"][cat_col]["anonymising_set"]
         if (
             is_paired(spec_dict, cat_col) or
-            anon_set.split(".")[0] not in fixed_sql_sets):
+            # we keep the columns if they are in fixed sets or have custom SQL;
+            # because regex can be very variable, we assume that if anonymising set is not in
+            # fixed sets, and doesn't start with SELECT, it's regex and shouldn't have weights
+            (anon_set.split(".")[0] not in fixed_sql_sets and
+            anon_set.strip().upper()[:6] != "SELECT") 
+            ):
             cat_cols_set.remove(cat_col)
 
     return cat_cols_set

diff --git a/exhibit/core/spec.py b/exhibit/core/spec.py
@@ -462,7 +462,7 @@ def __init__(self, uuid_seed=0, freq_dist=None, miss_proba=0, anon_set="uuid"):
         miss_proba : float
             Percentage of records to be nulled.
         anon_set   : string
-            One of "uuid" or "range".
+            One of "uuid", "range", "pseudo_chi".
         '''
 
         self["type"] = "uuid"
@@ -634,7 +634,7 @@ def __init__(
             Optional SQL SELECT statement to pick the date values from.
         '''
 
-        if from_date is None and to_date is None: #pragma: no cover
+        if (from_date is None and to_date is None) and anonymising_set is None: #pragma: no cover
             raise RuntimeError(
                 f"{col_name} is missing at least one of from_date or to_date")
 

diff --git a/recipes/Using Exhibit in a script.py b/recipes/Using Exhibit in a script.py
@@ -37,9 +37,9 @@
 spec_dict["metadata"]["date_columns"] = ["discharge_date"]
 
 spec_dict["columns"]["id"] = UUIDColumn(uuid_seed=0)
-spec_dict["columns"]["hospital"] = CategoricalColumn("hospital", original_values="regex", anon_set="HOSP[1-9]{2}")
-spec_dict["columns"]["count"] = NumericalColumn(distribution_parameters={"target_min":1, "target_max":1000})
-spec_dict["columns"]["discharge_date"] = DateColumn("2020-01-01", 360 * 2, cross_join=False)
+spec_dict["columns"]["hospital"] = CategoricalColumn("hospital", uniques=5, original_values="regex", anon_set="HOSP[1-9]{2}")
+spec_dict["columns"]["count"] = NumericalColumn(distribution="normal", distribution_parameters={"target_min":1, "target_max":1000})
+spec_dict["columns"]["discharge_date"] = DateColumn("discharge_date", uniques=360 * 2, from_date="2020-01-01", cross_join=False)
 
 exhibit_data = xbt.Exhibit(command="fromspec", source=spec_dict, output="dataframe")
 anon_df = exhibit_data.generate()