Skip to content

Commit

Permalink
Fixed a few bugs related to SQL-based anonymised sets and added a new…
Browse files Browse the repository at this point in the history
… recipe
  • Loading branch information
gherka committed Nov 27, 2023
1 parent 7513305 commit 7a9fda0
Show file tree
Hide file tree
Showing 5 changed files with 426 additions and 9 deletions.
6 changes: 3 additions & 3 deletions exhibit/core/generate/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -501,10 +501,10 @@ def _generate_using_external_table(self, col_name, anon_set):
# duplicates in case user didn't specify DISTINCT in his SQL query;
# the anon_df would typically be from UUIDs that are generated before
# categorical columns.
if self.anon_df is not None:
existing_data = pd.concat(self.generated_dfs + [self.anon_df], axis=1)
else:
if self.anon_df is None or self.anon_df.empty:
existing_data = pd.concat(self.generated_dfs, axis=1)
else:
existing_data = pd.concat(self.generated_dfs + [self.anon_df], axis=1)

# ensure the data going into DB is processed identically for join keys
for col in join_columns:
Expand Down
7 changes: 6 additions & 1 deletion exhibit/core/generate/weights.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,12 @@ def target_columns_for_weights_table(spec_dict):
anon_set = spec_dict["columns"][cat_col]["anonymising_set"]
if (
is_paired(spec_dict, cat_col) or
anon_set.split(".")[0] not in fixed_sql_sets):
# we keep the columns if they are in fixed sets or have custom SQL;
# because regex can be very variable, we assume that if anonymising set is not in
# fixed sets, and doesn't start with SELECT, it's regex and shouldn't have weights
(anon_set.split(".")[0] not in fixed_sql_sets and
anon_set.strip().upper()[:6] != "SELECT")
):
cat_cols_set.remove(cat_col)

return cat_cols_set
Expand Down
4 changes: 2 additions & 2 deletions exhibit/core/spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,7 +462,7 @@ def __init__(self, uuid_seed=0, freq_dist=None, miss_proba=0, anon_set="uuid"):
miss_proba : float
Percentage of records to be nulled.
anon_set : string
One of "uuid" or "range".
One of "uuid", "range", "pseudo_chi".
'''

self["type"] = "uuid"
Expand Down Expand Up @@ -634,7 +634,7 @@ def __init__(
Optional SQL SELECT statement to pick the date values from.
'''

if from_date is None and to_date is None: #pragma: no cover
if (from_date is None and to_date is None) and anonymising_set is None: #pragma: no cover
raise RuntimeError(
f"{col_name} is missing at least one of from_date or to_date")

Expand Down
6 changes: 3 additions & 3 deletions recipes/Using Exhibit in a script.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@
spec_dict["metadata"]["date_columns"] = ["discharge_date"]

spec_dict["columns"]["id"] = UUIDColumn(uuid_seed=0)
spec_dict["columns"]["hospital"] = CategoricalColumn("hospital", original_values="regex", anon_set="HOSP[1-9]{2}")
spec_dict["columns"]["count"] = NumericalColumn(distribution_parameters={"target_min":1, "target_max":1000})
spec_dict["columns"]["discharge_date"] = DateColumn("2020-01-01", 360 * 2, cross_join=False)
spec_dict["columns"]["hospital"] = CategoricalColumn("hospital", uniques=5, original_values="regex", anon_set="HOSP[1-9]{2}")
spec_dict["columns"]["count"] = NumericalColumn(distribution="normal", distribution_parameters={"target_min":1, "target_max":1000})
spec_dict["columns"]["discharge_date"] = DateColumn("discharge_date", uniques=360 * 2, from_date="2020-01-01", cross_join=False)

exhibit_data = xbt.Exhibit(command="fromspec", source=spec_dict, output="dataframe")
anon_df = exhibit_data.generate()
Expand Down
Loading

0 comments on commit 7a9fda0

Please sign in to comment.