From b92d400552ccb612c374d11fb283d90ff6aa15d3 Mon Sep 17 00:00:00 2001
From: gherka
Date: Fri, 29 Dec 2023 12:46:26 +0000
Subject: [PATCH] Removed incorrectly hard-coded Missing data values and fixed a few related bugs

---
 exhibit/core/formatters.py                        | 16 ++++++++++------
 exhibit/core/generate/categorical.py              |  2 +-
 exhibit/core/generate/tests/test_categorical.py   | 15 ++++++++-------
 exhibit/core/linkage/matrix.py                    | 12 +++++++++++-
 .../linkage/tests/test_linkage_hierarchical.py    |  6 +++---
 exhibit/core/spec.py                              | 14 +++++++-------
 exhibit/core/tests/test_spec.py                   | 11 ++++++++---
 recipes/Using SQL in anonymising sets.ipynb       |  4 ++--
 ...ng custom functions in anonymising sets.ipynb  |  2 +-
 9 files changed, 51 insertions(+), 31 deletions(-)

diff --git a/exhibit/core/formatters.py b/exhibit/core/formatters.py
index 1d548f5..196acf7 100644
--- a/exhibit/core/formatters.py
+++ b/exhibit/core/formatters.py
@@ -138,12 +138,16 @@ def build_list_of_probability_vectors(dataframe, original_series_name, ew=False)
 
     total_count = len(original_series)
 
-    temp_vectors = (original_series
-        .fillna(MISSING_DATA_STR)
-        .value_counts()
-        .sort_index(kind="mergesort")
-        .apply(lambda x: 0 if x == 0 else max(0.001, x / total_count))
-    )
+    # we need to ensure that the type of the original values is str, not mixed (object)
+    # after we've filled the NAs because otherwise NAs become 'nan' and are not handled right
+    temp_vectors_value_counts = (original_series
+        .fillna(MISSING_DATA_STR)
+        .value_counts())
+
+    temp_vectors = (temp_vectors_value_counts
+        .set_axis(temp_vectors_value_counts.index.astype(str))
+        .sort_index(kind="mergesort")
+        .apply(lambda x: 0 if x == 0 else max(0.001, x / total_count)))
 
     if MISSING_DATA_STR not in temp_vectors:
         temp_vectors = pd.concat([temp_vectors, pd.Series(
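The formatters.py fix above is easiest to see on a toy series - a minimal sketch of the failure mode, assuming an integer-like column with NAs (the data is made up):

    import numpy as np
    import pandas as pd

    MISSING_DATA_STR = "Missing data"

    age = pd.Series([30, 30, 45, np.nan], name="age")
    counts = age.fillna(MISSING_DATA_STR).value_counts()

    # calling counts.sort_index(kind="mergesort") at this point raises TypeError:
    # the index mixes floats (30.0, 45.0) with the "Missing data" string
    counts = counts.set_axis(counts.index.astype(str))
    print(counts.sort_index(kind="mergesort"))  # sorts cleanly on an all-str index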
["M", "F", "Missing Data"], + "gender" : ["M", "F", MISSING_DATA_STR], "probability_vector" : [0.5, 0.5, 0] }), "paired_columns": None, @@ -402,7 +403,7 @@ def test_column_with_using_case_statement_in_conditonal_sql(self): "type": "categorical", "uniques" : 2, "original_values" : pd.DataFrame(data={ - "age" : [1, 2, 5, 10, 17, 18, 19, 25, 50, 110, "Missing Data"], + "age" : [1, 2, 5, 10, 17, 18, 19, 25, 50, 110, MISSING_DATA_STR], "probability_vector" : [0.5] * 10 + [0] }), "paired_columns": None, @@ -454,7 +455,7 @@ def test_column_with_original_values_in_conditonal_sql(self): "type": "categorical", "uniques" : 10, "original_values" : pd.DataFrame(data={ - "age_at_birth" : [1, 2, 5, 10, 17, 18, 19, 25, 50, 110, "Missing Data"], + "age_at_birth" : [1, 2, 5, 10, 17, 18, 19, 25, 50, 110, MISSING_DATA_STR], "probability_vector" : [0.5] * 10 + [0] }), "paired_columns": None, @@ -465,7 +466,7 @@ def test_column_with_original_values_in_conditonal_sql(self): "type": "categorical", "uniques" : 10, "original_values" : pd.DataFrame(data={ - "age_at_death" : [5, 10, 25, 30, 40, 50, 60, 80, 90, 111, "Missing Data"], + "age_at_death" : [5, 10, 25, 30, 40, 50, 60, 80, 90, 111, MISSING_DATA_STR], "probability_vector" : [0.5] * 10 + [0] }), "paired_columns": None, @@ -500,7 +501,7 @@ def test_column_with_external_sql_values_and_probablities(self): }) original_vals = pd.DataFrame(data={ - "condition" : ["A", "B", "C", "D", "E", "Missing Data"], + "condition" : ["A", "B", "C", "D", "E", MISSING_DATA_STR], "probability_vector" : [0.1, 0.1, 0.5, 0.1, 0.2, 0.0], }) @@ -519,7 +520,7 @@ def test_column_with_external_sql_values_and_probablities(self): "type": "categorical", "uniques" : 2, "original_values" : pd.DataFrame(data={ - "gender" : ["M", "F", "Missing Data"], + "gender" : ["M", "F", MISSING_DATA_STR], "probability_vector" : [0.5, 0.5, 0] }), "paired_columns": None, diff --git a/exhibit/core/linkage/matrix.py b/exhibit/core/linkage/matrix.py index 379cf19..43c662a 100644 --- a/exhibit/core/linkage/matrix.py +++ b/exhibit/core/linkage/matrix.py @@ -120,7 +120,9 @@ def add_prefix(df, sep="__"): data_dict = {} for col in df.columns: - data_dict[col] = np.add(f"{col}{sep}", df[col].fillna(MISSING_DATA_STR).values) + # cast to str in case we're dealing with integer-based categorical columns, like age + df_col_str = df[col].fillna(MISSING_DATA_STR).astype(str) + data_dict[col] = np.add(f"{col}{sep}", df_col_str.values) return pd.DataFrame(data_dict) @@ -272,6 +274,14 @@ def process_row( label_matrix, proba_lookup, lcd, rng, ref_array, acc_array, i) target_proba = np.array([proba_lookup[x] for x in valid_targets]) + + # typically, there will be more than 1 value in target_proba, but we have to guard against + # possibility of there being just one value, and if its probability is zero (Missing data) + # then summing it to 1 will result in NA (division by zero). As a workaround, set proba to + # 1 whenever it's the only possible value - since having it less than 1 doesn't make sense. 
diff --git a/exhibit/core/linkage/tests/test_linkage_hierarchical.py b/exhibit/core/linkage/tests/test_linkage_hierarchical.py
index f908302..0cb3f8e 100644
--- a/exhibit/core/linkage/tests/test_linkage_hierarchical.py
+++ b/exhibit/core/linkage/tests/test_linkage_hierarchical.py
@@ -334,13 +334,13 @@ def test_scenario_1(self):
         This happens when the number of unique values in each column
         exceeds the user-specified threshold. In this case, the values
         are stored in exhibit DB and the user has no way to specify bespoke
-        probabilities. All SQL DB linked tables will have Missing Data as
+        probabilities. All SQL DB linked tables will have Missing data as
         the last row.
         '''
 
         sql_df = pd.DataFrame(data={
-            "A":list(sorted([f"A{i}" for i in range(5)]*2)) + ["Missing data"],
-            "B": [f"B{i}" for i in range(10)] + ["Missing data"]
+            "A":list(sorted([f"A{i}" for i in range(5)]*2)) + [MISSING_DATA_STR],
+            "B": [f"B{i}" for i in range(10)] + [MISSING_DATA_STR]
         })
 
         #we're bypassing __init__ and going straight to testing scenario code
diff --git a/exhibit/core/spec.py b/exhibit/core/spec.py
index 8eb6cda..9708007 100644
--- a/exhibit/core/spec.py
+++ b/exhibit/core/spec.py
@@ -107,7 +107,7 @@ def __init__(self, data=None, inline_limit=30, ew=False, random_seed=0, **kwargs
         self.user_linked_cols = kwargs.get("user_linked_cols", [])
         self.uuid_cols = kwargs.get("uuid_cols", set())
         self.db_prob_cols = kwargs.get("save_probabilities", set())
-        self.id = generate_table_id()
+        self.id = kwargs.get("id", generate_table_id())
 
         self.numerical_cols = (
             set(self.df.select_dtypes(include=np.number).columns.values) -
@@ -484,7 +484,7 @@ class CategoricalColumn(dict):
     def __init__(self,
         name, original_values, original_probs=None,
         paired_columns=None, uniques=None, cross_join=False,
-        miss_proba=0, anon_set="random", dispersion=0):
+        miss_proba=None, anon_set="random", dispersion=0):
         '''
         Parameters
         ----------
@@ -493,8 +493,8 @@ def __init__(self,
             name to ensure smooth operation of the synthesis.
         original_values : str | list | pd.DataFrame
             A flexible way to provide instructions on what values to synthesise. You don't
-            need to provide the Missing Data value and its probability; these are added
-            automatically with Missing Data having zero probability.
+            need to provide the Missing data value and its probability; these are added
+            automatically with Missing data having zero probability.
         original_probs : list
             Only valid if original_values were provided as a list. The order of
             probabilities must match the order of original_values. Defaults to equal
@@ -527,7 +527,7 @@ def __init__(self,
         self["paired_columns"] = [] if paired_columns is None else paired_columns
         self["uniques"] = 0 if uniques is None else uniques
         self["cross_join_all_unique_values"] = cross_join
-        self["miss_probability"] = miss_proba
+        self["miss_probability"] = 0 if miss_proba is None else miss_proba
         self["anonymising_set"] = anon_set
         self["dispersion"] = dispersion
@@ -547,7 +547,7 @@ def __init__(self,
         # if we have missing data in the original list, we have two possibilities:
         # we have a probability vector in which case it's taken care of, or not.
         # we assume that missing data is the last item in the original values / probas
-        if MISSING_DATA_STR in original_values:
+        if MISSING_DATA_STR in original_values and miss_proba is None:
             if original_probs is None:
                 # take the equal probability we've derived earlier
                 self["miss_probability"] = prob_vector[0]
@@ -572,7 +572,7 @@ def __init__(self,
 
         if isinstance(original_values, pd.DataFrame):
             # check for missing data in the provided data frame
-            if MISSING_DATA_STR in original_values[name].unique():
+            if MISSING_DATA_STR in original_values[name].unique() and miss_proba is None:
                 ov_arr = original_values[name].to_numpy()
                 proba_arr = original_values["probability_vector"].to_numpy()
                 self["miss_probability"] = proba_arr[ov_arr== MISSING_DATA_STR].item()
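Taken together, the CategoricalColumn changes above let a miss_proba of None be told apart from an explicit 0. A usage sketch based on the signature and branches in this hunk (column names are made up, and the import path assumes CategoricalColumn is importable from exhibit.core.spec):

    from exhibit.core.constants import MISSING_DATA_STR
    from exhibit.core.spec import CategoricalColumn

    # derived: Missing data appears in original_values, so its equal-share
    # probability becomes miss_probability
    col_a = CategoricalColumn("col_a",
        original_values=["spam", "ham", MISSING_DATA_STR])

    # explicit: miss_proba is not None, so it wins over anything derived
    col_b = CategoricalColumn("col_b",
        original_values=["spam", "ham"], miss_proba=0.5)

    # neither: miss_probability falls back to 0
    col_c = CategoricalColumn("col_c", original_values=["spam", "ham"])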
diff --git a/exhibit/core/tests/test_spec.py b/exhibit/core/tests/test_spec.py
index 421e60c..f5230d7 100644
--- a/exhibit/core/tests/test_spec.py
+++ b/exhibit/core/tests/test_spec.py
@@ -193,8 +193,13 @@ def _generate_spam(_):
         spec_dict["metadata"]["numerical_columns"] = ["price"]
         spec_dict["metadata"]["id"] = "main"
 
+        # note that even though original_values only include 2 values (+ missing data),
+        # the synthetic dataset will have more; it's just that the weights / probabilities
+        # will only affect these two - to save users from listing all values if they only
+        # want to change a couple.
         menu_df = pd.DataFrame(data={
-            "menu" : ["Egg and bacon", "Lobster Thermidor", "Missing Data"],
+            "menu" : ["Egg and bacon", "Lobster Thermidor", MISSING_DATA_STR],
+            "probability_vector" : [0.5, 0.5, 0.0],
             "price": [0.5, 0.5, 0.0]
         })
 
@@ -216,7 +221,7 @@ def _generate_spam(_):
     def test_categorical_column_initialised_from_dataframe_with_missing_data(self):
         '''
         If users don't explicitly provide a miss_proba argument to CategoricalColumn,
-        but original_data has Missing Data value, we'll take the probability of that
+        but original_data has a Missing data value, we'll take the probability of that
         and use it as miss_proba - otherwise, no missing data will be added.
         '''
@@ -236,7 +241,7 @@ def test_categorical_column_initialised_from_dataframe_with_missing_data(self):
             original_values=["spam", "ham", "eggs", "spamspam", MISSING_DATA_STR],
         )
 
-        # standard list without Missing Data, but with miss proba argument
+        # standard list without Missing data, but with a miss_proba argument
         spec_dict["columns"]["list_3"] = tm.CategoricalColumn("list_3",
             original_values=["spam", "ham", "eggs", "spamspam"],
             miss_proba=0.5
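The comment added to the first test above deserves spelling out - a sketch of initialising a CategoricalColumn from a partial original_values DataFrame, modelled on that test (the values are made up):

    import pandas as pd
    from exhibit.core.constants import MISSING_DATA_STR
    from exhibit.core.spec import CategoricalColumn

    # only two menu items get bespoke probabilities and price weights;
    # other values found in the source data are still synthesised
    menu_df = pd.DataFrame(data={
        "menu" : ["Egg and bacon", "Lobster Thermidor", MISSING_DATA_STR],
        "probability_vector" : [0.5, 0.5, 0.0],
        "price" : [0.5, 0.5, 0.0],
    })

    # no miss_proba is given, so the 0.0 from the Missing data row is used
    menu_col = CategoricalColumn("menu", original_values=menu_df)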
diff --git a/recipes/Using SQL in anonymising sets.ipynb b/recipes/Using SQL in anonymising sets.ipynb
index 12ce9e4..dc714e0 100644
--- a/recipes/Using SQL in anonymising sets.ipynb
+++ b/recipes/Using SQL in anonymising sets.ipynb
@@ -75,7 +75,7 @@
     "# You can specify custom probabilities and weights for numerical columns,\n",
     "# just like you would for a standard categorical column\n",
     "condition_data = pd.DataFrame(data={\n",
-    "    \"condition\" : [\"A\", \"B\", \"C\", \"D\", \"E\", \"Missing Data\"],\n",
+    "    \"condition\" : [\"A\", \"B\", \"C\", \"D\", \"E\", \"Missing data\"],\n",
     "    \"probability_vector\" : [0.1, 0.1, 0.5, 0.1, 0.2, 0.0],\n",
     "    \"count\" : [0.1, 0.1, 0.1, 0.1, 0.6, 0.0],\n",
     "})\n",
@@ -89,7 +89,7 @@
     "\"\"\"\n",
     "\n",
     "gender_data = pd.DataFrame(data={\n",
-    "    \"gender\" : [\"M\", \"F\", \"Missing Data\"],\n",
+    "    \"gender\" : [\"M\", \"F\", \"Missing data\"],\n",
     "    \"probability_vector\" : [0.5, 0.5, 0],\n",
     "})\n",
     "\n",
diff --git a/recipes/Using custom functions in anonymising sets.ipynb b/recipes/Using custom functions in anonymising sets.ipynb
index bf61190..ae00547 100644
--- a/recipes/Using custom functions in anonymising sets.ipynb
+++ b/recipes/Using custom functions in anonymising sets.ipynb
@@ -271,7 +271,7 @@
     "spec_dict[\"metadata\"][\"id\"] = \"main\"\n",
     "\n",
     "smoker_data = pd.DataFrame(data={\n",
-    "    \"smoker\": [\"Y\", \"N\", \"No Answer\", \"Missing Data\"],\n",
+    "    \"smoker\": [\"Y\", \"N\", \"No Answer\", \"Missing data\"],\n",
     "    \"probability_vector\": [0.2, 0.7, 0.1, 0]\n",
    "})\n",
    "\n",
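For reference, every edit in this patch converges on the single constant below - a one-line sanity check, assuming the constant's value matches the spelling the recipes now use:

    from exhibit.core.constants import MISSING_DATA_STR

    assert MISSING_DATA_STR == "Missing data"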