From b92d400552ccb612c374d11fb283d90ff6aa15d3 Mon Sep 17 00:00:00 2001
From: gherka
Date: Fri, 29 Dec 2023 12:46:26 +0000
Subject: [PATCH] Removed incorrectly hard-coded Missing data values and fixed a few related bugs

---
 exhibit/core/formatters.py                        | 16 ++++++++++------
 exhibit/core/generate/categorical.py              |  2 +-
 exhibit/core/generate/tests/test_categorical.py   | 15 ++++++++-------
 exhibit/core/linkage/matrix.py                    | 12 +++++++++++-
 .../linkage/tests/test_linkage_hierarchical.py    |  6 +++---
 exhibit/core/spec.py                              | 14 +++++++-------
 exhibit/core/tests/test_spec.py                   | 11 ++++++++---
 recipes/Using SQL in anonymising sets.ipynb       |  4 ++--
 ...ng custom functions in anonymising sets.ipynb  |  2 +-
 9 files changed, 51 insertions(+), 31 deletions(-)

diff --git a/exhibit/core/formatters.py b/exhibit/core/formatters.py
index 1d548f5..196acf7 100644
--- a/exhibit/core/formatters.py
+++ b/exhibit/core/formatters.py
@@ -138,12 +138,16 @@ def build_list_of_probability_vectors(dataframe, original_series_name, ew=False)
 
     total_count = len(original_series)
 
-    temp_vectors = (original_series
-        .fillna(MISSING_DATA_STR)
-        .value_counts()
-        .sort_index(kind="mergesort")
-        .apply(lambda x: 0 if x == 0 else max(0.001, x / total_count))
-    )
+    # we need to ensure that the type of the original values is str, not mixed (object)
+    # after we've filled the NAs because otherwise NAs become 'nan' and are not handled right
+    temp_vectors_value_counts = (original_series
+        .fillna(MISSING_DATA_STR)
+        .value_counts())
+
+    temp_vectors = (temp_vectors_value_counts
+        .set_axis(temp_vectors_value_counts.index.astype(str))
+        .sort_index(kind="mergesort")
+        .apply(lambda x: 0 if x == 0 else max(0.001, x / total_count)))
 
     if MISSING_DATA_STR not in temp_vectors:
         temp_vectors = pd.concat([temp_vectors, pd.Series(
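The formatters.py fix above is easiest to see on a toy series - a minimal sketch of the failure mode, assuming an integer-like column with NAs (the data is made up):

    import numpy as np
    import pandas as pd

    MISSING_DATA_STR = "Missing data"

    age = pd.Series([30, 30, 45, np.nan], name="age")
    counts = age.fillna(MISSING_DATA_STR).value_counts()

    # calling counts.sort_index(kind="mergesort") at this point raises TypeError:
    # the index mixes floats (30.0, 45.0) with the "Missing data" string
    counts = counts.set_axis(counts.index.astype(str))
    print(counts.sort_index(kind="mergesort"))  # sorts cleanly on an all-str index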
["M", "F", "Missing Data"], + "gender" : ["M", "F", MISSING_DATA_STR], "probability_vector" : [0.5, 0.5, 0] }), "paired_columns": None, @@ -402,7 +403,7 @@ def test_column_with_using_case_statement_in_conditonal_sql(self): "type": "categorical", "uniques" : 2, "original_values" : pd.DataFrame(data={ - "age" : [1, 2, 5, 10, 17, 18, 19, 25, 50, 110, "Missing Data"], + "age" : [1, 2, 5, 10, 17, 18, 19, 25, 50, 110, MISSING_DATA_STR], "probability_vector" : [0.5] * 10 + [0] }), "paired_columns": None, @@ -454,7 +455,7 @@ def test_column_with_original_values_in_conditonal_sql(self): "type": "categorical", "uniques" : 10, "original_values" : pd.DataFrame(data={ - "age_at_birth" : [1, 2, 5, 10, 17, 18, 19, 25, 50, 110, "Missing Data"], + "age_at_birth" : [1, 2, 5, 10, 17, 18, 19, 25, 50, 110, MISSING_DATA_STR], "probability_vector" : [0.5] * 10 + [0] }), "paired_columns": None, @@ -465,7 +466,7 @@ def test_column_with_original_values_in_conditonal_sql(self): "type": "categorical", "uniques" : 10, "original_values" : pd.DataFrame(data={ - "age_at_death" : [5, 10, 25, 30, 40, 50, 60, 80, 90, 111, "Missing Data"], + "age_at_death" : [5, 10, 25, 30, 40, 50, 60, 80, 90, 111, MISSING_DATA_STR], "probability_vector" : [0.5] * 10 + [0] }), "paired_columns": None, @@ -500,7 +501,7 @@ def test_column_with_external_sql_values_and_probablities(self): }) original_vals = pd.DataFrame(data={ - "condition" : ["A", "B", "C", "D", "E", "Missing Data"], + "condition" : ["A", "B", "C", "D", "E", MISSING_DATA_STR], "probability_vector" : [0.1, 0.1, 0.5, 0.1, 0.2, 0.0], }) @@ -519,7 +520,7 @@ def test_column_with_external_sql_values_and_probablities(self): "type": "categorical", "uniques" : 2, "original_values" : pd.DataFrame(data={ - "gender" : ["M", "F", "Missing Data"], + "gender" : ["M", "F", MISSING_DATA_STR], "probability_vector" : [0.5, 0.5, 0] }), "paired_columns": None, diff --git a/exhibit/core/linkage/matrix.py b/exhibit/core/linkage/matrix.py index 379cf19..43c662a 100644 --- a/exhibit/core/linkage/matrix.py +++ b/exhibit/core/linkage/matrix.py @@ -120,7 +120,9 @@ def add_prefix(df, sep="__"): data_dict = {} for col in df.columns: - data_dict[col] = np.add(f"{col}{sep}", df[col].fillna(MISSING_DATA_STR).values) + # cast to str in case we're dealing with integer-based categorical columns, like age + df_col_str = df[col].fillna(MISSING_DATA_STR).astype(str) + data_dict[col] = np.add(f"{col}{sep}", df_col_str.values) return pd.DataFrame(data_dict) @@ -272,6 +274,14 @@ def process_row( label_matrix, proba_lookup, lcd, rng, ref_array, acc_array, i) target_proba = np.array([proba_lookup[x] for x in valid_targets]) + + # typically, there will be more than 1 value in target_proba, but we have to guard against + # possibility of there being just one value, and if its probability is zero (Missing data) + # then summing it to 1 will result in NA (division by zero). As a workaround, set proba to + # 1 whenever it's the only possible value - since having it less than 1 doesn't make sense. 
diff --git a/exhibit/core/linkage/tests/test_linkage_hierarchical.py b/exhibit/core/linkage/tests/test_linkage_hierarchical.py
index f908302..0cb3f8e 100644
--- a/exhibit/core/linkage/tests/test_linkage_hierarchical.py
+++ b/exhibit/core/linkage/tests/test_linkage_hierarchical.py
@@ -334,13 +334,13 @@ def test_scenario_1(self):
         This happens when the number of unique values in each column
         exceeds the user-specified threshold. In this case, the values
         are stored in exhibit DB and the user has no way to specify bespoke
-        probabilities. All SQL DB linked tables will have Missing Data as
+        probabilities. All SQL DB linked tables will have Missing data as
         the last row.
         '''
 
         sql_df = pd.DataFrame(data={
-            "A":list(sorted([f"A{i}" for i in range(5)]*2)) + ["Missing data"],
-            "B": [f"B{i}" for i in range(10)] + ["Missing data"]
+            "A":list(sorted([f"A{i}" for i in range(5)]*2)) + [MISSING_DATA_STR],
+            "B": [f"B{i}" for i in range(10)] + [MISSING_DATA_STR]
         })
 
         #we're bypassing __init__ and going straight to testing scenario code
diff --git a/exhibit/core/spec.py b/exhibit/core/spec.py
index 8eb6cda..9708007 100644
--- a/exhibit/core/spec.py
+++ b/exhibit/core/spec.py
@@ -107,7 +107,7 @@ def __init__(self, data=None, inline_limit=30, ew=False, random_seed=0, **kwargs
         self.user_linked_cols = kwargs.get("user_linked_cols", [])
         self.uuid_cols = kwargs.get("uuid_cols", set())
         self.db_prob_cols = kwargs.get("save_probabilities", set())
-        self.id = generate_table_id()
+        self.id = kwargs.get("id", generate_table_id())
 
         self.numerical_cols = (
             set(self.df.select_dtypes(include=np.number).columns.values) -
@@ -484,7 +484,7 @@ class CategoricalColumn(dict):
     def __init__(self,
         name, original_values, original_probs=None,
         paired_columns=None, uniques=None, cross_join=False,
-        miss_proba=0, anon_set="random", dispersion=0):
+        miss_proba=None, anon_set="random", dispersion=0):
         '''
         Parameters
         ----------
@@ -493,8 +493,8 @@ def __init__(self,
             name to ensure smooth operation of the synthesis.
         original_values : str | list | pd.DataFrame
             A flexible way to provide instructions on what values to synthesise. You don't
-            need to provide the Missing Data value and its probability; these are added
-            automatically with Missing Data having zero probability.
+            need to provide the Missing data value and its probability; these are added
+            automatically with Missing data having zero probability.
         original_probs : list
             Only valid if original_values were provided as a list. The order of
             probabilities must match the order of original_values. Defaults to equal
@@ -527,7 +527,7 @@ def __init__(self,
         self["paired_columns"] = [] if paired_columns is None else paired_columns
         self["uniques"] = 0 if uniques is None else uniques
         self["cross_join_all_unique_values"] = cross_join
-        self["miss_probability"] = miss_proba
+        self["miss_probability"] = 0 if miss_proba is None else miss_proba
         self["anonymising_set"] = anon_set
         self["dispersion"] = dispersion
@@ -547,7 +547,7 @@ def __init__(self,
         # if we have missing data in the original list, we have two possibilities:
         # we have a probability vector in which case it's taken care of, or not.
         # we assume that missing data is the last item in the original values / probas
-        if MISSING_DATA_STR in original_values:
+        if MISSING_DATA_STR in original_values and miss_proba is None:
             if original_probs is None:
                 # take the equal probability we've derived earlier
                 self["miss_probability"] = prob_vector[0]
@@ -572,7 +572,7 @@ def __init__(self,
 
         if isinstance(original_values, pd.DataFrame):
             # check for missing data in the provided data frame
-            if MISSING_DATA_STR in original_values[name].unique():
+            if MISSING_DATA_STR in original_values[name].unique() and miss_proba is None:
                 ov_arr = original_values[name].to_numpy()
                 proba_arr = original_values["probability_vector"].to_numpy()
                 self["miss_probability"] = proba_arr[ov_arr== MISSING_DATA_STR].item()
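Taken together, the CategoricalColumn changes above let a miss_proba of None be told apart from an explicit 0. A usage sketch based on the signature and branches in this hunk (column names are made up, and the import path assumes CategoricalColumn is importable from exhibit.core.spec):

    from exhibit.core.constants import MISSING_DATA_STR
    from exhibit.core.spec import CategoricalColumn

    # derived: Missing data appears in original_values, so its equal-share
    # probability becomes miss_probability
    col_a = CategoricalColumn("col_a",
        original_values=["spam", "ham", MISSING_DATA_STR])

    # explicit: miss_proba is not None, so it wins over anything derived
    col_b = CategoricalColumn("col_b",
        original_values=["spam", "ham"], miss_proba=0.5)

    # neither: miss_probability falls back to 0
    col_c = CategoricalColumn("col_c", original_values=["spam", "ham"])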
diff --git a/exhibit/core/tests/test_spec.py b/exhibit/core/tests/test_spec.py
index 421e60c..f5230d7 100644
--- a/exhibit/core/tests/test_spec.py
+++ b/exhibit/core/tests/test_spec.py
@@ -193,8 +193,13 @@ def _generate_spam(_):
         spec_dict["metadata"]["numerical_columns"] = ["price"]
         spec_dict["metadata"]["id"] = "main"
 
+        # note that even though original_values only include 2 values (+ missing data),
+        # the synthetic dataset will have more; it's just that the weights / probabilities
+        # will only affect these two - to save users from listing all values if they only
+        # want to change a couple.
         menu_df = pd.DataFrame(data={
-            "menu" : ["Egg and bacon", "Lobster Thermidor", "Missing Data"],
+            "menu" : ["Egg and bacon", "Lobster Thermidor", MISSING_DATA_STR],
+            "probability_vector" : [0.5, 0.5, 0.0],
             "price": [0.5, 0.5, 0.0]
         })
 
@@ -216,7 +221,7 @@ def _generate_spam(_):
     def test_categorical_column_initialised_from_dataframe_with_missing_data(self):
         '''
         If users don't explicitly provide a miss_proba argument to CategoricalColumn,
-        but original_data has Missing Data value, we'll take the probability of that
+        but original_data has a Missing data value, we'll take the probability of that
         and use it as miss_proba - otherwise, no missing data will be added.
         '''
@@ -236,7 +241,7 @@ def test_categorical_column_initialised_from_dataframe_with_missing_data(self):
             original_values=["spam", "ham", "eggs", "spamspam", MISSING_DATA_STR],
         )
 
-        # standard list without Missing Data, but with miss proba argument
+        # standard list without Missing data, but with a miss_proba argument
         spec_dict["columns"]["list_3"] = tm.CategoricalColumn("list_3",
             original_values=["spam", "ham", "eggs", "spamspam"],
             miss_proba=0.5
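The comment added to the first test above deserves spelling out - a sketch of initialising a CategoricalColumn from a partial original_values DataFrame, modelled on that test (the values are made up):

    import pandas as pd
    from exhibit.core.constants import MISSING_DATA_STR
    from exhibit.core.spec import CategoricalColumn

    # only two menu items get bespoke probabilities and price weights;
    # other values found in the source data are still synthesised
    menu_df = pd.DataFrame(data={
        "menu" : ["Egg and bacon", "Lobster Thermidor", MISSING_DATA_STR],
        "probability_vector" : [0.5, 0.5, 0.0],
        "price" : [0.5, 0.5, 0.0],
    })

    # no miss_proba is given, so the 0.0 from the Missing data row is used
    menu_col = CategoricalColumn("menu", original_values=menu_df)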
diff --git a/recipes/Using SQL in anonymising sets.ipynb b/recipes/Using SQL in anonymising sets.ipynb
index 12ce9e4..dc714e0 100644
--- a/recipes/Using SQL in anonymising sets.ipynb
+++ b/recipes/Using SQL in anonymising sets.ipynb
@@ -75,7 +75,7 @@
     "# You can specify custom probabilities and weights for numerical columns,\n",
     "# just like you would for a standard categorical column\n",
     "condition_data = pd.DataFrame(data={\n",
-    "    \"condition\" : [\"A\", \"B\", \"C\", \"D\", \"E\", \"Missing Data\"],\n",
+    "    \"condition\" : [\"A\", \"B\", \"C\", \"D\", \"E\", \"Missing data\"],\n",
     "    \"probability_vector\" : [0.1, 0.1, 0.5, 0.1, 0.2, 0.0],\n",
     "    \"count\" : [0.1, 0.1, 0.1, 0.1, 0.6, 0.0],\n",
     "})\n",
@@ -89,7 +89,7 @@
     "\"\"\"\n",
     "\n",
     "gender_data = pd.DataFrame(data={\n",
-    "    \"gender\" : [\"M\", \"F\", \"Missing Data\"],\n",
+    "    \"gender\" : [\"M\", \"F\", \"Missing data\"],\n",
     "    \"probability_vector\" : [0.5, 0.5, 0],\n",
     "})\n",
     "\n",
diff --git a/recipes/Using custom functions in anonymising sets.ipynb b/recipes/Using custom functions in anonymising sets.ipynb
index bf61190..ae00547 100644
--- a/recipes/Using custom functions in anonymising sets.ipynb
+++ b/recipes/Using custom functions in anonymising sets.ipynb
@@ -271,7 +271,7 @@
     "spec_dict[\"metadata\"][\"id\"] = \"main\"\n",
     "\n",
     "smoker_data = pd.DataFrame(data={\n",
-    "    \"smoker\": [\"Y\", \"N\", \"No Answer\", \"Missing Data\"],\n",
+    "    \"smoker\": [\"Y\", \"N\", \"No Answer\", \"Missing data\"],\n",
     "    \"probability_vector\": [0.2, 0.7, 0.1, 0]\n",
    "})\n",
    "\n",
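For reference, every edit in this patch converges on the single constant below - a one-line sanity check, assuming the constant's value matches the spelling the recipes now use:

    from exhibit.core.constants import MISSING_DATA_STR

    assert MISSING_DATA_STR == "Missing data"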