Commit b92d400
Removed incorrectly hard-coded Missing data values and fixed a few related bugs
gherka committed Jan 3, 2024
1 parent 733769c commit b92d400
Showing 9 changed files with 51 additions and 31 deletions.
16 changes: 10 additions & 6 deletions exhibit/core/formatters.py
@@ -138,12 +138,16 @@ def build_list_of_probability_vectors(dataframe, original_series_name, ew=False)
 
     total_count = len(original_series)
 
-    temp_vectors = (original_series
-        .fillna(MISSING_DATA_STR)
-        .value_counts()
-        .sort_index(kind="mergesort")
-        .apply(lambda x: 0 if x == 0 else max(0.001, x / total_count))
-    )
+    # we need to ensure that the type of the original values is str, not mixed (object)
+    # after we've filled the NAs because otherwise NAs become 'nan' and are not handled right
+    temp_vectors_value_counts = (original_series
+        .fillna(MISSING_DATA_STR)
+        .value_counts())
+
+    temp_vectors = (temp_vectors_value_counts
+        .set_axis(temp_vectors_value_counts.index.astype(str))
+        .sort_index(kind="mergesort")
+        .apply(lambda x: 0 if x == 0 else max(0.001, x / total_count)))
 
     if MISSING_DATA_STR not in temp_vectors:
         temp_vectors = pd.concat([temp_vectors, pd.Series(
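A minimal sketch of the failure mode this hunk guards against (assuming MISSING_DATA_STR holds the "Missing data" string, as used throughout this commit): on a numeric column, fillna leaves an object index mixing floats and strings, which mergesort cannot compare.

    import pandas as pd

    MISSING_DATA_STR = "Missing data"  # assumed value of the constant

    # an integer-based column with a missing value
    original_series = pd.Series([25, 50, 50, None], name="age")
    counts = original_series.fillna(MISSING_DATA_STR).value_counts()

    try:
        # the index mixes floats and strings, so sorting raises
        counts.sort_index(kind="mergesort")
    except TypeError as e:
        print(e)  # '<' not supported between instances of 'str' and 'float'

    # the fix: make the index uniformly str before sorting
    print(counts.set_axis(counts.index.astype(str)).sort_index(kind="mergesort"))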
2 changes: 1 addition & 1 deletion exhibit/core/generate/categorical.py
@@ -286,7 +286,7 @@ def _generate_anon_series(self, col_name):
         aliased_df = orig_df.applymap(lambda x: aliases.get(x, x))
         self.spec_dict["columns"][col_name]["original_values"] = aliased_df
 
-        # we ignore Missing Data probability when we originally create the variable
+        # we ignore Missing data probability when we originally create the variable
         idx = self.rng.choice(a=len(sql_df), p=col_prob, size=self.num_rows)
         anon_list = [sql_df.iloc[x, :].values for x in idx]
         anon_df = pd.DataFrame(columns=sql_df.columns, data=anon_list)
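For readers unfamiliar with this step, a self-contained sketch of the sampling pattern around the touched comment (the frame, probabilities and size are illustrative, not exhibit's actual data):

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    sql_df = pd.DataFrame({"gender": ["M", "F"]})
    col_prob = [0.5, 0.5]  # Missing data probability excluded at this stage

    # draw row indices with the given probabilities, then rebuild the frame
    idx = rng.choice(a=len(sql_df), p=col_prob, size=5)
    anon_list = [sql_df.iloc[x, :].values for x in idx]
    anon_df = pd.DataFrame(columns=sql_df.columns, data=anon_list)
    print(anon_df)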
15 changes: 8 additions & 7 deletions exhibit/core/generate/tests/test_categorical.py
@@ -20,6 +20,7 @@
 from exhibit.db import db_util
 from exhibit.core.sql import create_temp_table
 from exhibit.core.tests.test_reference import temp_exhibit
+from exhibit.core.constants import MISSING_DATA_STR
 
 # Module under test
 from exhibit.core.generate import categorical as tm
@@ -246,7 +247,7 @@ def test_column_with_categorical_values_based_on_conditonal_sql(self):
             "type": "categorical",
             "uniques" : 2,
             "original_values" : pd.DataFrame(data={
-                "gender" : ["M", "F", "Missing Data"],
+                "gender" : ["M", "F", MISSING_DATA_STR],
                 "probability_vector" : [0.5, 0.5, 0]
             }),
             "paired_columns": None,
@@ -307,7 +308,7 @@ def test_column_with_external_date_values_in_conditonal_sql(self):
             "type": "categorical",
             "uniques" : 2,
             "original_values" : pd.DataFrame(data={
-                "gender" : ["M", "F", "Missing Data"],
+                "gender" : ["M", "F", MISSING_DATA_STR],
                 "probability_vector" : [0.5, 0.5, 0]
             }),
             "paired_columns": None,
@@ -402,7 +403,7 @@ def test_column_with_using_case_statement_in_conditonal_sql(self):
             "type": "categorical",
             "uniques" : 2,
             "original_values" : pd.DataFrame(data={
-                "age" : [1, 2, 5, 10, 17, 18, 19, 25, 50, 110, "Missing Data"],
+                "age" : [1, 2, 5, 10, 17, 18, 19, 25, 50, 110, MISSING_DATA_STR],
                 "probability_vector" : [0.5] * 10 + [0]
             }),
             "paired_columns": None,
@@ -454,7 +455,7 @@ def test_column_with_original_values_in_conditonal_sql(self):
             "type": "categorical",
             "uniques" : 10,
             "original_values" : pd.DataFrame(data={
-                "age_at_birth" : [1, 2, 5, 10, 17, 18, 19, 25, 50, 110, "Missing Data"],
+                "age_at_birth" : [1, 2, 5, 10, 17, 18, 19, 25, 50, 110, MISSING_DATA_STR],
                 "probability_vector" : [0.5] * 10 + [0]
             }),
             "paired_columns": None,
@@ -465,7 +466,7 @@ def test_column_with_original_values_in_conditonal_sql(self):
             "type": "categorical",
             "uniques" : 10,
             "original_values" : pd.DataFrame(data={
-                "age_at_death" : [5, 10, 25, 30, 40, 50, 60, 80, 90, 111, "Missing Data"],
+                "age_at_death" : [5, 10, 25, 30, 40, 50, 60, 80, 90, 111, MISSING_DATA_STR],
                 "probability_vector" : [0.5] * 10 + [0]
             }),
             "paired_columns": None,
@@ -500,7 +501,7 @@ def test_column_with_external_sql_values_and_probablities(self):
         })
 
         original_vals = pd.DataFrame(data={
-            "condition" : ["A", "B", "C", "D", "E", "Missing Data"],
+            "condition" : ["A", "B", "C", "D", "E", MISSING_DATA_STR],
             "probability_vector" : [0.1, 0.1, 0.5, 0.1, 0.2, 0.0],
         })
 
@@ -519,7 +520,7 @@ def test_column_with_external_sql_values_and_probablities(self):
             "type": "categorical",
             "uniques" : 2,
             "original_values" : pd.DataFrame(data={
-                "gender" : ["M", "F", "Missing Data"],
+                "gender" : ["M", "F", MISSING_DATA_STR],
                 "probability_vector" : [0.5, 0.5, 0]
             }),
             "paired_columns": None,
12 changes: 11 additions & 1 deletion exhibit/core/linkage/matrix.py
@@ -120,7 +120,9 @@ def add_prefix(df, sep="__"):
     data_dict = {}
 
     for col in df.columns:
-        data_dict[col] = np.add(f"{col}{sep}", df[col].fillna(MISSING_DATA_STR).values)
+        # cast to str in case we're dealing with integer-based categorical columns, like age
+        df_col_str = df[col].fillna(MISSING_DATA_STR).astype(str)
+        data_dict[col] = np.add(f"{col}{sep}", df_col_str.values)
 
     return pd.DataFrame(data_dict)
 
@@ -272,6 +274,14 @@ def process_row(
         label_matrix, proba_lookup, lcd, rng, ref_array, acc_array, i)
 
     target_proba = np.array([proba_lookup[x] for x in valid_targets])
+
+    # typically, there will be more than 1 value in target_proba, but we have to guard against
+    # possibility of there being just one value, and if its probability is zero (Missing data)
+    # then summing it to 1 will result in NA (division by zero). As a workaround, set proba to
+    # 1 whenever it's the only possible value - since having it less than 1 doesn't make sense.
+    if len(target_proba) == 1:
+        target_proba = np.array([1])
+
     # make sure the probabilities sum up to 1
     target_proba = target_proba * (1 / sum(target_proba))
 
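And a sketch of that division-by-zero guard: when the single remaining valid target is Missing data with probability 0, naive normalisation produces NaN, so the guard pins it to 1.

    import numpy as np

    target_proba = np.array([0.0])  # only valid target is Missing data

    # without the guard, 1 / sum(...) divides by zero and yields nan
    with np.errstate(divide="ignore", invalid="ignore"):
        print(target_proba * (1 / np.sum(target_proba)))  # [nan]

    # the workaround from this commit: a lone candidate must have probability 1
    if len(target_proba) == 1:
        target_proba = np.array([1])

    target_proba = target_proba * (1 / sum(target_proba))
    print(target_proba)  # [1.]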
6 changes: 3 additions & 3 deletions exhibit/core/linkage/tests/test_linkage_hierarchical.py
@@ -334,13 +334,13 @@ def test_scenario_1(self):
         This happens when the number of unique values in each column
         exceeds the user-specified threshold. In this case, the values
         are stored in exhibit DB and the user has no way to specify bespoke
-        probabilities. All SQL DB linked tables will have Missing Data as
+        probabilities. All SQL DB linked tables will have Missing data as
         the last row.
         '''
 
         sql_df = pd.DataFrame(data={
-            "A":list(sorted([f"A{i}" for i in range(5)]*2)) + ["Missing data"],
-            "B": [f"B{i}" for i in range(10)] + ["Missing data"]
+            "A":list(sorted([f"A{i}" for i in range(5)]*2)) + [MISSING_DATA_STR],
+            "B": [f"B{i}" for i in range(10)] + [MISSING_DATA_STR]
         })
 
         #we're bypassing __init__ and going straight to testing scenario code
14 changes: 7 additions & 7 deletions exhibit/core/spec.py
@@ -107,7 +107,7 @@ def __init__(self, data=None, inline_limit=30, ew=False, random_seed=0, **kwargs
         self.user_linked_cols = kwargs.get("user_linked_cols", [])
         self.uuid_cols = kwargs.get("uuid_cols", set())
         self.db_prob_cols = kwargs.get("save_probabilities", set())
-        self.id = generate_table_id()
+        self.id = kwargs.get("id", generate_table_id())
 
         self.numerical_cols = (
             set(self.df.select_dtypes(include=np.number).columns.values) -
@@ -484,7 +484,7 @@ class CategoricalColumn(dict):
     def __init__(self,
         name, original_values, original_probs=None,
         paired_columns=None, uniques=None, cross_join=False,
-        miss_proba=0, anon_set="random", dispersion=0):
+        miss_proba=None, anon_set="random", dispersion=0):
         '''
         Parameters
         ----------
@@ -493,8 +493,8 @@ def __init__(self,
             name to ensure smooth operation of the synthesis.
         original_values : str | list | pd.DataFrame
             A flexible way to provide instructions on what values to synthesise. You don't
-            need to provide the Missing Data value and its probability; these are added
-            automatically with Missing Data having zero probability.
+            need to provide the Missing data value and its probability; these are added
+            automatically with Missing data having zero probability.
         original_probs : list
             Only valid if original_values were provided as a list. The order of
             probabilities must match the order of original_values. Defauls to equal
@@ -527,7 +527,7 @@ def __init__(self,
         self["paired_columns"] = [] if paired_columns is None else paired_columns
         self["uniques"] = 0 if uniques is None else uniques
         self["cross_join_all_unique_values"] = cross_join
-        self["miss_probability"] = miss_proba
+        self["miss_probability"] = 0 if miss_proba is None else miss_proba
         self["anonymising_set"] = anon_set
         self["dispersion"] = dispersion
 
@@ -547,7 +547,7 @@ def __init__(self,
         # if we have missing data in the original list, we have two possibilities:
         # we have a probability vector in which case it's taken care of, or not.
         # we assume that missing data is the last item in the original values / probas
-        if MISSING_DATA_STR in original_values:
+        if MISSING_DATA_STR in original_values and miss_proba is None:
             if original_probs is None:
                 # take the equal probability we've derived earlier
                 self["miss_probability"] = prob_vector[0]
@@ -572,7 +572,7 @@ def __init__(self,
 
         if isinstance(original_values, pd.DataFrame):
             # check for missing data in the provided data frame
-            if MISSING_DATA_STR in original_values[name].unique():
+            if MISSING_DATA_STR in original_values[name].unique() and miss_proba is None:
                 ov_arr = original_values[name].to_numpy()
                 proba_arr = original_values["probability_vector"].to_numpy()
                 self["miss_probability"] = proba_arr[ov_arr== MISSING_DATA_STR].item()
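Taken together, these spec.py changes mean an explicit miss_proba always wins, while leaving it as None lets the Missing data row (or the derived equal probability) fill it in. A usage sketch against the new behaviour, assuming the imports used elsewhere in this commit:

    import pandas as pd

    from exhibit.core.constants import MISSING_DATA_STR
    from exhibit.core.spec import CategoricalColumn

    original_values = pd.DataFrame(data={
        "gender": ["M", "F", MISSING_DATA_STR],
        "probability_vector": [0.45, 0.45, 0.1],
    })

    # miss_proba defaults to None, so it's inferred from the Missing data row
    inferred = CategoricalColumn("gender", original_values=original_values)
    print(inferred["miss_probability"])  # 0.1

    # an explicit argument is no longer overwritten by the row's probability
    explicit = CategoricalColumn("gender", original_values=original_values, miss_proba=0.3)
    print(explicit["miss_probability"])  # 0.3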
11 changes: 8 additions & 3 deletions exhibit/core/tests/test_spec.py
@@ -193,8 +193,13 @@ def _generate_spam(_):
         spec_dict["metadata"]["numerical_columns"] = ["price"]
         spec_dict["metadata"]["id"] = "main"
 
+        # note that even though original_values only include 2 values (+ missing data),
+        # the synthetic dataset will have more, it's just the weights / probabilities will
+        # only affect these two - to save users from listing all values if they only want to
+        # change a couple.
         menu_df = pd.DataFrame(data={
-            "menu" : ["Egg and bacon", "Lobster Thermidor", "Missing Data"],
+            "menu" : ["Egg and bacon", "Lobster Thermidor", MISSING_DATA_STR],
             "probability_vector" : [0.5, 0.5, 0.0],
+            "price": [0.5, 0.5, 0.0]
         })
 
@@ -216,7 +221,7 @@ def _generate_spam(_):
     def test_categorical_column_initialised_from_dataframe_with_missing_data(self):
         '''
         If users don't explicitly provide a miss_proba argument to CategoricalColumn,
-        but original_data has Missing Data value, we'll take the probability of that
+        but original_data has Missing data value, we'll take the probability of that
         and use it as miss_proba - otherwise, no missing data will be added.
         '''
 
@@ -236,7 +241,7 @@ def test_categorical_column_initialised_from_dataframe_with_missing_data(self):
             original_values=["spam", "ham", "eggs", "spamspam", MISSING_DATA_STR],
         )
 
-        # standard list without Missing Data, but with miss proba argument
+        # standard list without Missing data, but with miss proba argument
         spec_dict["columns"]["list_3"] = tm.CategoricalColumn("list_3",
             original_values=["spam", "ham", "eggs", "spamspam"],
             miss_proba=0.5
4 changes: 2 additions & 2 deletions recipes/Using SQL in anonymising sets.ipynb
@@ -75,7 +75,7 @@
 "# You can specify custom probabilities and weights for numerical columns,\n",
 "# just like you would for a standard categorical column\n",
 "condition_data = pd.DataFrame(data={\n",
-"    \"condition\" : [\"A\", \"B\", \"C\", \"D\", \"E\", \"Missing Data\"],\n",
+"    \"condition\" : [\"A\", \"B\", \"C\", \"D\", \"E\", \"Missing data\"],\n",
 "    \"probability_vector\" : [0.1, 0.1, 0.5, 0.1, 0.2, 0.0],\n",
 "    \"count\" : [0.1, 0.1, 0.1, 0.1, 0.6, 0.0],\n",
 "})\n",
@@ -89,7 +89,7 @@
 "\"\"\"\n",
 "\n",
 "gender_data = pd.DataFrame(data={\n",
-"    \"gender\" : [\"M\", \"F\", \"Missing Data\"],\n",
+"    \"gender\" : [\"M\", \"F\", \"Missing data\"],\n",
 "    \"probability_vector\" : [0.5, 0.5, 0],\n",
 "})\n",
 "\n",
2 changes: 1 addition & 1 deletion recipes/Using custom functions in anonymising sets.ipynb
@@ -271,7 +271,7 @@
 "spec_dict[\"metadata\"][\"id\"] = \"main\"\n",
 "\n",
 "smoker_data = pd.DataFrame(data={\n",
-"    \"smoker\": [\"Y\", \"N\", \"No Answer\", \"Missing Data\"],\n",
+"    \"smoker\": [\"Y\", \"N\", \"No Answer\", \"Missing data\"],\n",
 "    \"probability_vector\": [0.2, 0.7, 0.1, 0]\n",
 "})\n",
 "\n",
