From 283ef501632fec1b7f683d60221dc5ff6f04f0f3 Mon Sep 17 00:00:00 2001 From: gherka Date: Mon, 2 Sep 2024 15:35:28 +0100 Subject: [PATCH] Fixed errors and deprecation warnings from Pandas 2.2.2 --- exhibit/core/constraints.py | 18 +- exhibit/core/exhibit.py | 7 +- exhibit/core/generate/categorical.py | 1324 +++++++------- exhibit/core/generate/missing.py | 690 +++---- exhibit/core/generate/tests/test_derived.py | 206 +-- exhibit/core/generate/tests/test_missing.py | 1236 ++++++------- exhibit/core/linkage/hierarchical.py | 4 +- exhibit/core/linkage/matrix.py | 778 ++++---- .../tests/test_linkage_hierarchical.py | 6 +- exhibit/core/tests/test_reference.py | 1592 ++++++++--------- exhibit/core/tests/test_spec.py | 2 +- exhibit/core/tests/test_utils.py | 7 +- exhibit/core/utils.py | 4 +- 13 files changed, 2943 insertions(+), 2931 deletions(-) diff --git a/exhibit/core/constraints.py b/exhibit/core/constraints.py index 42ebe6f..1588cf3 100644 --- a/exhibit/core/constraints.py +++ b/exhibit/core/constraints.py @@ -210,11 +210,20 @@ def process_custom_constraints(self, custom_constraints): _kwargs = kwargs_dict.get(action, {}) _kwargs.update(spec_action_kwargs) - # overwrite the original DF row IDs with the adjusted ones - output_df.loc[cc_filter_idx] = action_func( + # because the result of the action can be a different dtype compared + # to the original (like int to float, particularly involving NULLs) + # we need to capture the resultant dtype first, and then cast the + # original df to match it to avoid Pandas errors. + action_df = action_func( output_df, cc_filter_idx, target_str, cc_partitions, **_kwargs) + + action_dtypes = action_df.dtypes + output_df = output_df.astype(action_dtypes) + + # overwrite the original DF row IDs with the adjusted ones + output_df.loc[cc_filter_idx] = action_df return output_df def adjust_dataframe_to_fit_constraint(self, anon_df, basic_constraint): @@ -1231,12 +1240,13 @@ def shift_distribution( final_result.append(new_series) continue - + + # return the DF, matching the dtypes of the original (relevant for dates) new_df = pd.concat( final_result + [df.loc[filter_idx, [x for x in df.columns if x not in target_cols]]], axis=1 - ).reindex(columns=df.columns) + ).reindex(columns=df.columns).astype(df.dtypes) return new_df diff --git a/exhibit/core/exhibit.py b/exhibit/core/exhibit.py index 72f4012..0b0a397 100644 --- a/exhibit/core/exhibit.py +++ b/exhibit/core/exhibit.py @@ -344,9 +344,10 @@ def execute_spec(self): ) if col in geo_action_targets: - # add placeholders to avoid errors when generating missing data + # add float placeholders to avoid errors when generating missing data geo_cols = [f"{col}_latitude", f"{col}_longitude"] - anon_df[geo_cols] = 0 + # use 0.0 to ensure column dtype is float so that we could null them later + anon_df[geo_cols] = 0.0 continue h3_table_name = self.spec_dict["columns"][col]["h3_table"] @@ -444,7 +445,7 @@ def execute_spec(self): anon_df[derived_col] = generate_derived_column(anon_df, derived_def) break # change the missing data placeholder back to NAs - anon_df.loc[:, cat_cols] = anon_df.loc[:, cat_cols].applymap( + anon_df.loc[:, cat_cols] = anon_df.loc[:, cat_cols].map( lambda x: np.nan if x == MISSING_DATA_STR else x) #8) GENERATE DERIVED COLUMNS IF ANY ARE SPECIFIED diff --git a/exhibit/core/generate/categorical.py b/exhibit/core/generate/categorical.py index 861e673..7be717b 100644 --- a/exhibit/core/generate/categorical.py +++ b/exhibit/core/generate/categorical.py @@ -1,662 +1,662 @@ -''' -Methods to generate 
categorical columns / values -''' - -# Standard library imports -from collections import namedtuple -from itertools import chain -import warnings - -# External library imports -import pandas as pd -import numpy as np -from sql_metadata import Parser -from pandas.api.types import is_numeric_dtype, is_datetime64_dtype - -# Exhibit imports -from ..constants import ORIGINAL_VALUES_REGEX, ORIGINAL_VALUES_PAIRED -from ..utils import get_attr_values, shuffle_data -from ..sql import query_exhibit_database, check_table_exists, execute_sql, create_temp_table -from ..linkage.hierarchical import generate_linked_anon_df -from ..linkage.matrix import generate_user_linked_anon_df -from .regex import generate_regex_column - -# EXPORTABLE METHODS -# ================== -class CategoricalDataGenerator: - ''' - Although this class is pretty bare, it still helps avoid passing - the same variables through functions and also mirrors the setup - for generation of linked data. - - One area that potentially needs looking at is if the user makes - manual changes to column values that were initially put into SQL - (where uniques > inline_limit) - for now, this works only for linked data. - ''' - - def __init__(self, spec_dict, core_rows, anon_df=None): - ''' - This class is covering the entire spec_dict as far as the - generation of non-numerical data is concerned. - ''' - - self.spec_dict = spec_dict - self.rng = spec_dict["_rng"] - self.num_rows = core_rows - self.fixed_anon_sets = ["random", "mountains", "patients", "birds", "dates"] - # we need UUID dataset (if it exists) for possible conditional SQL that - # references already-generated columns in the spec - self.generated_dfs = [] - self.anon_df = anon_df - - (self.all_cols, - self.complete_cols, - self.paired_cols, - self.skipped_cols) = self._get_column_types() - - def generate(self): - ''' - Brings together all the components of non-numerical data generation. 
- - Returns - ------- - A dataframe with all categorical columns - ''' - - #1) GENERATE LINKED DFs FROM EACH LINKED COLUMNS GROUP - for linked_group in (self.spec_dict.get("linked_columns") or []): - - # zero-numbered linked group is reserved for user-defined groupings - if linked_group[0] == 0: - - u_linked_df = generate_user_linked_anon_df( - spec_dict=self.spec_dict, - linked_cols=linked_group[1], - num_rows=self.num_rows - ) - - self.generated_dfs.append(u_linked_df) - - else: - - linked_df = generate_linked_anon_df( - spec_dict=self.spec_dict, - linked_group=linked_group, - num_rows=self.num_rows) - - self.generated_dfs.append(linked_df) - - #2) GENERATE NON-LINKED DFs - for col in [col for col in self.all_cols if col not in self.skipped_cols]: - s = self._generate_anon_series(col) - self.generated_dfs.append(s) - - #3) CONCAT GENERATED DFs AND SERIES - temp_anon_df = pd.concat(self.generated_dfs, axis=1) - - #4) GENERATE SERIES WITH "COMPLETE", CROSS-JOINED COLUMNS - complete_series = [] - - # Complete series can sort the data again - for col in self.complete_cols: - s = self._generate_complete_series(col) - #paired columns return None - if not s is None: - complete_series.append(s) - - #5) OUTER JOIN - temp_anon_df["key"] = 1 - - for s in complete_series: - - temp_anon_df = pd.merge( - temp_anon_df, - pd.DataFrame(s).assign(key=1), - how="outer", - on="key" - ) - - #6) TIDY UP - anon_df = temp_anon_df.drop("key", axis=1) - - return anon_df - - def _generate_timeseries(self, col_name, complete=False): - ''' - Basic generator of randomised / complete timeseries data - - Parameters: - ---------- - col_name : str - time column to generate (type checks are made upstream) - complete : boolean - if timeseries is meant to be "complete", return full series - without picking N=num_rows random values from the pool - - Returns: - -------- - pd.Series - ''' - - # see which date parameters we have access to - start = self.spec_dict["columns"][col_name].get("from", None) - end = self.spec_dict["columns"][col_name].get("to", None) - - # frequency and periods are always required - freq = self.spec_dict["columns"][col_name]["frequency"] - periods = self.spec_dict["columns"][col_name]["uniques"] - - # if we have both start and end, we generate all values in-between and pick the - # dates at random to match the number of periods, without repeats - if start is not None and end is not None: - - all_pos_dates = pd.date_range(start=start, end=end, freq=freq) - # when the number of requested periods is greater than the total possible - # range between from and to, given the frequency, we issue a warning, then - # omit the date_to and generate N=periods unique dates from date_from. - if len(all_pos_dates) < periods: - warnings.warn( - f"The number of unique dates at frequency {freq} between {start} " - f"and {end} is smaller than the number of requested periods" - f"({periods}). 
The date_to parameter will be ignored.", - RuntimeWarning - ) - all_pos_dates = pd.date_range(start=start, periods=periods, freq=freq) - - all_pos_dates = self.rng.choice(all_pos_dates, periods, replace=False) - - else: - # one of the start / end is None - all_pos_dates = pd.date_range( - start=start, end=end, periods=periods, freq=freq) - - if complete: - return pd.Series(all_pos_dates, name=col_name) - - random_dates = self.rng.choice(all_pos_dates, self.num_rows) - - return shuffle_data(pd.Series(random_dates, name=col_name)) - - def _generate_anon_series(self, col_name): - ''' - Generate basic categorical series anonymised according to user input. - - Note that in all cases except external tables, the final series is shuffled - and index reset. Series generated from external tables are an exception because - their values are linked to columns that have already been generated. - - The code can take different paths depending on these things: - - whether a the anonymising method is set to random or a custom set - - whether the number of unique values exceeds the threshold - - whether the column has any paired columns - - The paths differ primarily in terms of where the data sits: as part - of the spec in original_values or in exhibit DB. - - Things are further complicated if users want to use a single column - from an anonymising table, like mountains.peak - - Parameters: - ----------- - col_name : str - column name to process & anonymise - - Returns: - ------- - Pandas Series object or a Dataframe - ''' - - col_attrs = self.spec_dict["columns"][col_name] - col_type = col_attrs["type"] - - # capture categorical-only information, with fallback for date columns - paired_cols = col_attrs.get("paired_columns", None) - orig_vals = col_attrs.get("original_values", None) - target_uniques = col_attrs.get("uniques", None) - - # typically, only categorical columns will have an anonymising set, but time - # columns can use it for SQL to pull conditional values from external table - # ignoring the standard date genderation parameters, like from / to. 
- anon_set = col_attrs.get("anonymising_set", None) - - # Users can pass custom functions to generate categorical / date columns - if callable(anon_set): - return self._generate_using_custom_function(col_name, anon_set) - - # check if the anonymising set is a SQL statement starting with SELECT - # note that for dates, all other parameters, like from / to will be ignored - if anon_set is not None and anon_set.strip().upper()[:6] == "SELECT": - return self._generate_using_external_table(col_name, anon_set) - - # normal date columns generated using from / to / number of uniques - if col_type == "date": - return self._generate_timeseries(col_name, complete=False) - - # generate values based on a regular expression specified in the anonymising_set - if isinstance(orig_vals, str) and orig_vals == ORIGINAL_VALUES_REGEX: - return generate_regex_column( - anon_set, col_name, self.num_rows, target_uniques) - - # values were stored in SQL; randomise based on uniform distribution - if col_attrs["uniques"] > self.spec_dict["metadata"]["inline_limit"]: - return self._generate_from_sql(col_name, col_attrs) - - # we have access to original_values and the paths are dependant on anon_set - # take every row except last which is reserved for Missing data - col_df = col_attrs["original_values"].iloc[:-1, :] - col_prob = np.array(col_df["probability_vector"]).astype(float) - - if col_prob.sum() != 1: - col_prob /= col_prob.sum() - - if anon_set == "random": - - col_values = col_df[col_name].to_list() - - original_series = pd.Series( - data=self.rng.choice(a=col_values, size=self.num_rows, p=col_prob), - name=col_name) - - if paired_cols: - paired_df = ( - col_df[[col_name] + [f"paired_{x}" for x in paired_cols]] - .rename(columns=lambda x: x.replace("paired_", "")) - ) - - return shuffle_data( - pd.merge(original_series, paired_df, how="left", on=col_name)) - - return shuffle_data(original_series) - - # finally, if we have original_values, but anon_set is not random - # we pick the N distinct values from the anonymysing set, replace - # the original values + paired column values in the original_values - # DATAFRAME, making sure the changes happen in-place which means - # that downstream, the weights table will be built based on the - # modified "original_values" dataframe. - - sql_df = self._generate_from_sql(col_name, col_attrs, complete=True) - - # includes Missing data row as opposed to col_df which doesn't - orig_df = col_attrs["original_values"] - - # missing data is the last row - repl = sql_df[col_name].unique() - aliases = dict(zip(orig_df[col_name].values[:-1], repl)) - aliased_df = orig_df.applymap(lambda x: aliases.get(x, x)) - self.spec_dict["columns"][col_name]["original_values"] = aliased_df - - # we ignore Missing data probability when we originally create the variable - idx = self.rng.choice(a=len(sql_df), p=col_prob, size=self.num_rows) - anon_list = [sql_df.iloc[x, :].values for x in idx] - anon_df = pd.DataFrame(columns=sql_df.columns, data=anon_list) - - return shuffle_data(anon_df) - - def _generate_from_sql(self, col_name, col_attrs, complete=False, db_path=None): - ''' - Whatever the anonymising method, if a column has more unique values than - allowed by the inline_limit parameter, it will be put into SQLite3 db. 
- ''' - - anon_set = col_attrs["anonymising_set"] - uniques = col_attrs["uniques"] - paired_cols = col_attrs["paired_columns"] or [] - - #1) QUERY SQL TO GET VALUES USED TO BUILD THE DATAFRAME - if anon_set == "random": - - safe_col_name = col_name.replace(" ", "$") - table_name = f"temp_{self.spec_dict['metadata']['id']}_{safe_col_name}" - sql_df = query_exhibit_database( - table_name, exclude_missing=True, db_path=db_path) - - else: - table_name, *sql_column = anon_set.split(".") - sql_df = query_exhibit_database(table_name, sql_column, uniques) - - # if sql df is an anonymising set with different column names, like mountaints, - # we want to rename them to the actual column names used in the spec; - # alternatively, if the sql df is a lookup and column there match the spec, we - # make sure to take those columns that match. - if set([col_name] + paired_cols).issubset(set(sql_df.columns)): - sql_df = sql_df[[col_name] + paired_cols] - - # rename sql_df columns to be same as original + paired; zip is - # only going to pair up columns up to the shorter list! - sql_df.rename( - columns=dict(zip( - sql_df.columns, - [col_name] + paired_cols - )), - inplace=True - ) - - #2) GENERATE ANONYMISED ROWS - if complete: - anon_df = sql_df.drop(columns="probability_vector", errors="ignore") - else: - if "probability_vector" in sql_df.columns: - probs = sql_df["probability_vector"].astype(float).values - probs = probs / probs.sum() - sql_df.drop(columns="probability_vector", inplace=True) - idx = self.rng.choice(a=len(sql_df), p=probs, size=self.num_rows) - else: - idx = self.rng.choice(len(sql_df), self.num_rows) - - anon_list = [sql_df.iloc[x, :].values for x in idx] - anon_df = pd.DataFrame(columns=sql_df.columns, data=anon_list) - - #3) HANDLE MISSING PAIRED COLUMNS IN SQL - # if the column has paired columns and a non-random anonymising set, - # the anonymising set must also provide the paired columns or the same - # values will be used for the original + paired columns - missing_paired_cols = set(paired_cols) - set(sql_df.columns[1:]) - - if missing_paired_cols: - missing_df = pd.DataFrame( - data=zip(*[anon_df[col_name]] * len(missing_paired_cols)), - # sets are no longer allowed as column names - columns=list(missing_paired_cols) - ) - - anon_df = pd.concat([anon_df, missing_df], axis=1) - - return shuffle_data(anon_df) - - def _generate_complete_series(self, col_name): - ''' - This function doesn't take num_rows argument because - we are always generating the full number of rows - for this column as specified in the spec. - - Function path depends on the column type: date or categorical - - Returns - ------- - pd.Series for non-paired columns and pd.DataFrame for pairs - - For now, the function doesn't support columns where values are - stored in the DB because the number of their uniques exceeds - category threshold or if they are anonymised using a set from DB. - ''' - - col_attrs = self.spec_dict["columns"][col_name] - - if col_attrs["type"] == "date": - - return self._generate_timeseries(col_name, complete=True) - - # if paired column, skip, and add pairs as part of parent column's processing - if col_name in self.paired_cols: - return None - - # if column has paired columns, return a dataframe with it + paired cols - paired_cols = col_attrs["paired_columns"] - - # all cat. 
columns have a missing data placeholder as -1 row so we exclude it - if paired_cols: - paired_complete_df = ( - col_attrs["original_values"].iloc[:-1, 0:len(paired_cols)+1]) - paired_complete_df.rename( - columns=lambda x: x.replace("paired_", ""), inplace=True) - - return paired_complete_df - - return pd.Series(col_attrs["original_values"].iloc[:-1, 0], name=col_name) - - def _get_column_types(self): - ''' - Convenience function to categorise columns into 4 types: - - nested linked columns (generated separately as part of linkage.py) - - complete columns - all values are used - - columns where original values are paired with a "main" column - - All of the above are treated in a special way either in a separate - generation routine (like linked columns) or are generated as a - by-product of another routine (like paired columns). Columns that remain, - are generated in a "normal" way as part of this module. - - Returns - ------- - namedtuple("Columns", ["all", "complete", "paired", "skipped"]) - ''' - - Columns = namedtuple("Columns", ["all", "complete", "paired", "skipped"]) - - # there might be cases when you want to generate just the date columns or just - # the categorical columns so they might be missing from the metadata section - all_cols = ( - (self.spec_dict["metadata"].get("categorical_columns", [])) + - (self.spec_dict["metadata"].get("date_columns", [])) - ) - - nested_linked_cols = [ - sublist for n, sublist in (self.spec_dict.get("linked_columns") or []) - ] - - complete_cols = [c for c, v in get_attr_values( - self.spec_dict, - "cross_join_all_unique_values", - col_names=True, - types=["categorical", "date"]) if v] - - list_of_orig_val_tuples = get_attr_values( - self.spec_dict, - "original_values", - col_names=True, - types=["categorical", "date"]) - - paired_cols = [ - k for k, v in list_of_orig_val_tuples if str(v) == ORIGINAL_VALUES_PAIRED] - - skipped_cols = ( - list(chain.from_iterable(nested_linked_cols)) + - complete_cols + - paired_cols - ) - - column_types = Columns(all_cols, complete_cols, paired_cols, skipped_cols) - - return column_types - - def _generate_using_external_table(self, col_name, anon_set): - ''' - We assume that the aliased column is the one you want to pick the values from - and the rest of the columns in the select statement are going to be the join - keys. - ''' - - parser = Parser(anon_set) - sql_tables = parser.tables - aliased_columns = parser.columns_aliases_names - source_table_id = self.spec_dict["metadata"]["id"] - - if len(aliased_columns) != 1 or aliased_columns[0] != col_name: - raise RuntimeError( - f"Please make sure the SQL SELECT statement in {col_name}'s " - f"anonymising_set includes exactly one aliased column named {col_name}." - ) - - # "join" columns are all non-aliased columns from the source table - # "join" here refers to joining back the data from the SQL statment to the - # original source data, not any join columns that are part of the JOIN section - # of SQL proper. 
- - join_columns = [] - for qualified_column in parser.columns_dict["select"]: - table, column = qualified_column.split(".") - if table == f"temp_{source_table_id}" and column != col_name: - join_columns.append(column) - - # "source" table aka existing table is always put into exhibit DB, but if - # SQL is trying to reference an external table, we should check if it exists - ext_tables = [ - t for t in sql_tables if t not in ["temp_original_values", f"temp_{source_table_id}"] - ] - - # check the "external" table is in exhibit.db - for ext_table in ext_tables: - if not check_table_exists(ext_table): - raise RuntimeError( - f"Please make sure that {ext_table} used in the anonymising_set SQL" - f" for column {col_name} exists in the Exhibit database." - ) - - # insert the dataframe generated so far into the DB; we make sure to drop - # duplicates in case user didn't specify DISTINCT in his SQL query; - # the anon_df would typically be from UUIDs that are generated before - # categorical columns. - - # self.anon_df is what is generated BEFORE categorical columns, e.g UUID columns - if self.anon_df is None or self.anon_df.empty: - # self.generated_dfs has cat. columns generated BEFORE this particular column - if not self.generated_dfs: #pragma: no cover - existing_data = pd.DataFrame() - else: - existing_data = pd.concat(self.generated_dfs, axis=1) - else: - existing_data = pd.concat(self.generated_dfs + [self.anon_df], axis=1) - - # for convenience, we can reference original_values as a table - this could be - # original_values as they appear in the SPEC or in the SQL (not implemented yet) - if "temp_original_values" in sql_tables: - ov_df = self.spec_dict["columns"][col_name]["original_values"][[col_name]] - create_temp_table( - table_name="temp_original_values", - col_names=[col_name], - data=ov_df - ) - - # ensure the data going into DB is processed identically for join keys - for col in join_columns: - if is_numeric_dtype(existing_data[col]): - existing_data[col] = existing_data[col].astype(float) - elif is_datetime64_dtype(existing_data[col]): - existing_data[col] = existing_data[col].dt.strftime("%Y-%m-%d") - else: - existing_data[col] = existing_data[col].astype(str).str.strip() - - # dropping duplicates is a filter operation (even though it returns new data) - # unless we make an explicit copy of the de-duplicated dataframe, Pandas will - # trigger SettingWithCopy warning when trying to change any values. - existing_data_distinct = existing_data.drop_duplicates(subset=join_columns).copy() - existing_data_cols = list(existing_data.columns) - - # this function converts list of tuples into a dataframe anyway - create_temp_table( - table_name=f"temp_{source_table_id}", - col_names=existing_data_cols, - data=existing_data_distinct - ) - - # run the SQL from anon_set; note that the type of SQL query we'll likely see - # will be a cross-join (e.g. dates) so any speed optimisations would be welcome - result = execute_sql(anon_set) - - # create the dataframe with SQL data - sql_df = pd.DataFrame(data=result, columns=join_columns + aliased_columns) - - # ensure that the column of interest (the one we're potentially matching to original - # values) is typed to string - and not datetime or int, coming out of SQL. We will - # convert to datetime at the end, if that's what the type in the spec is. - sql_df[col_name] = sql_df[col_name].astype("str") - - # get the probabilities for the selected column in the external table - # at the level of the join key - use a hash for the combination of columns! 
- - # Rather than use existing probabilities from the spec, treat them as a weight - # and apply them to the conditional, per-join key probabilities from external - # table. - probas = {} - orig_vals = None - - try: - orig_vals = self.spec_dict["columns"][col_name]["original_values"] - if isinstance(orig_vals, pd.DataFrame): - orig_vals = orig_vals.set_index(col_name) - # if we don't have original_values in the column spec, it's a date - except KeyError: - pass - - groups = sql_df.groupby(join_columns) - for i, group in groups: - - total_count = len(group) - proba_arr = (group - .value_counts() - .apply(lambda x: 0 if x == 0 else max(0.001, x / total_count)) - .reset_index(level=col_name) - .to_numpy(dtype="str") - ) - a, p = np.split(proba_arr, 2, axis=1) - a = a.flatten() - p = p.flatten().astype(float) - - if orig_vals is not None: - for j, val in enumerate(a): - if val in orig_vals.index: - p_weight = float(orig_vals.loc[val, "probability_vector"]) - p[j] = p[j] * p_weight - - # enusre p sums up to 1 - p = p * (1 / sum(p)) - probas[i[0]] = (a, p) - - # take the data generated so far and generate appropriate values based on key - groups = existing_data.groupby(join_columns).groups - temp_result = [] - - for group_key, group_index in groups.items(): - # if the key is missing, then the SQL filtered out the data for that key - # having a COALESCE in SQL would fix it, but in case it's also missing, - # we try to catch this edge case in code as well. - try: - new_data = self.rng.choice( - a=probas[group_key][0], p=probas[group_key][1], size=len(group_index)) - except KeyError: #pragma: no cover - new_data = [np.nan] * len(group_index) - - temp_result.append(pd.Series(data=new_data, index=group_index, name=col_name)) - - final_result = pd.concat(temp_result) - - # ensure we return the correct type for date columns - col_type = self.spec_dict["columns"][col_name]["type"] - if col_type == "date": - final_result = final_result.astype("datetime64[ns]") - - return final_result - - def _generate_using_custom_function(self, col_name, anon_set): - ''' - _summary_ - - Parameters - ---------- - col_name : _type_ - _description_ - anon_set : _type_ - _description_ - ''' - # self.anon_df is what is generated BEFORE categorical columns, e.g UUID columns - if self.anon_df is None or self.anon_df.empty: - # self.generated_dfs has cat. 
columns generated BEFORE this particular column - if not self.generated_dfs: - existing_data = pd.DataFrame() - else: - existing_data = pd.concat(self.generated_dfs, axis=1) - else: #pragma: no cover - existing_data = pd.concat(self.generated_dfs + [self.anon_df], axis=1) - - if existing_data.empty: - result = pd.Series( - data=[anon_set(pd.Series) for _ in range(self.num_rows)], - name=col_name - ) - return result - - result = existing_data.apply(anon_set, axis=1) - result.name = col_name - - return result +''' +Methods to generate categorical columns / values +''' + +# Standard library imports +from collections import namedtuple +from itertools import chain +import warnings + +# External library imports +import pandas as pd +import numpy as np +from sql_metadata import Parser +from pandas.api.types import is_numeric_dtype, is_datetime64_dtype + +# Exhibit imports +from ..constants import ORIGINAL_VALUES_REGEX, ORIGINAL_VALUES_PAIRED +from ..utils import get_attr_values, shuffle_data +from ..sql import query_exhibit_database, check_table_exists, execute_sql, create_temp_table +from ..linkage.hierarchical import generate_linked_anon_df +from ..linkage.matrix import generate_user_linked_anon_df +from .regex import generate_regex_column + +# EXPORTABLE METHODS +# ================== +class CategoricalDataGenerator: + ''' + Although this class is pretty bare, it still helps avoid passing + the same variables through functions and also mirrors the setup + for generation of linked data. + + One area that potentially needs looking at is if the user makes + manual changes to column values that were initially put into SQL + (where uniques > inline_limit) - for now, this works only for linked data. + ''' + + def __init__(self, spec_dict, core_rows, anon_df=None): + ''' + This class is covering the entire spec_dict as far as the + generation of non-numerical data is concerned. + ''' + + self.spec_dict = spec_dict + self.rng = spec_dict["_rng"] + self.num_rows = core_rows + self.fixed_anon_sets = ["random", "mountains", "patients", "birds", "dates"] + # we need UUID dataset (if it exists) for possible conditional SQL that + # references already-generated columns in the spec + self.generated_dfs = [] + self.anon_df = anon_df + + (self.all_cols, + self.complete_cols, + self.paired_cols, + self.skipped_cols) = self._get_column_types() + + def generate(self): + ''' + Brings together all the components of non-numerical data generation. 
+ + Returns + ------- + A dataframe with all categorical columns + ''' + + #1) GENERATE LINKED DFs FROM EACH LINKED COLUMNS GROUP + for linked_group in (self.spec_dict.get("linked_columns") or []): + + # zero-numbered linked group is reserved for user-defined groupings + if linked_group[0] == 0: + + u_linked_df = generate_user_linked_anon_df( + spec_dict=self.spec_dict, + linked_cols=linked_group[1], + num_rows=self.num_rows + ) + + self.generated_dfs.append(u_linked_df) + + else: + + linked_df = generate_linked_anon_df( + spec_dict=self.spec_dict, + linked_group=linked_group, + num_rows=self.num_rows) + + self.generated_dfs.append(linked_df) + + #2) GENERATE NON-LINKED DFs + for col in [col for col in self.all_cols if col not in self.skipped_cols]: + s = self._generate_anon_series(col) + self.generated_dfs.append(s) + + #3) CONCAT GENERATED DFs AND SERIES + temp_anon_df = pd.concat(self.generated_dfs, axis=1) + + #4) GENERATE SERIES WITH "COMPLETE", CROSS-JOINED COLUMNS + complete_series = [] + + # Complete series can sort the data again + for col in self.complete_cols: + s = self._generate_complete_series(col) + #paired columns return None + if not s is None: + complete_series.append(s) + + #5) OUTER JOIN + temp_anon_df["key"] = 1 + + for s in complete_series: + + temp_anon_df = pd.merge( + temp_anon_df, + pd.DataFrame(s).assign(key=1), + how="outer", + on="key" + ) + + #6) TIDY UP + anon_df = temp_anon_df.drop("key", axis=1) + + return anon_df + + def _generate_timeseries(self, col_name, complete=False): + ''' + Basic generator of randomised / complete timeseries data + + Parameters: + ---------- + col_name : str + time column to generate (type checks are made upstream) + complete : boolean + if timeseries is meant to be "complete", return full series + without picking N=num_rows random values from the pool + + Returns: + -------- + pd.Series + ''' + + # see which date parameters we have access to + start = self.spec_dict["columns"][col_name].get("from", None) + end = self.spec_dict["columns"][col_name].get("to", None) + + # frequency and periods are always required + freq = self.spec_dict["columns"][col_name]["frequency"] + periods = self.spec_dict["columns"][col_name]["uniques"] + + # if we have both start and end, we generate all values in-between and pick the + # dates at random to match the number of periods, without repeats + if start is not None and end is not None: + + all_pos_dates = pd.date_range(start=start, end=end, freq=freq) + # when the number of requested periods is greater than the total possible + # range between from and to, given the frequency, we issue a warning, then + # omit the date_to and generate N=periods unique dates from date_from. + if len(all_pos_dates) < periods: + warnings.warn( + f"The number of unique dates at frequency {freq} between {start} " + f"and {end} is smaller than the number of requested periods" + f"({periods}). 
The date_to parameter will be ignored.", + RuntimeWarning + ) + all_pos_dates = pd.date_range(start=start, periods=periods, freq=freq) + + all_pos_dates = self.rng.choice(all_pos_dates, periods, replace=False) + + else: + # one of the start / end is None + all_pos_dates = pd.date_range( + start=start, end=end, periods=periods, freq=freq) + + if complete: + return pd.Series(all_pos_dates, name=col_name) + + random_dates = self.rng.choice(all_pos_dates, self.num_rows) + + return shuffle_data(pd.Series(random_dates, name=col_name)) + + def _generate_anon_series(self, col_name): + ''' + Generate basic categorical series anonymised according to user input. + + Note that in all cases except external tables, the final series is shuffled + and index reset. Series generated from external tables are an exception because + their values are linked to columns that have already been generated. + + The code can take different paths depending on these things: + - whether a the anonymising method is set to random or a custom set + - whether the number of unique values exceeds the threshold + - whether the column has any paired columns + + The paths differ primarily in terms of where the data sits: as part + of the spec in original_values or in exhibit DB. + + Things are further complicated if users want to use a single column + from an anonymising table, like mountains.peak + + Parameters: + ----------- + col_name : str + column name to process & anonymise + + Returns: + ------- + Pandas Series object or a Dataframe + ''' + + col_attrs = self.spec_dict["columns"][col_name] + col_type = col_attrs["type"] + + # capture categorical-only information, with fallback for date columns + paired_cols = col_attrs.get("paired_columns", None) + orig_vals = col_attrs.get("original_values", None) + target_uniques = col_attrs.get("uniques", None) + + # typically, only categorical columns will have an anonymising set, but time + # columns can use it for SQL to pull conditional values from external table + # ignoring the standard date genderation parameters, like from / to. 
+ anon_set = col_attrs.get("anonymising_set", None) + + # Users can pass custom functions to generate categorical / date columns + if callable(anon_set): + return self._generate_using_custom_function(col_name, anon_set) + + # check if the anonymising set is a SQL statement starting with SELECT + # note that for dates, all other parameters, like from / to will be ignored + if anon_set is not None and anon_set.strip().upper()[:6] == "SELECT": + return self._generate_using_external_table(col_name, anon_set) + + # normal date columns generated using from / to / number of uniques + if col_type == "date": + return self._generate_timeseries(col_name, complete=False) + + # generate values based on a regular expression specified in the anonymising_set + if isinstance(orig_vals, str) and orig_vals == ORIGINAL_VALUES_REGEX: + return generate_regex_column( + anon_set, col_name, self.num_rows, target_uniques) + + # values were stored in SQL; randomise based on uniform distribution + if col_attrs["uniques"] > self.spec_dict["metadata"]["inline_limit"]: + return self._generate_from_sql(col_name, col_attrs) + + # we have access to original_values and the paths are dependant on anon_set + # take every row except last which is reserved for Missing data + col_df = col_attrs["original_values"].iloc[:-1, :] + col_prob = np.array(col_df["probability_vector"]).astype(float) + + if col_prob.sum() != 1: + col_prob /= col_prob.sum() + + if anon_set == "random": + + col_values = col_df[col_name].to_list() + + original_series = pd.Series( + data=self.rng.choice(a=col_values, size=self.num_rows, p=col_prob), + name=col_name) + + if paired_cols: + paired_df = ( + col_df[[col_name] + [f"paired_{x}" for x in paired_cols]] + .rename(columns=lambda x: x.replace("paired_", "")) + ) + + return shuffle_data( + pd.merge(original_series, paired_df, how="left", on=col_name)) + + return shuffle_data(original_series) + + # finally, if we have original_values, but anon_set is not random + # we pick the N distinct values from the anonymysing set, replace + # the original values + paired column values in the original_values + # DATAFRAME, making sure the changes happen in-place which means + # that downstream, the weights table will be built based on the + # modified "original_values" dataframe. + + sql_df = self._generate_from_sql(col_name, col_attrs, complete=True) + + # includes Missing data row as opposed to col_df which doesn't + orig_df = col_attrs["original_values"] + + # missing data is the last row + repl = sql_df[col_name].unique() + aliases = dict(zip(orig_df[col_name].values[:-1], repl)) + aliased_df = orig_df.map(lambda x: aliases.get(x, x)) + self.spec_dict["columns"][col_name]["original_values"] = aliased_df + + # we ignore Missing data probability when we originally create the variable + idx = self.rng.choice(a=len(sql_df), p=col_prob, size=self.num_rows) + anon_list = [sql_df.iloc[x, :].values for x in idx] + anon_df = pd.DataFrame(columns=sql_df.columns, data=anon_list) + + return shuffle_data(anon_df) + + def _generate_from_sql(self, col_name, col_attrs, complete=False, db_path=None): + ''' + Whatever the anonymising method, if a column has more unique values than + allowed by the inline_limit parameter, it will be put into SQLite3 db. 
+ ''' + + anon_set = col_attrs["anonymising_set"] + uniques = col_attrs["uniques"] + paired_cols = col_attrs["paired_columns"] or [] + + #1) QUERY SQL TO GET VALUES USED TO BUILD THE DATAFRAME + if anon_set == "random": + + safe_col_name = col_name.replace(" ", "$") + table_name = f"temp_{self.spec_dict['metadata']['id']}_{safe_col_name}" + sql_df = query_exhibit_database( + table_name, exclude_missing=True, db_path=db_path) + + else: + table_name, *sql_column = anon_set.split(".") + sql_df = query_exhibit_database(table_name, sql_column, uniques) + + # if sql df is an anonymising set with different column names, like mountaints, + # we want to rename them to the actual column names used in the spec; + # alternatively, if the sql df is a lookup and column there match the spec, we + # make sure to take those columns that match. + if set([col_name] + paired_cols).issubset(set(sql_df.columns)): + sql_df = sql_df[[col_name] + paired_cols] + + # rename sql_df columns to be same as original + paired; zip is + # only going to pair up columns up to the shorter list! + sql_df.rename( + columns=dict(zip( + sql_df.columns, + [col_name] + paired_cols + )), + inplace=True + ) + + #2) GENERATE ANONYMISED ROWS + if complete: + anon_df = sql_df.drop(columns="probability_vector", errors="ignore") + else: + if "probability_vector" in sql_df.columns: + probs = sql_df["probability_vector"].astype(float).values + probs = probs / probs.sum() + sql_df.drop(columns="probability_vector", inplace=True) + idx = self.rng.choice(a=len(sql_df), p=probs, size=self.num_rows) + else: + idx = self.rng.choice(len(sql_df), self.num_rows) + + anon_list = [sql_df.iloc[x, :].values for x in idx] + anon_df = pd.DataFrame(columns=sql_df.columns, data=anon_list) + + #3) HANDLE MISSING PAIRED COLUMNS IN SQL + # if the column has paired columns and a non-random anonymising set, + # the anonymising set must also provide the paired columns or the same + # values will be used for the original + paired columns + missing_paired_cols = set(paired_cols) - set(sql_df.columns[1:]) + + if missing_paired_cols: + missing_df = pd.DataFrame( + data=zip(*[anon_df[col_name]] * len(missing_paired_cols)), + # sets are no longer allowed as column names + columns=list(missing_paired_cols) + ) + + anon_df = pd.concat([anon_df, missing_df], axis=1) + + return shuffle_data(anon_df) + + def _generate_complete_series(self, col_name): + ''' + This function doesn't take num_rows argument because + we are always generating the full number of rows + for this column as specified in the spec. + + Function path depends on the column type: date or categorical + + Returns + ------- + pd.Series for non-paired columns and pd.DataFrame for pairs + + For now, the function doesn't support columns where values are + stored in the DB because the number of their uniques exceeds + category threshold or if they are anonymised using a set from DB. + ''' + + col_attrs = self.spec_dict["columns"][col_name] + + if col_attrs["type"] == "date": + + return self._generate_timeseries(col_name, complete=True) + + # if paired column, skip, and add pairs as part of parent column's processing + if col_name in self.paired_cols: + return None + + # if column has paired columns, return a dataframe with it + paired cols + paired_cols = col_attrs["paired_columns"] + + # all cat. 
columns have a missing data placeholder as -1 row so we exclude it + if paired_cols: + paired_complete_df = ( + col_attrs["original_values"].iloc[:-1, 0:len(paired_cols)+1]) + paired_complete_df.rename( + columns=lambda x: x.replace("paired_", ""), inplace=True) + + return paired_complete_df + + return pd.Series(col_attrs["original_values"].iloc[:-1, 0], name=col_name) + + def _get_column_types(self): + ''' + Convenience function to categorise columns into 4 types: + - nested linked columns (generated separately as part of linkage.py) + - complete columns - all values are used + - columns where original values are paired with a "main" column + + All of the above are treated in a special way either in a separate + generation routine (like linked columns) or are generated as a + by-product of another routine (like paired columns). Columns that remain, + are generated in a "normal" way as part of this module. + + Returns + ------- + namedtuple("Columns", ["all", "complete", "paired", "skipped"]) + ''' + + Columns = namedtuple("Columns", ["all", "complete", "paired", "skipped"]) + + # there might be cases when you want to generate just the date columns or just + # the categorical columns so they might be missing from the metadata section + all_cols = ( + (self.spec_dict["metadata"].get("categorical_columns", [])) + + (self.spec_dict["metadata"].get("date_columns", [])) + ) + + nested_linked_cols = [ + sublist for n, sublist in (self.spec_dict.get("linked_columns") or []) + ] + + complete_cols = [c for c, v in get_attr_values( + self.spec_dict, + "cross_join_all_unique_values", + col_names=True, + types=["categorical", "date"]) if v] + + list_of_orig_val_tuples = get_attr_values( + self.spec_dict, + "original_values", + col_names=True, + types=["categorical", "date"]) + + paired_cols = [ + k for k, v in list_of_orig_val_tuples if str(v) == ORIGINAL_VALUES_PAIRED] + + skipped_cols = ( + list(chain.from_iterable(nested_linked_cols)) + + complete_cols + + paired_cols + ) + + column_types = Columns(all_cols, complete_cols, paired_cols, skipped_cols) + + return column_types + + def _generate_using_external_table(self, col_name, anon_set): + ''' + We assume that the aliased column is the one you want to pick the values from + and the rest of the columns in the select statement are going to be the join + keys. + ''' + + parser = Parser(anon_set) + sql_tables = parser.tables + aliased_columns = parser.columns_aliases_names + source_table_id = self.spec_dict["metadata"]["id"] + + if len(aliased_columns) != 1 or aliased_columns[0] != col_name: + raise RuntimeError( + f"Please make sure the SQL SELECT statement in {col_name}'s " + f"anonymising_set includes exactly one aliased column named {col_name}." + ) + + # "join" columns are all non-aliased columns from the source table + # "join" here refers to joining back the data from the SQL statment to the + # original source data, not any join columns that are part of the JOIN section + # of SQL proper. 
+ + join_columns = [] + for qualified_column in parser.columns_dict["select"]: + table, column = qualified_column.split(".") + if table == f"temp_{source_table_id}" and column != col_name: + join_columns.append(column) + + # "source" table aka existing table is always put into exhibit DB, but if + # SQL is trying to reference an external table, we should check if it exists + ext_tables = [ + t for t in sql_tables if t not in ["temp_original_values", f"temp_{source_table_id}"] + ] + + # check the "external" table is in exhibit.db + for ext_table in ext_tables: + if not check_table_exists(ext_table): + raise RuntimeError( + f"Please make sure that {ext_table} used in the anonymising_set SQL" + f" for column {col_name} exists in the Exhibit database." + ) + + # insert the dataframe generated so far into the DB; we make sure to drop + # duplicates in case user didn't specify DISTINCT in his SQL query; + # the anon_df would typically be from UUIDs that are generated before + # categorical columns. + + # self.anon_df is what is generated BEFORE categorical columns, e.g UUID columns + if self.anon_df is None or self.anon_df.empty: + # self.generated_dfs has cat. columns generated BEFORE this particular column + if not self.generated_dfs: #pragma: no cover + existing_data = pd.DataFrame() + else: + existing_data = pd.concat(self.generated_dfs, axis=1) + else: + existing_data = pd.concat(self.generated_dfs + [self.anon_df], axis=1) + + # for convenience, we can reference original_values as a table - this could be + # original_values as they appear in the SPEC or in the SQL (not implemented yet) + if "temp_original_values" in sql_tables: + ov_df = self.spec_dict["columns"][col_name]["original_values"][[col_name]] + create_temp_table( + table_name="temp_original_values", + col_names=[col_name], + data=ov_df + ) + + # ensure the data going into DB is processed identically for join keys + for col in join_columns: + if is_numeric_dtype(existing_data[col]): + existing_data[col] = existing_data[col].astype(float) + elif is_datetime64_dtype(existing_data[col]): + existing_data[col] = existing_data[col].dt.strftime("%Y-%m-%d") + else: + existing_data[col] = existing_data[col].astype(str).str.strip() + + # dropping duplicates is a filter operation (even though it returns new data) + # unless we make an explicit copy of the de-duplicated dataframe, Pandas will + # trigger SettingWithCopy warning when trying to change any values. + existing_data_distinct = existing_data.drop_duplicates(subset=join_columns).copy() + existing_data_cols = list(existing_data.columns) + + # this function converts list of tuples into a dataframe anyway + create_temp_table( + table_name=f"temp_{source_table_id}", + col_names=existing_data_cols, + data=existing_data_distinct + ) + + # run the SQL from anon_set; note that the type of SQL query we'll likely see + # will be a cross-join (e.g. dates) so any speed optimisations would be welcome + result = execute_sql(anon_set) + + # create the dataframe with SQL data + sql_df = pd.DataFrame(data=result, columns=join_columns + aliased_columns) + + # ensure that the column of interest (the one we're potentially matching to original + # values) is typed to string - and not datetime or int, coming out of SQL. We will + # convert to datetime at the end, if that's what the type in the spec is. + sql_df[col_name] = sql_df[col_name].astype("str") + + # get the probabilities for the selected column in the external table + # at the level of the join key - use a hash for the combination of columns! 
+ + # Rather than use existing probabilities from the spec, treat them as a weight + # and apply them to the conditional, per-join key probabilities from external + # table. + probas = {} + orig_vals = None + + try: + orig_vals = self.spec_dict["columns"][col_name]["original_values"] + if isinstance(orig_vals, pd.DataFrame): + orig_vals = orig_vals.set_index(col_name) + # if we don't have original_values in the column spec, it's a date + except KeyError: + pass + + groups = sql_df.groupby(join_columns) + for i, group in groups: + + total_count = len(group) + proba_arr = (group + .value_counts() + .apply(lambda x: 0 if x == 0 else max(0.001, x / total_count)) + .reset_index(level=col_name) + .to_numpy(dtype="str") + ) + a, p = np.split(proba_arr, 2, axis=1) + a = a.flatten() + p = p.flatten().astype(float) + + if orig_vals is not None: + for j, val in enumerate(a): + if val in orig_vals.index: + p_weight = float(orig_vals.loc[val, "probability_vector"]) + p[j] = p[j] * p_weight + + # enusre p sums up to 1 + p = p * (1 / sum(p)) + probas[i[0]] = (a, p) + + # take the data generated so far and generate appropriate values based on key + groups = existing_data.groupby(join_columns).groups + temp_result = [] + + for group_key, group_index in groups.items(): + # if the key is missing, then the SQL filtered out the data for that key + # having a COALESCE in SQL would fix it, but in case it's also missing, + # we try to catch this edge case in code as well. + try: + new_data = self.rng.choice( + a=probas[group_key][0], p=probas[group_key][1], size=len(group_index)) + except KeyError: #pragma: no cover + new_data = [np.nan] * len(group_index) + + temp_result.append(pd.Series(data=new_data, index=group_index, name=col_name)) + + final_result = pd.concat(temp_result) + + # ensure we return the correct type for date columns + col_type = self.spec_dict["columns"][col_name]["type"] + if col_type == "date": + final_result = final_result.astype("datetime64[ns]") + + return final_result + + def _generate_using_custom_function(self, col_name, anon_set): + ''' + _summary_ + + Parameters + ---------- + col_name : _type_ + _description_ + anon_set : _type_ + _description_ + ''' + # self.anon_df is what is generated BEFORE categorical columns, e.g UUID columns + if self.anon_df is None or self.anon_df.empty: + # self.generated_dfs has cat. 
columns generated BEFORE this particular column + if not self.generated_dfs: + existing_data = pd.DataFrame() + else: + existing_data = pd.concat(self.generated_dfs, axis=1) + else: #pragma: no cover + existing_data = pd.concat(self.generated_dfs + [self.anon_df], axis=1) + + if existing_data.empty: + result = pd.Series( + data=[anon_set(pd.Series) for _ in range(self.num_rows)], + name=col_name + ) + return result + + result = existing_data.apply(anon_set, axis=1) + result.name = col_name + + return result diff --git a/exhibit/core/generate/missing.py b/exhibit/core/generate/missing.py index 76eb76c..8575c50 100644 --- a/exhibit/core/generate/missing.py +++ b/exhibit/core/generate/missing.py @@ -1,345 +1,345 @@ -''' -Methods to generate / deal with missing data -''' - -# Standard library imports -from itertools import groupby - -# External library imports -import numpy as np -import pandas as pd - -# Exhibit -from ..constants import MISSING_DATA_STR -from ..constraints import clean_up_constraint_string, get_constraint_mask -from ..utils import get_attr_values -from .continuous import generate_cont_val, scale_continuous_column - -# EXPORTABLE METHODS & CLASSES -# ============================ - -class MissingDataGenerator: - ''' - The class will copy the nearly complete anonimised dataframe - which has implications on the RAM footprint of the package - ''' - - def __init__(self, spec_dict, data): - ''' - Doc string - ''' - - self.spec_dict = spec_dict - self.data = data - self.dtypes = data.dtypes - self.nan_data = data - self.wt = spec_dict.get("weights_table", None) - - # only copy the data if there are conditional constraints meaning - # we can't be sure the required columns HADN'T HAD data already made - # missing in an earlier step. - if spec_dict["constraints"]["custom_constraints"]: - self.nan_data = data.copy() - - def add_missing_data(self): - ''' - Returns the original data, modified in place to include nan values - - Since Missing data (categorical) has its own weights, if we're adding - any Missing data to the dataframe, we must re-generate the contunious - variables to make sure we use the Missing data weights and not the original. - - We also need to re-scale each continuous column where we either added a nan - or where the categorical columns had Missing data added to them. - - 1) Find cells to exclude - there can't be nans in them - 2) Find linked and paired columns - nulls are propagated from the root column - 3) Add nulls to the remaining columns, always mindful of the indices from 1) - ''' - - missing_link_cols = self._find_columns_with_linked_missing_data() - geospatial_cols = [c for c, _ in get_attr_values( - self.spec_dict, "type", col_names=True, types=["geospatial"])] - - standalone_cols = ( - set(self.spec_dict["columns"].keys()) - - {col for col_set in missing_link_cols for col in col_set} - - set(self.spec_dict.get("derived_columns", {}).keys()) - - set(self.spec_dict["metadata"].get("uuid_columns", set()) or set()) - - set(geospatial_cols) - ) - - #1) Generate nulls in standalone columns, including continuous - # make sure to change the seed for each standalone column to avoid creating - # relationships where NA in Column A is also NA is column B if both have the - # same miss_probability. 
- for i, col_name in enumerate(sorted(standalone_cols)): - - # reset the generator for each column - rng = np.random.default_rng(seed=i) - - # special case for user linked columns which can have "Missing data" already - # if it appeared in the source for the linkage along with its probability. - # hierarchical linkage is not affected because having multiple NAN CAs for - # different HBs, for example, means the linkage is no longer hierarchical - # and doesn't map 1 to many and is instead many to many. - - if any(self.nan_data[col_name] == MISSING_DATA_STR): - self.nan_data[col_name] = ( - self.nan_data[col_name].map( - lambda x: pd.NA if x == MISSING_DATA_STR else x)) - continue - - miss_pct = self.spec_dict["columns"][col_name]["miss_probability"] - rands = rng.random(size=self.nan_data.shape[0]) # pylint: disable=no-member - col_type = self.spec_dict["columns"][col_name]["type"] - miss_value = pd.NaT if col_type == "date" else np.NaN - repl_column = self.nan_data[col_name] - - # numpy default type detection messes up date columns in Pandas - if col_type == "date": - repl_column = np.array(self.nan_data[col_name], dtype=object) - - self.nan_data[col_name] = np.where( - rands < miss_pct, - miss_value, repl_column) - - if col_type == "continuous": - precision = self.spec_dict["columns"][col_name].get("precision", None) - if precision == "integer": - self.nan_data[col_name] = ( - self.nan_data[col_name].astype("float").round().astype("Int64")) - - #2) Generate nulls in linked and paired columns - for cols in missing_link_cols: - - # reset the generator for each column (keeping the seed to maintain links) - rng = np.random.default_rng(seed=0) - - # miss probability will be the same for all columns in cols - miss_pct = self.spec_dict["columns"][next(iter(cols))]["miss_probability"] - # rands is shared for all columns in cols - rands = rng.random(size=self.nan_data.shape[0]) # pylint: disable=no-member - - self.nan_data.loc[:, list(cols)] = np.where( - (rands < miss_pct)[..., None], - (np.NaN, ) * len(cols), - self.nan_data.loc[:, list(cols)] - ) - - #3) Generate nulls in geospacial columns (lat / long) - # Similar to linked / paired, keeping the random seed the same between - # lat and long, only changing if more than one column to generate. - - for col in geospatial_cols: - geo_cols = [f"{col}_latitude", f"{col}_longitude"] - rng = np.random.default_rng(seed=0) - miss_pct = self.spec_dict["columns"][col]["miss_probability"] - rands = rng.random(size=self.nan_data.shape[0]) - - self.nan_data.loc[:, geo_cols] = np.where( - (rands < miss_pct)[..., None], - (np.NaN, ) * len(geo_cols), - self.nan_data.loc[:, geo_cols] - ) - - #4) Generate nulls in indices explicitly defined in custom_constraints - make_null_idx = self._find_make_null_idx() - - for idx, col_name in make_null_idx: - self.nan_data.loc[idx, col_name] = np.NaN - - #5) Re-introduce the saved no_nulls rows from the original data - not_null_idx = self._find_not_null_idx() - for idx, col_name in not_null_idx: - self.nan_data.loc[idx, col_name] = self.data.loc[idx, col_name] - - #6) Replace NA with missing data placeholder for categorical columns and - # re-generate continuous variables for those rows according to proper weights - # only go through this step if there are nulls in categorical columns - # and the spec_dict includes numerical columns that would be affected - # otherwise, return early. 
- cat_cols = list(self.spec_dict["metadata"]["categorical_columns"]) - num_cols = ( - set(self.spec_dict["metadata"]["numerical_columns"]) - - set(self.spec_dict.get("derived_columns", {}).keys())) - - if not (any(self.nan_data[cat_cols].isna()) and num_cols): - return self.nan_data.astype(self.dtypes) - - cat_mask = self.nan_data[cat_cols].isna().any(axis=1) - self.nan_data[cat_cols] = self.nan_data[cat_cols].fillna(MISSING_DATA_STR) - - for num_col in num_cols: - - # reset the generator for each column - rng = np.random.default_rng(seed=0) - - # Extract relevant num col variables from the user spec - num_col_dict = self.spec_dict["columns"][num_col] - - dist = num_col_dict["distribution"] - dist_params = num_col_dict["distribution_parameters"] - precision = num_col_dict["precision"] - - # if it's already NA, don't re-generate; it's NA for a reason! - num_mask = self.nan_data[num_col].isna() - mask = cat_mask & ~num_mask - - # it's possible to have the left side be Int64 type and the right side - # to be float64 (newly generated, unscaled); assigning different types - # doesn't work so we'll delay assignment and scale / cast type first! - unscaled_new_series = self.nan_data.loc[mask, cat_cols].apply( - func=generate_cont_val, - axis=1, - weights_table=self.wt, - num_col=num_col, - rng=rng, - dist=dist, - dist_params=dist_params - ) - - # rescale the masked section, but make sure to change target_sum! - # take a copy of the dist_params as full target_sum is used elsewhere - new_dist_params = dist_params.copy() - - if dist_params.get("target_sum", None) is not None: - old_sum = self.nan_data.loc[~mask, num_col].sum() - new_dist_params["target_sum"] = dist_params["target_sum"] - old_sum - - scaled_new_series = scale_continuous_column( - series=unscaled_new_series, - precision=precision, - **new_dist_params - ) - - # for some reason assigning a series back, rather than values - # creates nulls in certain rows, but not others; maybe Pandas bug. - # when the array is empty, Pandas generates a ValueError - if len(scaled_new_series) != 0: - self.nan_data.loc[mask, num_col] = scaled_new_series.values - - # replace Missing data back with np.nan - # since we're applying the function across all columns, including numerical, - # these can contain pd.NA which is a "special" type that will error out if - # trying to evaluate it against a string. Replace with a standard np.NAN. - self.nan_data = self.nan_data.applymap( - lambda x: np.nan if pd.isna(x) or x == MISSING_DATA_STR else x) - - return self.nan_data.astype(self.dtypes) - - def _find_columns_with_linked_missing_data(self): - ''' - Returns a list of column groupings where a missing value in one - means always a missing value in all in the grouping. The requirement - for that is that the missing_probability attribute of the spec is the - same for all such linked / paired columns. 
- - Returns a list with sets of columns - ''' - - result = [] - processed_pairs = set() - miss_probs = get_attr_values( - self.spec_dict, "miss_probability", col_names=True, types="categorical") - - for col, attrs in self.spec_dict["columns"].items(): - - if col in processed_pairs or attrs["type"] != "categorical": - continue - - pairs = set() - - # paired columns first - if attrs["paired_columns"]: - - pairs.update([col] + attrs["paired_columns"]) - - # linked groups - for i, linked_group in (self.spec_dict["linked_columns"] or []): - # zero numbered linked group is reserved for user defined linkage - if i == 0: - continue - - if col in linked_group: - pairs.update(linked_group) - - processed_pairs.update(pairs) - - # check that miss_probabilities are the same for all paired columns - miss_probs = sorted( - miss_probs, key=lambda x, pairs=pairs: x.col_name in pairs) - groups = groupby(miss_probs, lambda x, pairs=pairs: x.col_name in pairs) - - for key, group in groups: - - if key and len({v for k, v in group}) == 1: - - result.append(pairs) - - return result - - - def _find_make_null_idx(self): - ''' - The reason for keeping this and _find_not_null_idx separate is that - they are needed at different points in time - not_null_idx happens AFTER - all other sources of nan-generation have been exhausted and we're using - the data WITH nans to determine indices to pick up real values from the - original data passed in to the generator. - ''' - - cc = self.spec_dict["constraints"]["custom_constraints"] or {} - - make_null_idx = [] - - for _, constraint in cc.items(): - - cc_filter = constraint.get("filter", None) - cc_targets = constraint.get("targets", {}) - clean_cc_filter = clean_up_constraint_string(cc_filter) - cc_mask = get_constraint_mask(self.nan_data, clean_cc_filter) - - for target_str, action_str in cc_targets.items(): - - if "make_null" in action_str: - - target_cols = [x.strip() for x in target_str.split(",")] - - for target in target_cols: - - make_null_idx.append( - (self.nan_data.loc[cc_mask].index, target) - ) - - return make_null_idx - - def _find_not_null_idx(self): - ''' - Doc string - ''' - - cc = self.spec_dict["constraints"]["custom_constraints"] or {} - - not_null_idx = [] - - for _, constraint in cc.items(): - - cc_filter = constraint.get("filter", None) - cc_targets = constraint.get("targets", {}) - clean_cc_filter = clean_up_constraint_string(cc_filter) - cc_mask = get_constraint_mask(self.nan_data, clean_cc_filter) - - for target_str, action_str in cc_targets.items(): - - if "make_not_null" in action_str: - - target_cols = [x.strip() for x in target_str.split(",")] - - for target in target_cols: - - not_null_idx.append( - (self.nan_data.loc[cc_mask].index, target) - ) - - return not_null_idx +''' +Methods to generate / deal with missing data +''' + +# Standard library imports +from itertools import groupby + +# External library imports +import numpy as np +import pandas as pd + +# Exhibit +from ..constants import MISSING_DATA_STR +from ..constraints import clean_up_constraint_string, get_constraint_mask +from ..utils import get_attr_values +from .continuous import generate_cont_val, scale_continuous_column + +# EXPORTABLE METHODS & CLASSES +# ============================ + +class MissingDataGenerator: + ''' + The class will copy the nearly complete anonimised dataframe + which has implications on the RAM footprint of the package + ''' + + def __init__(self, spec_dict, data): + ''' + Doc string + ''' + + self.spec_dict = spec_dict + self.data = data + self.dtypes = 
data.dtypes + self.nan_data = data + self.wt = spec_dict.get("weights_table", None) + + # only copy the data if there are conditional constraints meaning + # we can't be sure the required columns HADN'T HAD data already made + # missing in an earlier step. + if spec_dict["constraints"]["custom_constraints"]: + self.nan_data = data.copy() + + def add_missing_data(self): + ''' + Returns the original data, modified in place to include nan values + + Since Missing data (categorical) has its own weights, if we're adding + any Missing data to the dataframe, we must re-generate the contunious + variables to make sure we use the Missing data weights and not the original. + + We also need to re-scale each continuous column where we either added a nan + or where the categorical columns had Missing data added to them. + + 1) Find cells to exclude - there can't be nans in them + 2) Find linked and paired columns - nulls are propagated from the root column + 3) Add nulls to the remaining columns, always mindful of the indices from 1) + ''' + + missing_link_cols = self._find_columns_with_linked_missing_data() + geospatial_cols = [c for c, _ in get_attr_values( + self.spec_dict, "type", col_names=True, types=["geospatial"])] + + standalone_cols = ( + set(self.spec_dict["columns"].keys()) - + {col for col_set in missing_link_cols for col in col_set} - + set(self.spec_dict.get("derived_columns", {}).keys()) - + set(self.spec_dict["metadata"].get("uuid_columns", set()) or set()) - + set(geospatial_cols) + ) + + #1) Generate nulls in standalone columns, including continuous + # make sure to change the seed for each standalone column to avoid creating + # relationships where NA in Column A is also NA is column B if both have the + # same miss_probability. + for i, col_name in enumerate(sorted(standalone_cols)): + + # reset the generator for each column + rng = np.random.default_rng(seed=i) + + # special case for user linked columns which can have "Missing data" already + # if it appeared in the source for the linkage along with its probability. + # hierarchical linkage is not affected because having multiple NAN CAs for + # different HBs, for example, means the linkage is no longer hierarchical + # and doesn't map 1 to many and is instead many to many. 
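Not part of the patch: a minimal sketch with made-up numbers of why the standalone-column loop above reseeds the generator for every column. Reusing one seed across columns with equal miss_probability would give each column the same NaN mask and quietly link them.

import numpy as np

n_rows, miss_pct = 1000, 0.3

# same seed for two columns -> identical masks -> accidental linkage
shared_a = np.random.default_rng(seed=0).random(n_rows) < miss_pct
shared_b = np.random.default_rng(seed=0).random(n_rows) < miss_pct
print(np.array_equal(shared_a, shared_b))   # True

# a distinct seed per column (as in the loop above) -> independent masks
indep_a = np.random.default_rng(seed=0).random(n_rows) < miss_pct
indep_b = np.random.default_rng(seed=1).random(n_rows) < miss_pct
print(np.array_equal(indep_a, indep_b))     # False in practice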
+ + if any(self.nan_data[col_name] == MISSING_DATA_STR): + self.nan_data[col_name] = ( + self.nan_data[col_name].map( + lambda x: pd.NA if x == MISSING_DATA_STR else x)) + continue + + miss_pct = self.spec_dict["columns"][col_name]["miss_probability"] + rands = rng.random(size=self.nan_data.shape[0]) # pylint: disable=no-member + col_type = self.spec_dict["columns"][col_name]["type"] + miss_value = pd.NaT if col_type == "date" else np.NaN + repl_column = self.nan_data[col_name] + + # numpy default type detection messes up date columns in Pandas + if col_type == "date": + repl_column = np.array(self.nan_data[col_name], dtype=object) + + self.nan_data[col_name] = np.where( + rands < miss_pct, + miss_value, repl_column) + + if col_type == "continuous": + precision = self.spec_dict["columns"][col_name].get("precision", None) + if precision == "integer": + self.nan_data[col_name] = ( + self.nan_data[col_name].astype("float").round().astype("Int64")) + + #2) Generate nulls in linked and paired columns + for cols in missing_link_cols: + + # reset the generator for each column (keeping the seed to maintain links) + rng = np.random.default_rng(seed=0) + + # miss probability will be the same for all columns in cols + miss_pct = self.spec_dict["columns"][next(iter(cols))]["miss_probability"] + # rands is shared for all columns in cols + rands = rng.random(size=self.nan_data.shape[0]) # pylint: disable=no-member + + self.nan_data.loc[:, list(cols)] = np.where( + (rands < miss_pct)[..., None], + (np.NaN, ) * len(cols), + self.nan_data.loc[:, list(cols)] + ) + + #3) Generate nulls in geospacial columns (lat / long) + # Similar to linked / paired, keeping the random seed the same between + # lat and long, only changing if more than one column to generate. + + for col in geospatial_cols: + geo_cols = [f"{col}_latitude", f"{col}_longitude"] + rng = np.random.default_rng(seed=0) + miss_pct = self.spec_dict["columns"][col]["miss_probability"] + rands = rng.random(size=self.nan_data.shape[0]) + + self.nan_data.loc[:, geo_cols] = np.where( + (rands < miss_pct)[..., None], + (np.NaN, ) * len(geo_cols), + self.nan_data.loc[:, geo_cols] + ) + + #4) Generate nulls in indices explicitly defined in custom_constraints + make_null_idx = self._find_make_null_idx() + + for idx, col_name in make_null_idx: + self.nan_data.loc[idx, col_name] = np.NaN + + #5) Re-introduce the saved no_nulls rows from the original data + not_null_idx = self._find_not_null_idx() + for idx, col_name in not_null_idx: + self.nan_data.loc[idx, col_name] = self.data.loc[idx, col_name] + + #6) Replace NA with missing data placeholder for categorical columns and + # re-generate continuous variables for those rows according to proper weights + # only go through this step if there are nulls in categorical columns + # and the spec_dict includes numerical columns that would be affected + # otherwise, return early. 
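The step-6 code that follows regenerates continuous values only for rows that gained Missing data and rescales just that slice, so the column still meets the overall target_sum. A toy illustration of the bookkeeping with made-up numbers; the actual scaling is delegated to scale_continuous_column.

target_sum = 200
untouched_sum = 150                     # sum over the rows kept as-is (~mask)
remaining = target_sum - untouched_sum  # what the regenerated slice must add up to

unscaled = [2.0, 3.0, 5.0]              # freshly generated, unscaled values
scale = remaining / sum(unscaled)
scaled = [round(v * scale) for v in unscaled]
print(scaled, sum(scaled))              # [10, 15, 25] 50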
+ cat_cols = list(self.spec_dict["metadata"]["categorical_columns"]) + num_cols = ( + set(self.spec_dict["metadata"]["numerical_columns"]) - + set(self.spec_dict.get("derived_columns", {}).keys())) + + if not (any(self.nan_data[cat_cols].isna()) and num_cols): + return self.nan_data.astype(self.dtypes) + + cat_mask = self.nan_data[cat_cols].isna().any(axis=1) + self.nan_data[cat_cols] = self.nan_data[cat_cols].fillna(MISSING_DATA_STR) + + for num_col in num_cols: + + # reset the generator for each column + rng = np.random.default_rng(seed=0) + + # Extract relevant num col variables from the user spec + num_col_dict = self.spec_dict["columns"][num_col] + + dist = num_col_dict["distribution"] + dist_params = num_col_dict["distribution_parameters"] + precision = num_col_dict["precision"] + + # if it's already NA, don't re-generate; it's NA for a reason! + num_mask = self.nan_data[num_col].isna() + mask = cat_mask & ~num_mask + + # it's possible to have the left side be Int64 type and the right side + # to be float64 (newly generated, unscaled); assigning different types + # doesn't work so we'll delay assignment and scale / cast type first! + unscaled_new_series = self.nan_data.loc[mask, cat_cols].apply( + func=generate_cont_val, + axis=1, + weights_table=self.wt, + num_col=num_col, + rng=rng, + dist=dist, + dist_params=dist_params + ) + + # rescale the masked section, but make sure to change target_sum! + # take a copy of the dist_params as full target_sum is used elsewhere + new_dist_params = dist_params.copy() + + if dist_params.get("target_sum", None) is not None: + old_sum = self.nan_data.loc[~mask, num_col].sum() + new_dist_params["target_sum"] = dist_params["target_sum"] - old_sum + + scaled_new_series = scale_continuous_column( + series=unscaled_new_series, + precision=precision, + **new_dist_params + ) + + # for some reason assigning a series back, rather than values + # creates nulls in certain rows, but not others; maybe Pandas bug. + # when the array is empty, Pandas generates a ValueError + if len(scaled_new_series) != 0: + self.nan_data.loc[mask, num_col] = scaled_new_series.values + + # replace Missing data back with np.nan + # since we're applying the function across all columns, including numerical, + # these can contain pd.NA which is a "special" type that will error out if + # trying to evaluate it against a string. Replace with a standard np.NAN. + self.nan_data = self.nan_data.map( + lambda x: np.nan if pd.isna(x) or x == MISSING_DATA_STR else x) + + return self.nan_data.astype(self.dtypes) + + def _find_columns_with_linked_missing_data(self): + ''' + Returns a list of column groupings where a missing value in one + means always a missing value in all in the grouping. The requirement + for that is that the missing_probability attribute of the spec is the + same for all such linked / paired columns. 
+ + Returns a list with sets of columns + ''' + + result = [] + processed_pairs = set() + miss_probs = get_attr_values( + self.spec_dict, "miss_probability", col_names=True, types="categorical") + + for col, attrs in self.spec_dict["columns"].items(): + + if col in processed_pairs or attrs["type"] != "categorical": + continue + + pairs = set() + + # paired columns first + if attrs["paired_columns"]: + + pairs.update([col] + attrs["paired_columns"]) + + # linked groups + for i, linked_group in (self.spec_dict["linked_columns"] or []): + # zero numbered linked group is reserved for user defined linkage + if i == 0: + continue + + if col in linked_group: + pairs.update(linked_group) + + processed_pairs.update(pairs) + + # check that miss_probabilities are the same for all paired columns + miss_probs = sorted( + miss_probs, key=lambda x, pairs=pairs: x.col_name in pairs) + groups = groupby(miss_probs, lambda x, pairs=pairs: x.col_name in pairs) + + for key, group in groups: + + if key and len({v for k, v in group}) == 1: + + result.append(pairs) + + return result + + + def _find_make_null_idx(self): + ''' + The reason for keeping this and _find_not_null_idx separate is that + they are needed at different points in time - not_null_idx happens AFTER + all other sources of nan-generation have been exhausted and we're using + the data WITH nans to determine indices to pick up real values from the + original data passed in to the generator. + ''' + + cc = self.spec_dict["constraints"]["custom_constraints"] or {} + + make_null_idx = [] + + for _, constraint in cc.items(): + + cc_filter = constraint.get("filter", None) + cc_targets = constraint.get("targets", {}) + clean_cc_filter = clean_up_constraint_string(cc_filter) + cc_mask = get_constraint_mask(self.nan_data, clean_cc_filter) + + for target_str, action_str in cc_targets.items(): + + if "make_null" in action_str: + + target_cols = [x.strip() for x in target_str.split(",")] + + for target in target_cols: + + make_null_idx.append( + (self.nan_data.loc[cc_mask].index, target) + ) + + return make_null_idx + + def _find_not_null_idx(self): + ''' + Doc string + ''' + + cc = self.spec_dict["constraints"]["custom_constraints"] or {} + + not_null_idx = [] + + for _, constraint in cc.items(): + + cc_filter = constraint.get("filter", None) + cc_targets = constraint.get("targets", {}) + clean_cc_filter = clean_up_constraint_string(cc_filter) + cc_mask = get_constraint_mask(self.nan_data, clean_cc_filter) + + for target_str, action_str in cc_targets.items(): + + if "make_not_null" in action_str: + + target_cols = [x.strip() for x in target_str.split(",")] + + for target in target_cols: + + not_null_idx.append( + (self.nan_data.loc[cc_mask].index, target) + ) + + return not_null_idx diff --git a/exhibit/core/generate/tests/test_derived.py b/exhibit/core/generate/tests/test_derived.py index a3be948..b41dec8 100644 --- a/exhibit/core/generate/tests/test_derived.py +++ b/exhibit/core/generate/tests/test_derived.py @@ -1,103 +1,103 @@ -''' -Test the generation of continuous columns & values -''' - -# Standard library imports -import unittest - -# External library imports -import pandas as pd -from pandas.testing import assert_series_equal -import numpy as np - -# Module under test -from exhibit.core.generate import derived as tm - -class derivedTests(unittest.TestCase): - ''' - Doc string - ''' - - def test_generate_derived_column_basic(self): - ''' - All of the work is done by pandas.eval() method; - we're just testing column names with whitespace are OK - ''' 
- - test_df = pd.DataFrame( - data=np.ones((5, 2)), - columns=["Hello World", "A"]) - - calc = "Hello World + A" - - self.assertEqual(tm.generate_derived_column(test_df, calc).sum(), 10) - - def test_generate_derived_column_groupby(self): - ''' - We want to allow users to create aggregated columns, like peer values. - Make sure that column names are enclosed in single spaces. - ''' - - test_df = pd.DataFrame( - data={ - "C1":["A", "A", "B", "B", "C", "C"], #locations - "C2":["spam", "eggs"] * 3, #groupby dimension(s) - "C3":[1, 10] * 3 #aggregation column - } - ) - - calc = "df.groupby('C2')['C3'].sum()" - - expected = pd.Series([3, 30, 3, 30, 3, 30], name="C3") - - assert_series_equal( - left=tm.generate_derived_column(test_df, calc), - right=expected, - check_dtype=False - ) - - def test_generate_derived_column_with_timestamp(self): - ''' - We want to allow users to create timestamps using generated columns with - hours, minutes and seconds. Bear in mind that missing values in all column - types are represented as np.nan. - ''' - - dates = pd.date_range( - start="01-01-2022", - periods=3, - freq="M", - ) - - test_df = pd.DataFrame( - data={ - "dates" : dates, - "hours" : pd.Categorical(["1", "2", np.nan]), - "minutes": [0, np.nan, 59], - "seconds": [0, 1, 10], - } - ) - - calc = "@create_timestamp(hours, minutes, seconds)" - - expected = pd.Series([ - "2022-01-31 01:00:00", - "2022-02-28 02:00:01", - "2022-03-31 00:59:10" - ]) - - # can add dates and timedelta timestamps easily - result = ( - test_df["dates"] + tm.generate_derived_column(test_df, calc) - ).astype(str) - - assert_series_equal( - left=result, - right=expected, - check_dtype=False - ) - -if __name__ == "__main__" and __package__ is None: - #overwrite __package__ builtin as per PEP 366 - __package__ = "exhibit" - unittest.main(warnings="ignore") +''' +Test the generation of continuous columns & values +''' + +# Standard library imports +import unittest + +# External library imports +import pandas as pd +from pandas.testing import assert_series_equal +import numpy as np + +# Module under test +from exhibit.core.generate import derived as tm + +class derivedTests(unittest.TestCase): + ''' + Doc string + ''' + + def test_generate_derived_column_basic(self): + ''' + All of the work is done by pandas.eval() method; + we're just testing column names with whitespace are OK + ''' + + test_df = pd.DataFrame( + data=np.ones((5, 2)), + columns=["Hello World", "A"]) + + calc = "Hello World + A" + + self.assertEqual(tm.generate_derived_column(test_df, calc).sum(), 10) + + def test_generate_derived_column_groupby(self): + ''' + We want to allow users to create aggregated columns, like peer values. + Make sure that column names are enclosed in single spaces. + ''' + + test_df = pd.DataFrame( + data={ + "C1":["A", "A", "B", "B", "C", "C"], #locations + "C2":["spam", "eggs"] * 3, #groupby dimension(s) + "C3":[1, 10] * 3 #aggregation column + } + ) + + calc = "df.groupby('C2')['C3'].sum()" + + expected = pd.Series([3, 30, 3, 30, 3, 30], name="C3") + + assert_series_equal( + left=tm.generate_derived_column(test_df, calc), + right=expected, + check_dtype=False + ) + + def test_generate_derived_column_with_timestamp(self): + ''' + We want to allow users to create timestamps using generated columns with + hours, minutes and seconds. Bear in mind that missing values in all column + types are represented as np.nan. 
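For context on the dates fixture below: Pandas 2.2 deprecates the month-end frequency alias "M" for date_range in favour of "ME", which is why the test now passes freq="ME". A quick check outside the patch:

import pandas as pd

# month-end stamps; on Pandas 2.2+ the old freq="M" still works for now but
# emits a FutureWarning pointing to "ME"
print(pd.date_range(start="01-01-2022", periods=3, freq="ME"))
# 2022-01-31, 2022-02-28, 2022-03-31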
+ ''' + + dates = pd.date_range( + start="01-01-2022", + periods=3, + freq="ME", + ) + + test_df = pd.DataFrame( + data={ + "dates" : dates, + "hours" : pd.Categorical(["1", "2", np.nan]), + "minutes": [0, np.nan, 59], + "seconds": [0, 1, 10], + } + ) + + calc = "@create_timestamp(hours, minutes, seconds)" + + expected = pd.Series([ + "2022-01-31 01:00:00", + "2022-02-28 02:00:01", + "2022-03-31 00:59:10" + ]) + + # can add dates and timedelta timestamps easily + result = ( + test_df["dates"] + tm.generate_derived_column(test_df, calc) + ).astype(str) + + assert_series_equal( + left=result, + right=expected, + check_dtype=False + ) + +if __name__ == "__main__" and __package__ is None: + #overwrite __package__ builtin as per PEP 366 + __package__ = "exhibit" + unittest.main(warnings="ignore") diff --git a/exhibit/core/generate/tests/test_missing.py b/exhibit/core/generate/tests/test_missing.py index 002b3aa..10b4262 100644 --- a/exhibit/core/generate/tests/test_missing.py +++ b/exhibit/core/generate/tests/test_missing.py @@ -1,618 +1,618 @@ -''' -Test the handling & generation of missing values -''' - -# Standard library imports -import unittest -from collections import namedtuple -from unittest.mock import Mock, patch - -# External library imports -import pandas as pd -import numpy as np -from pandas.testing import assert_frame_equal, assert_series_equal - -# Exhibit imports -from exhibit.db import db_util -from exhibit.core.constants import MISSING_DATA_STR, ORIGINAL_VALUES_PAIRED -from exhibit.core.tests.test_reference import temp_exhibit - -# Module under test -from exhibit.core.generate import missing as tm - -class missingDataTests(unittest.TestCase): - ''' - Doc string - ''' - - @classmethod - def tearDownClass(cls): - ''' - Clean up local exhibit.db from temp tables - ''' - - db_util.purge_temp_tables() - - def test_feeding_data_to_missing_generator(self): - ''' - Doc string - ''' - - test_df = pd.DataFrame() - - path = "exhibit.core.generate.missing.MissingDataGenerator.__init__" - with patch(path) as mock_init: - mock_init.return_value = None - generatorMock = tm.MissingDataGenerator(Mock(), Mock()) - - setattr(generatorMock, "data", test_df) - - self.assertTrue( - isinstance(generatorMock.data, - pd.DataFrame)) - - def test_never_null_indices_are_identified(self): - ''' - Some cells can't ever have nulls due to custom constraints. - Filter and Partition fields are optional when defining custom - constraints. 
- ''' - - test_dict = { - "constraints" : { - "custom_constraints": { - "cc1" : { - "filter" : "~Test.isna()", - "targets" : { - "Num" : "make_not_null" - } - }, - } - } - } - - test_data = pd.DataFrame(data={ - "Test" : [1, 2, 3, pd.NA, 5], - "Num" : [1, 2, 3, 4, 5] - }) - - test_gen = tm.MissingDataGenerator(test_dict, test_data) - - not_null_idx = test_gen._find_not_null_idx() - - result = not_null_idx[0] - - assert_series_equal( - test_data.loc[result], - test_data.loc[[0, 1, 2, 4], "Num"]) - - def test_paired_columns_with_missing_data_identified(self): - ''' - Doc string - ''' - - test_dict = { - "columns" : { - "A" : { - "type" : "categorical", - "paired_columns" : ["B"], - "miss_probability": 0.5, - "original_values" : ORIGINAL_VALUES_PAIRED - }, - "B" : { - "type" : "categorical", - "paired_columns" : ["A"], - "miss_probability" : 0.5, - "original_values" : pd.DataFrame() - }, - "C" : { - "type" : "categorical", - "paired_columns" : ["D"], - "miss_probability" : 0.6, - "original_values" : pd.DataFrame() - }, - "D" : { - "type" : "categorical", - "paired_columns" : ["C"], - "miss_probability" : 0.7, - "original_values" : ORIGINAL_VALUES_PAIRED - } - }, - "constraints" : { - "custom_constraints" : {}, - - }, - "linked_columns" : [] - } - - expected = [ - {"A", "B"}, - ] - - test_gen = tm.MissingDataGenerator(test_dict, Mock()) - result = test_gen._find_columns_with_linked_missing_data() - - self.assertCountEqual(expected, result) - - def test_linked_columns_with_missing_data_identified(self): - ''' - Doc string - ''' - - test_dict = { - "columns" : { - "A" : { - "type" : "categorical", - "paired_columns" : [], - "miss_probability": 0.5, - "original_values" : pd.DataFrame() - }, - "B" : { - "type" : "categorical", - "paired_columns" : [], - "miss_probability" : 0.5, - "original_values" : pd.DataFrame() - }, - "C" : { - "type" : "categorical", - "paired_columns" : [], - "miss_probability" : 0.6, - "original_values" : pd.DataFrame() - }, - "D" : { - "type" : "categorical", - "paired_columns" : [], - "miss_probability" : 0.5, - "original_values" : pd.DataFrame() - } - }, - "constraints" : { - "custom_constraints" : {}, - }, - "linked_columns" : [ - (1, ["A", "B"]), - (2, ["C", "D"]) - ] - } - - expected = [ - {"A", "B"}, - ] - - test_gen = tm.MissingDataGenerator(test_dict, Mock()) - result = test_gen._find_columns_with_linked_missing_data() - - self.assertCountEqual(expected, result) - - def test_linked_and_paired_columns_with_missing_data_identified(self): - ''' - Doc string - ''' - - test_dict = { - "columns" : { - "A" : { - "type" : "categorical", - "paired_columns" : ["B"], - "miss_probability": 0.5, - "original_values" : pd.DataFrame() - }, - "B" : { - "type" : "categorical", - "paired_columns" : ["A"], - "miss_probability" : 0.5, - "original_values" : ORIGINAL_VALUES_PAIRED - }, - "C" : { - "type" : "categorical", - "paired_columns" : [], - "miss_probability" : 0.5, - "original_values" : pd.DataFrame() - } - }, - "constraints" : { - "custom_constraints" : {}, - }, - "linked_columns" : [ - (0, ["A", "C"]), - ] - } - - expected = [ - {"A", "B", "C"}, - ] - - test_gen = tm.MissingDataGenerator(test_dict, Mock()) - result = test_gen._find_columns_with_linked_missing_data() - - self.assertTrue(expected[0], result[0]) - - def test_make_null_constraint_in_columns(self): - ''' - When we're adding nulls to categorical columns, the non-null - numerical values must be re-calulcated and re-scaled because - Missing data (null identifier in categorical columns) can have - vastly different 
weights compared to the old values. However, - we shouldn't rescaled the whole column anew, just the added values. - ''' - - Weights = namedtuple("Weights", ["weight", "equal_weight"]) - - #demo weights table - weights_df = pd.DataFrame( - data=[ - ("C", "A", "spam", Weights(0.5, 0.5)), - ("C", "A", "eggs", Weights(0.5, 0.5)), - ("C", "B", "bacon", Weights(0.5, 0.5)), - ("C", "A", MISSING_DATA_STR, Weights(0.5, 0.5)), - ("C", "B", MISSING_DATA_STR, Weights(0.5, 0.5)), - ], - columns=["num_col", "cat_col", "cat_value", "weights"]) - - #reformat into dictionary - weights = ( - weights_df - .set_index(["num_col", "cat_col", "cat_value"]) - .to_dict(orient="index") - ) - - test_dict = { - "_rng" : np.random.default_rng(seed=0), - "metadata" : { - "categorical_columns": ["A", "B"], - "numerical_columns" : ["C"] - }, - "columns": { - "A": { - "type" : "categorical", - "paired_columns" : [], - "miss_probability": 0, - "original_values" : pd.DataFrame() - - }, - "B": { - "type" : "categorical", - "paired_columns" : [], - "miss_probability": 0, - "original_values" : pd.DataFrame() - - }, - "C": { - "type" : "continuous", - "precision" : "integer", - "distribution" : "weighted_uniform", - "distribution_parameters": { - "dispersion": 0, - "target_sum" : 10, - }, - "miss_probability": 0 - }, - - }, - "constraints" : { - "custom_constraints": { - "cc1" : { - "filter" : "A == 'spam'", - "targets" : { - "B" : "make_null" - } - } - } - }, - "linked_columns" : [], - "weights_table" : weights, - "weights_table_target_cols": ["A", "B"] - } - - test_data = pd.DataFrame(data={ - "A" : ["spam", "spam", "eggs", "eggs"], - "B" : ["bacon"] * 4, - "C" : [10, 20, 4, 4], - }) - - expected = pd.DataFrame(data={ - "A" : ["spam", "spam", "eggs", "eggs"], - "B" : [pd.NA, pd.NA, "bacon", "bacon"], - "C" : [1, 1, 4, 4], - }) - - test_gen = tm.MissingDataGenerator(test_dict, test_data) - result = test_gen.add_missing_data() - - assert_frame_equal(result, expected, check_dtype=False) - - def test_not_null_constraint_in_columns(self): - ''' - Doc string - ''' - - test_dict = { - "_rng" : np.random.default_rng(seed=0), - "metadata" : { - "categorical_columns": ["C", "D"], - "numerical_columns" : [] - }, - "columns": { - "C": { - "type" : "categorical", - "paired_columns" : [], - "miss_probability": 0.2, - "original_values" : pd.DataFrame() - - }, - "D": { - "type" : "categorical", - "paired_columns" : [], - "miss_probability": 0.5, - "original_values" : pd.DataFrame() - - } - }, - "constraints" : { - "custom_constraints": { - "cc1" : { - "filter" : "~C.isna()", - "targets" : { - "D" : "make_not_null" - } - } - } - }, - "linked_columns" : [] - } - - test_data = pd.DataFrame(data={ - "C" : np.random.random(1000), #pylint: disable=no-member - "D" : np.random.random(1000), #pylint: disable=no-member - - }) - - test_gen = tm.MissingDataGenerator(test_dict, test_data) - result = test_gen.add_missing_data() - - self.assertTrue(result["C"].isna().any()) - self.assertTrue(result["D"].isna().any()) - self.assertFalse(result.loc[~result["C"].isna(), "D"].isna().any()) - - def test_paired_columns_are_respected_for_missing_data(self): - ''' - Doc string - ''' - - test_dict = { - "_rng" : np.random.default_rng(seed=0), - "metadata" : { - "categorical_columns": ["A", "B"], - "numerical_columns" : [] - }, - "columns": { - "A": { - "type" : "categorical", - "paired_columns" : ["B"], - "miss_probability": 0.5, - "original_values" : pd.DataFrame() - }, - "B": { - "type" : "categorical", - "paired_columns" : ["A"], - "miss_probability": 0.5, - 
"original_values" : ORIGINAL_VALUES_PAIRED - - }, - }, - "constraints" : { - "custom_constraints" : {}, - }, - "linked_columns" : [], - } - - test_data = pd.DataFrame(data={ - "A" : np.random.random(1000), #pylint: disable=no-member - "B" : np.random.random(1000), #pylint: disable=no-member - }) - - test_gen = tm.MissingDataGenerator(test_dict, test_data) - result = test_gen.add_missing_data() - - self.assertTrue(result["A"].isna().any()) - self.assertTrue(result["B"].isna().any()) - assert_series_equal(result["B"].isna(), result["A"].isna(), check_names=False) - - def test_missing_data_added_to_standalone_categorical_column(self): - ''' - Doc string - ''' - - test_dict = { - "_rng" : np.random.default_rng(seed=0), - "metadata" : { - "categorical_columns": ["A", "B"], - "numerical_columns" : [] - }, - "columns": { - "A": { - "type" : "categorical", - "paired_columns" : [], - "miss_probability": 1, - "original_values" : pd.DataFrame() - }, - "B": { - "type" : "categorical", - "paired_columns" : [], - "miss_probability": 0, - "original_values" : pd.DataFrame() - - }, - }, - "constraints" : { - "custom_constraints" : {} - }, - "linked_columns" : [], - } - - test_data = pd.DataFrame(data={ - "A" : list("ABCDE"), - "B" : list("ABCDE") - }) - - expected = pd.DataFrame(data={ - "A" : [pd.NA] * 5, - "B" : list("ABCDE") - }) - - test_gen = tm.MissingDataGenerator(test_dict, test_data) - result = test_gen.add_missing_data() - - assert_frame_equal(expected, result, check_dtype=False) - - def test_continuous_column_adjusted_to_categorical_missing_data(self): - ''' - Remember that continuous columns depend on values in categorical columns - in the same row for their weights, including for Missing data values. - Adding Missing data also changes the target_sum of the continuous column - so we need to re-scale the whole column after adding missing data either - to it or to the categorical columns. - - We rely on np.random to generate reasonable number of NAs with 0.5 prob, - but that can sometimes fail so we ensure that the seed is constant. 
- ''' - - Weights = namedtuple("Weights", ["weight", "equal_weight"]) - - #demo weights table - weights_df = pd.DataFrame( - data=[ - ("C2", "C1", "A", Weights(0.1, 0.5)), - ("C2", "C1", "B", Weights(0.9, 0.5)), - ("C2", "C1", MISSING_DATA_STR, Weights(0.2, 0.5)), - ], - columns=["num_col", "cat_col", "cat_value", "weights"]) - - #reformat into dictionary - weights = ( - weights_df - .set_index(["num_col", "cat_col", "cat_value"]) - .to_dict(orient="index") - ) - - test_dict = { - "_rng" : np.random.default_rng(seed=0), - "metadata": { - "categorical_columns": [ - "C1" - ], - "numerical_columns" : [ - "C2" - ] - }, - "columns": { - "C1": { - "type" : "categorical", - "paired_columns" : [], - "miss_probability": 0.5, - "original_values" : pd.DataFrame() - }, - "C2": { - "type" : "continuous", - "precision" : "integer", - "distribution" : "weighted_uniform", - "distribution_parameters": { - "uniform_base_value" : 100, - "dispersion": 0, - "target_sum" : 200, # factor of two - }, - "miss_probability": 0 - }, - }, - "constraints" : { - "custom_constraints" : {} - }, - "linked_columns" : [], - "weights_table" : weights, - "weights_table_target_cols": ["C1"] - } - - test_data = pd.DataFrame(data={ - "C1" : ["A", "A", "A", "B", "B"] * 20, - "C2" : [1] * 100 - }) - - - test_gen = tm.MissingDataGenerator(test_dict, test_data) - result = test_gen.add_missing_data() - - self.assertTrue(result["C1"].isna().any()) - self.assertEqual(result["C2"].sum(), 200) - - def test_user_linked_columns_having_missing_data(self): - ''' - Because user linked columns can have complex relationships, we - need to make sure missing data is handled correctly. - ''' - - test_df = pd.DataFrame(data={ - "A": ["spam", "spam", "eggs", "eggs", "spam"], - "B": ["bacon", "spamspam", np.nan, "parrot", "bacon"], - "C": range(5) - }) - - test_dict = { - "metadata" : { - "number_of_rows" : 1000 - } - } - - fromdata_test = { - "linked_columns" : ["A", "B"] - } - - _, df = temp_exhibit( - filename=test_df, fromdata_namespace=fromdata_test, - test_spec_dict=test_dict, return_spec=False) - - self.assertTrue(df.query("A == 'eggs'")["B"].isna().any()) - - def test_categorical_numerical_missing_data_with_make_null_cc(self): - ''' - Typing issues (categorical vs object) can cause bugs when we have categorical columns, - a make_null custom constraint, a filter casting categorical column to integers (which - assumes object, not categorical - because you can't cast categorical to int if there - is a Missing data categorical value - without removing unused categories first) AND - a numerical column. Commenting out the numerical column used to pass the test, and - uncommenting it used to fail it - which is wrong. - - Without extra checks, AGE.astype('int') will fail if AGE is dtype="category" because - it'll have numbers as strings (which can be cast to int) and "invisible" Missing data - which can't. 
- ''' - - test_df = pd.DataFrame(data={ - "AGE": ["1", "2", "3", "4", "4"], - "NULLED" : list("ABCAB"), - "NUMS": range(5) - }) - - test_dict = { - "metadata" : { - "number_of_rows" : 10, - "categorical_columns": ["AGE", "NULLED"], - "numerical_columns" : ["NUMS"] - }, - "constraints" : { - "custom_constraints" : { - "test_nulls" : { - "filter" : "AGE.astype('int') > 1", - "targets" : {"NULLED" : "make_null"} - } - } - } - } - - _, df = temp_exhibit(filename=test_df, test_spec_dict=test_dict, return_spec=False) - - self.assertTrue(df.NULLED.isna().any()) - -if __name__ == "__main__" and __package__ is None: - #overwrite __package__ builtin as per PEP 366 - __package__ = "exhibit" - unittest.main(warnings="ignore") +''' +Test the handling & generation of missing values +''' + +# Standard library imports +import unittest +from collections import namedtuple +from unittest.mock import Mock, patch + +# External library imports +import pandas as pd +import numpy as np +from pandas.testing import assert_frame_equal, assert_series_equal + +# Exhibit imports +from exhibit.db import db_util +from exhibit.core.constants import MISSING_DATA_STR, ORIGINAL_VALUES_PAIRED +from exhibit.core.tests.test_reference import temp_exhibit + +# Module under test +from exhibit.core.generate import missing as tm + +class missingDataTests(unittest.TestCase): + ''' + Doc string + ''' + + @classmethod + def tearDownClass(cls): + ''' + Clean up local exhibit.db from temp tables + ''' + + db_util.purge_temp_tables() + + def test_feeding_data_to_missing_generator(self): + ''' + Doc string + ''' + + test_df = pd.DataFrame() + + path = "exhibit.core.generate.missing.MissingDataGenerator.__init__" + with patch(path) as mock_init: + mock_init.return_value = None + generatorMock = tm.MissingDataGenerator(Mock(), Mock()) + + setattr(generatorMock, "data", test_df) + + self.assertTrue( + isinstance(generatorMock.data, + pd.DataFrame)) + + def test_never_null_indices_are_identified(self): + ''' + Some cells can't ever have nulls due to custom constraints. + Filter and Partition fields are optional when defining custom + constraints. 
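Conceptually (the real work is done by clean_up_constraint_string and get_constraint_mask), the "~Test.isna()" filter in the test below is evaluated against the dataframe to produce a boolean mask, and the index of the matching rows marks the Num cells that must never be nulled. A rough sketch using the same toy data:

import pandas as pd

test_data = pd.DataFrame({"Test": [1, 2, 3, pd.NA, 5], "Num": [1, 2, 3, 4, 5]})

mask = ~test_data["Test"].isna()           # rows where Test is present
protected_idx = test_data.loc[mask].index  # Num keeps its value at these rows
print(list(protected_idx))                 # [0, 1, 2, 4]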
+ ''' + + test_dict = { + "constraints" : { + "custom_constraints": { + "cc1" : { + "filter" : "~Test.isna()", + "targets" : { + "Num" : "make_not_null" + } + }, + } + } + } + + test_data = pd.DataFrame(data={ + "Test" : [1, 2, 3, pd.NA, 5], + "Num" : [1, 2, 3, 4, 5] + }) + + test_gen = tm.MissingDataGenerator(test_dict, test_data) + + not_null_idx = test_gen._find_not_null_idx() + + result = not_null_idx[0] + + assert_series_equal( + test_data.loc[result], + test_data.loc[[0, 1, 2, 4], "Num"]) + + def test_paired_columns_with_missing_data_identified(self): + ''' + Doc string + ''' + + test_dict = { + "columns" : { + "A" : { + "type" : "categorical", + "paired_columns" : ["B"], + "miss_probability": 0.5, + "original_values" : ORIGINAL_VALUES_PAIRED + }, + "B" : { + "type" : "categorical", + "paired_columns" : ["A"], + "miss_probability" : 0.5, + "original_values" : pd.DataFrame() + }, + "C" : { + "type" : "categorical", + "paired_columns" : ["D"], + "miss_probability" : 0.6, + "original_values" : pd.DataFrame() + }, + "D" : { + "type" : "categorical", + "paired_columns" : ["C"], + "miss_probability" : 0.7, + "original_values" : ORIGINAL_VALUES_PAIRED + } + }, + "constraints" : { + "custom_constraints" : {}, + + }, + "linked_columns" : [] + } + + expected = [ + {"A", "B"}, + ] + + test_gen = tm.MissingDataGenerator(test_dict, Mock()) + result = test_gen._find_columns_with_linked_missing_data() + + self.assertCountEqual(expected, result) + + def test_linked_columns_with_missing_data_identified(self): + ''' + Doc string + ''' + + test_dict = { + "columns" : { + "A" : { + "type" : "categorical", + "paired_columns" : [], + "miss_probability": 0.5, + "original_values" : pd.DataFrame() + }, + "B" : { + "type" : "categorical", + "paired_columns" : [], + "miss_probability" : 0.5, + "original_values" : pd.DataFrame() + }, + "C" : { + "type" : "categorical", + "paired_columns" : [], + "miss_probability" : 0.6, + "original_values" : pd.DataFrame() + }, + "D" : { + "type" : "categorical", + "paired_columns" : [], + "miss_probability" : 0.5, + "original_values" : pd.DataFrame() + } + }, + "constraints" : { + "custom_constraints" : {}, + }, + "linked_columns" : [ + (1, ["A", "B"]), + (2, ["C", "D"]) + ] + } + + expected = [ + {"A", "B"}, + ] + + test_gen = tm.MissingDataGenerator(test_dict, Mock()) + result = test_gen._find_columns_with_linked_missing_data() + + self.assertCountEqual(expected, result) + + def test_linked_and_paired_columns_with_missing_data_identified(self): + ''' + Doc string + ''' + + test_dict = { + "columns" : { + "A" : { + "type" : "categorical", + "paired_columns" : ["B"], + "miss_probability": 0.5, + "original_values" : pd.DataFrame() + }, + "B" : { + "type" : "categorical", + "paired_columns" : ["A"], + "miss_probability" : 0.5, + "original_values" : ORIGINAL_VALUES_PAIRED + }, + "C" : { + "type" : "categorical", + "paired_columns" : [], + "miss_probability" : 0.5, + "original_values" : pd.DataFrame() + } + }, + "constraints" : { + "custom_constraints" : {}, + }, + "linked_columns" : [ + (0, ["A", "C"]), + ] + } + + expected = [ + {"A", "B", "C"}, + ] + + test_gen = tm.MissingDataGenerator(test_dict, Mock()) + result = test_gen._find_columns_with_linked_missing_data() + + self.assertTrue(expected[0], result[0]) + + def test_make_null_constraint_in_columns(self): + ''' + When we're adding nulls to categorical columns, the non-null + numerical values must be re-calulcated and re-scaled because + Missing data (null identifier in categorical columns) can have + vastly different 
weights compared to the old values. However, + we shouldn't rescaled the whole column anew, just the added values. + ''' + + Weights = namedtuple("Weights", ["weight", "equal_weight"]) + + #demo weights table + weights_df = pd.DataFrame( + data=[ + ("C", "A", "spam", Weights(0.5, 0.5)), + ("C", "A", "eggs", Weights(0.5, 0.5)), + ("C", "B", "bacon", Weights(0.5, 0.5)), + ("C", "A", MISSING_DATA_STR, Weights(0.5, 0.5)), + ("C", "B", MISSING_DATA_STR, Weights(0.5, 0.5)), + ], + columns=["num_col", "cat_col", "cat_value", "weights"]) + + #reformat into dictionary + weights = ( + weights_df + .set_index(["num_col", "cat_col", "cat_value"]) + .to_dict(orient="index") + ) + + test_dict = { + "_rng" : np.random.default_rng(seed=0), + "metadata" : { + "categorical_columns": ["A", "B"], + "numerical_columns" : ["C"] + }, + "columns": { + "A": { + "type" : "categorical", + "paired_columns" : [], + "miss_probability": 0, + "original_values" : pd.DataFrame() + + }, + "B": { + "type" : "categorical", + "paired_columns" : [], + "miss_probability": 0, + "original_values" : pd.DataFrame() + + }, + "C": { + "type" : "continuous", + "precision" : "integer", + "distribution" : "weighted_uniform", + "distribution_parameters": { + "dispersion": 0, + "target_sum" : 10, + }, + "miss_probability": 0 + }, + + }, + "constraints" : { + "custom_constraints": { + "cc1" : { + "filter" : "A == 'spam'", + "targets" : { + "B" : "make_null" + } + } + } + }, + "linked_columns" : [], + "weights_table" : weights, + "weights_table_target_cols": ["A", "B"] + } + + test_data = pd.DataFrame(data={ + "A" : ["spam", "spam", "eggs", "eggs"], + "B" : ["bacon"] * 4, + "C" : [10, 20, 4, 4], + }) + + expected = pd.DataFrame(data={ + "A" : ["spam", "spam", "eggs", "eggs"], + "B" : [np.nan, np.nan, "bacon", "bacon"], + "C" : [1, 1, 4, 4], + }) + + test_gen = tm.MissingDataGenerator(test_dict, test_data) + result = test_gen.add_missing_data() + + assert_frame_equal(result, expected, check_dtype=False) + + def test_not_null_constraint_in_columns(self): + ''' + Doc string + ''' + + test_dict = { + "_rng" : np.random.default_rng(seed=0), + "metadata" : { + "categorical_columns": ["C", "D"], + "numerical_columns" : [] + }, + "columns": { + "C": { + "type" : "categorical", + "paired_columns" : [], + "miss_probability": 0.2, + "original_values" : pd.DataFrame() + + }, + "D": { + "type" : "categorical", + "paired_columns" : [], + "miss_probability": 0.5, + "original_values" : pd.DataFrame() + + } + }, + "constraints" : { + "custom_constraints": { + "cc1" : { + "filter" : "~C.isna()", + "targets" : { + "D" : "make_not_null" + } + } + } + }, + "linked_columns" : [] + } + + test_data = pd.DataFrame(data={ + "C" : np.random.random(1000), #pylint: disable=no-member + "D" : np.random.random(1000), #pylint: disable=no-member + + }) + + test_gen = tm.MissingDataGenerator(test_dict, test_data) + result = test_gen.add_missing_data() + + self.assertTrue(result["C"].isna().any()) + self.assertTrue(result["D"].isna().any()) + self.assertFalse(result.loc[~result["C"].isna(), "D"].isna().any()) + + def test_paired_columns_are_respected_for_missing_data(self): + ''' + Doc string + ''' + + test_dict = { + "_rng" : np.random.default_rng(seed=0), + "metadata" : { + "categorical_columns": ["A", "B"], + "numerical_columns" : [] + }, + "columns": { + "A": { + "type" : "categorical", + "paired_columns" : ["B"], + "miss_probability": 0.5, + "original_values" : pd.DataFrame() + }, + "B": { + "type" : "categorical", + "paired_columns" : ["A"], + "miss_probability": 0.5, + 
"original_values" : ORIGINAL_VALUES_PAIRED + + }, + }, + "constraints" : { + "custom_constraints" : {}, + }, + "linked_columns" : [], + } + + test_data = pd.DataFrame(data={ + "A" : np.random.random(1000), #pylint: disable=no-member + "B" : np.random.random(1000), #pylint: disable=no-member + }) + + test_gen = tm.MissingDataGenerator(test_dict, test_data) + result = test_gen.add_missing_data() + + self.assertTrue(result["A"].isna().any()) + self.assertTrue(result["B"].isna().any()) + assert_series_equal(result["B"].isna(), result["A"].isna(), check_names=False) + + def test_missing_data_added_to_standalone_categorical_column(self): + ''' + Doc string + ''' + + test_dict = { + "_rng" : np.random.default_rng(seed=0), + "metadata" : { + "categorical_columns": ["A", "B"], + "numerical_columns" : [] + }, + "columns": { + "A": { + "type" : "categorical", + "paired_columns" : [], + "miss_probability": 1, + "original_values" : pd.DataFrame() + }, + "B": { + "type" : "categorical", + "paired_columns" : [], + "miss_probability": 0, + "original_values" : pd.DataFrame() + + }, + }, + "constraints" : { + "custom_constraints" : {} + }, + "linked_columns" : [], + } + + test_data = pd.DataFrame(data={ + "A" : list("ABCDE"), + "B" : list("ABCDE") + }) + + expected = pd.DataFrame(data={ + "A" : [np.nan] * 5, + "B" : list("ABCDE") + }) + + test_gen = tm.MissingDataGenerator(test_dict, test_data) + result = test_gen.add_missing_data() + + assert_frame_equal(expected, result, check_dtype=False) + + def test_continuous_column_adjusted_to_categorical_missing_data(self): + ''' + Remember that continuous columns depend on values in categorical columns + in the same row for their weights, including for Missing data values. + Adding Missing data also changes the target_sum of the continuous column + so we need to re-scale the whole column after adding missing data either + to it or to the categorical columns. + + We rely on np.random to generate reasonable number of NAs with 0.5 prob, + but that can sometimes fail so we ensure that the seed is constant. 
+ ''' + + Weights = namedtuple("Weights", ["weight", "equal_weight"]) + + #demo weights table + weights_df = pd.DataFrame( + data=[ + ("C2", "C1", "A", Weights(0.1, 0.5)), + ("C2", "C1", "B", Weights(0.9, 0.5)), + ("C2", "C1", MISSING_DATA_STR, Weights(0.2, 0.5)), + ], + columns=["num_col", "cat_col", "cat_value", "weights"]) + + #reformat into dictionary + weights = ( + weights_df + .set_index(["num_col", "cat_col", "cat_value"]) + .to_dict(orient="index") + ) + + test_dict = { + "_rng" : np.random.default_rng(seed=0), + "metadata": { + "categorical_columns": [ + "C1" + ], + "numerical_columns" : [ + "C2" + ] + }, + "columns": { + "C1": { + "type" : "categorical", + "paired_columns" : [], + "miss_probability": 0.5, + "original_values" : pd.DataFrame() + }, + "C2": { + "type" : "continuous", + "precision" : "integer", + "distribution" : "weighted_uniform", + "distribution_parameters": { + "uniform_base_value" : 100, + "dispersion": 0, + "target_sum" : 200, # factor of two + }, + "miss_probability": 0 + }, + }, + "constraints" : { + "custom_constraints" : {} + }, + "linked_columns" : [], + "weights_table" : weights, + "weights_table_target_cols": ["C1"] + } + + test_data = pd.DataFrame(data={ + "C1" : ["A", "A", "A", "B", "B"] * 20, + "C2" : [1] * 100 + }) + + + test_gen = tm.MissingDataGenerator(test_dict, test_data) + result = test_gen.add_missing_data() + + self.assertTrue(result["C1"].isna().any()) + self.assertEqual(result["C2"].sum(), 200) + + def test_user_linked_columns_having_missing_data(self): + ''' + Because user linked columns can have complex relationships, we + need to make sure missing data is handled correctly. + ''' + + test_df = pd.DataFrame(data={ + "A": ["spam", "spam", "eggs", "eggs", "spam"], + "B": ["bacon", "spamspam", np.nan, "parrot", "bacon"], + "C": range(5) + }) + + test_dict = { + "metadata" : { + "number_of_rows" : 1000 + } + } + + fromdata_test = { + "linked_columns" : ["A", "B"] + } + + _, df = temp_exhibit( + filename=test_df, fromdata_namespace=fromdata_test, + test_spec_dict=test_dict, return_spec=False) + + self.assertTrue(df.query("A == 'eggs'")["B"].isna().any()) + + def test_categorical_numerical_missing_data_with_make_null_cc(self): + ''' + Typing issues (categorical vs object) can cause bugs when we have categorical columns, + a make_null custom constraint, a filter casting categorical column to integers (which + assumes object, not categorical - because you can't cast categorical to int if there + is a Missing data categorical value - without removing unused categories first) AND + a numerical column. Commenting out the numerical column used to pass the test, and + uncommenting it used to fail it - which is wrong. + + Without extra checks, AGE.astype('int') will fail if AGE is dtype="category" because + it'll have numbers as strings (which can be cast to int) and "invisible" Missing data + which can't. 
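A small illustration (hypothetical values, not from the patch) of the typing issue described above: once the Missing data placeholder sits in a categorical column, a plain astype('int') fails, so the placeholder rows have to be filtered out, and the unused category dropped, before casting.

import pandas as pd

age = pd.Series(["1", "2", "3", "Missing data"], dtype="category")

# age.astype("int") would raise here: "Missing data" cannot be parsed as an integer
numeric_age = (age[age != "Missing data"]
               .cat.remove_unused_categories()
               .astype("int"))
print(numeric_age.tolist())  # [1, 2, 3]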
+ ''' + + test_df = pd.DataFrame(data={ + "AGE": ["1", "2", "3", "4", "4"], + "NULLED" : list("ABCAB"), + "NUMS": range(5) + }) + + test_dict = { + "metadata" : { + "number_of_rows" : 10, + "categorical_columns": ["AGE", "NULLED"], + "numerical_columns" : ["NUMS"] + }, + "constraints" : { + "custom_constraints" : { + "test_nulls" : { + "filter" : "AGE.astype('int') > 1", + "targets" : {"NULLED" : "make_null"} + } + } + } + } + + _, df = temp_exhibit(filename=test_df, test_spec_dict=test_dict, return_spec=False) + + self.assertTrue(df.NULLED.isna().any()) + +if __name__ == "__main__" and __package__ is None: + #overwrite __package__ builtin as per PEP 366 + __package__ = "exhibit" + unittest.main(warnings="ignore") diff --git a/exhibit/core/linkage/hierarchical.py b/exhibit/core/linkage/hierarchical.py index 0e66171..22e1600 100644 --- a/exhibit/core/linkage/hierarchical.py +++ b/exhibit/core/linkage/hierarchical.py @@ -637,7 +637,7 @@ def scenario_2(self): orig_df = self.spec_dict["columns"][self.base_col]["original_values"] repl = self.sql_df[self.base_col].unique()[0:self.base_col_unique_count] aliases = dict(zip(orig_df[self.base_col].values[:-1], repl)) - aliased_df = orig_df.applymap(lambda x: aliases.get(x, x)) + aliased_df = orig_df.map(lambda x: aliases.get(x, x)) self.spec_dict["columns"][self.base_col]["original_values"] = aliased_df base_col_vals = aliased_df[self.base_col].iloc[:-1].unique() @@ -699,7 +699,7 @@ def scenario_3(self): orig_df = self.spec_dict["columns"][self.base_col]["original_values"] repl = self.sql_df[self.base_col].unique()[0:self.base_col_unique_count] aliases = dict(zip(orig_df[self.base_col].values[:-1], repl)) - aliased_df = orig_df.applymap(lambda x: aliases.get(x, x)) + aliased_df = orig_df.map(lambda x: aliases.get(x, x)) self.spec_dict["columns"][self.base_col]["original_values"] = aliased_df base_col_vals = aliased_df[self.base_col].iloc[:-1].unique() diff --git a/exhibit/core/linkage/matrix.py b/exhibit/core/linkage/matrix.py index 43c662a..66107dc 100644 --- a/exhibit/core/linkage/matrix.py +++ b/exhibit/core/linkage/matrix.py @@ -1,389 +1,389 @@ -''' -Module isolating methods and classes to find, process and generate -user-defined linked columns where the relationships are coded in a -lookup + matrix. For hierarchical linkage see the hierarchical module, -''' - -# Standard library imports -import sys -import textwrap -from functools import partial -from multiprocessing import Pool - -# External imports -import numpy as np -import pandas as pd - -# Exhibit imports -from ..constants import MISSING_DATA_STR -from ..sql import create_temp_table, query_exhibit_database - -def save_predefined_linked_cols_to_db(df, id): - """ - Derive and save everything that's required to generate - user defined linked columns on demand from a future spec - - Parameters - ---------- - df : pd.DataFrame - original dataframe with just the categorical columns; - we assume that linked columns defined by the user are - categorical. Maybe need a special case for time? - id : str - taken from metadata[id] - - Returns - ------- - nothing - """ - - prefixed_df = add_prefix(df) - orig_label_to_pos_label = {} # age__0-9 : age__0, etc. - pos_labels_inc_column = [] # age__0, age__1, etc. - sep = "__" - - for col in prefixed_df.columns: - - col_vals = sorted(prefixed_df[col].unique()) - - # add Missing data by hand if not already there OR - # pop and reinsert at the end to align with the spec! 
- # make sure the values are sorted AFTER we remove the existing - # Missing data, but BEFORE we reinsert it. - col_miss_val = f"{col}{sep}{MISSING_DATA_STR}" - - # don't forget that we need to test equality element-wise, hence conversion - # to an array from; lists don't compare in the same way. - if col_miss_val in col_vals: - col_vals = sorted(np.delete(col_vals, np.array(col_vals) == col_miss_val)) - - col_vals = np.append(col_vals, col_miss_val) - - pos_labels_temp = [ - f"{col}{sep}{x}" for x in range(len(col_vals)) - ] - - pos_labels_inc_column.extend(pos_labels_temp) - - orig_label_to_pos_label.update( - {k:v for v, k in zip(pos_labels_temp, col_vals)} - ) - - # age__0 : 0, etc. - pos_label_to_id = dict( - zip(pos_labels_inc_column, range(len(pos_labels_inc_column))) - ) - - # convert the original, prefixed values first to positional labels - # and then just to numerical IDs - temp_df = (prefixed_df - .applymap(lambda x: orig_label_to_pos_label.get(x, x)) - .applymap(lambda x: pos_label_to_id.get(x, x))) - - label_matrix = np.unique(temp_df.values, axis=0).astype(np.intc) - - # make sure column names don't have spaces - col_names = [x.replace(" ", "$") for x in prefixed_df.columns] - - # save the label matrix to SQLite db - create_temp_table( - table_name=f"temp_{id}_matrix", - col_names=col_names, - data=label_matrix, - ) - - # save the lookup to SQLite db; note that numerical_ids are - # upcast to strings by numpy when creating the array! - create_temp_table( - table_name=f"temp_{id}_lookup", - col_names=["pos_label", "num_label"], - data=list(pos_label_to_id.items()), - ) - -def add_prefix(df, sep="__"): - """ - Add column name as prefix to the column values - - Parameters - ---------- - df : pd.DataFrame - df must have purely categorical columns - no checks are made - sep : str, optional - separator must be consistent between add_prefix and remove_prefix - by default "__" - - Returns - ------- - new DataFrame where values are prefixed with column name - """ - - data_dict = {} - - for col in df.columns: - # cast to str in case we're dealing with integer-based categorical columns, like age - df_col_str = df[col].fillna(MISSING_DATA_STR).astype(str) - data_dict[col] = np.add(f"{col}{sep}", df_col_str.values) - - return pd.DataFrame(data_dict) - -def generate_user_linked_anon_df( - spec_dict, linked_cols, num_rows, starting_col_matrix=None): - ''' - Main function to generated user-defined linked columns. 
- - Parameters - ---------- - spec_dict : dictionary - specification plus internal keys, like _rng - linked_cols : list - there can be only one user-linked group (0, [linked_col_1, linked_col_2, ]) - num_rows : int - number of rows to generate - starting_col_matrix : np.Array shaped (num_rows, len(linked_cols)) - the matrix is either filled with None values or pre-populated if the function - is run multiple times (like when regenerating values after applying custom - actions like make_same) - - Returns - ------- - Data Frame with linked columns - ''' - - table_id = spec_dict["metadata"]["id"] - rng = spec_dict["_rng"] - lookup, matrix = get_lookup_and_matrix_from_db(table_id) - new_label_lookup, proba_lookup = build_new_lookups(spec_dict, linked_cols, lookup) - # DANGER WHEN REVERSING THE DICT - SAME VALUES IN MULTIPLE COLUMNS WILL BE LOST - rev_label_lookup = {key:value for value, key in new_label_lookup.items()} - # linked columns dispersion list - lcd = [spec_dict["columns"][col]["dispersion"] for col in linked_cols] - - # if re-creating linked values from a pre-generated sequence, reverse the dict to - # get the numerical mapping as expected, also changing the dtype for performance. - - if starting_col_matrix is not None: - starting_col_matrix = ( - pd.DataFrame(starting_col_matrix) - .fillna(MISSING_DATA_STR) - .applymap(lambda x: rev_label_lookup.get(x, x)).values.astype(np.int16) - ) - - else: - starting_col_matrix = np.full( - shape=(num_rows, len(linked_cols)), fill_value=-1) - - # multiprocessing only on unix - if sys.platform != "win32": - with Pool(processes=4) as pool: - - new_rows = pool.map( - partial(process_row, matrix, proba_lookup, lcd, rng), - starting_col_matrix - ) - else: #pragma: no cover - - new_rows = [] - - for i in range(num_rows): - new_row = process_row( - matrix, proba_lookup, lcd, rng, starting_col_matrix[i]) - new_rows.append(new_row) - - new_matrix = np.stack(new_rows) - - new_df = pd.DataFrame( - new_matrix, columns=linked_cols).applymap(lambda x: new_label_lookup.get(x, x)) - - return new_df - -def get_lookup_and_matrix_from_db(table_id): - ''' - The names of the two tables required for user defined linkage don't change: - one is lookup and another is matrix. - ''' - - lookup = dict(query_exhibit_database(f"temp_{table_id}_lookup").values) - matrix = query_exhibit_database(f"temp_{table_id}_matrix").values - - return lookup, matrix - -def process_row( - label_matrix, proba_lookup, lcd, rng, ref_array, acc_array=None, i=0): - ''' - Recursive function to generate new rows of data from the - existing linked matrix. It's possible the function will be - called multiple times to generate a column value if there - are no valid values that follow on from earlier values in the sequence. - - For example, if A => A1 => A11 and B => B2 => B12 then if the second - column has dispersion set to > 0, the row generation might go like this: - A => B2 (due to dispersion) => B12 (falling back to a valid 2-member sequence - rather than generating a random value because there isn't a A => B2 predefined - in the linkage matrix taken from the original data). - - Parameters - ---------- - label_matrix : np.array - array where shape[0] is the number_unique_combinations_of_all_linked_col_values - and shape[1] is the number of linked columns - proba_lookup : dictionary - dictionary where keys are encoded original values (0, 1, 2, etc.) 
and values - are their probabilities taken either from the specification or equalised from db - lcd : list - list with dispersion values for each column in linked_columns - rng : np.rng - shared RNG generator - ref_array : np.Array - array of either None values or pre-populated with existing df values - acc_array : np.Array - accummulated array that is being processed and returned - i : integer - a counter in case we need to reduce the sequence size to check for valid - combinations to determine the next valid value - - Returns - ------- - np.array of a single row with encoded column values - ''' - - if acc_array is None: - acc_array = np.array([]) - - arr_len = len(acc_array) - ref_arr_len = len(ref_array) - - if arr_len == label_matrix.shape[1]: - return acc_array - - # if there are no valid targets due to dispersion throwing in a non-valid target, - # rather than continue checking the full array (which will always fail to produce - # a valid next value), change the first position of the array being checked from 0 - # to counter i and increase until you exhaust the prior possibilities. The fallback - # is that there will always be valid targets for previous sequence length = 1 aka - # from one column to the next. - - _ref_array = np.where(ref_array == -1, label_matrix, ref_array) - mask = np.all(label_matrix[:, i:ref_arr_len] == _ref_array[:, i:], axis=1) - - valid_targets = np.unique(label_matrix[mask, arr_len]) - - if len(valid_targets) == 0: - - i = i + 1 - return process_row( - label_matrix, proba_lookup, lcd, rng, ref_array, acc_array, i) - - target_proba = np.array([proba_lookup[x] for x in valid_targets]) - - # typically, there will be more than 1 value in target_proba, but we have to guard against - # possibility of there being just one value, and if its probability is zero (Missing data) - # then summing it to 1 will result in NA (division by zero). As a workaround, set proba to - # 1 whenever it's the only possible value - since having it less than 1 doesn't make sense. - if len(target_proba) == 1: - target_proba = np.array([1]) - - # make sure the probabilities sum up to 1 - target_proba = target_proba * (1 / sum(target_proba)) - - # take dispersion from the spec - dispersion = lcd[arr_len] - - # default is to pick a random valid target - next_val = rng.choice(a=valid_targets, p=target_proba) - - # except when it's already pre-generated - if ref_array[arr_len] != -1: - next_val = ref_array[arr_len] - - # or dispersion is in effect; this part is expensive so only calculate if needed - elif dispersion and rng.random() < dispersion: - all_targets = np.unique(label_matrix[:, arr_len]) - non_valid_targets = np.setdiff1d(all_targets, valid_targets) - if len(non_valid_targets) > 0: - next_val = rng.choice(a=non_valid_targets) - - new_array = np.append(acc_array, next_val) - - # update the ref_array to capture the just generated value - if ref_array[arr_len] == -1: - ref_array[arr_len] = next_val - - return process_row(label_matrix, proba_lookup, lcd, rng, ref_array, new_array) - -def build_new_lookups(spec_dict, linked_cols, original_lookup): - ''' - Build two lookups: - - from the numerical id to its aliased value. {0: 'hb_code__S08000015', ...} - - from the numerical id to the probability value {0: 0.5} - - Be mindful of all the intermediate steps. The intermediate lookup is created - with the numerical ID to a tuple and then split into two. 
- - original_lookup is a positional to numerical_id, like so: - {'hb_code__0': 0} which is to say that the zero-th value in the list of - all hb_code values is aliased to the numerical id zero. - - Special case if original values are not stored in the spec, but instead have - been put into the DB - ''' - - pos_labels_inc_column = [] # age__0, age__1, etc. - pos_label_to_orig_tuple = {} # age__0: (age__0-9, 0.5), etc. - - for col in linked_cols: - - orig_vals = spec_dict["columns"][col]["original_values"] - prob_vector = None - - if not isinstance(orig_vals, pd.DataFrame): - - safe_col = col.replace(" ", "$") - table_id = spec_dict["metadata"]["id"] - orig_vals_db = query_exhibit_database(table_name=f"temp_{table_id}_{safe_col}") - orig_vals_sorted = ( - sorted([x for x in orig_vals_db[col] if x != MISSING_DATA_STR]) + - [MISSING_DATA_STR] - ) - - orig_vals = pd.DataFrame(data={col:orig_vals_sorted}) - - if "probability_vector" not in orig_vals_db.columns: - prob_vector = np.ones(orig_vals.shape[0]) - prob_vector[-1] = spec_dict["columns"][col]["miss_probability"] - else: - prob_vector = orig_vals_db["probability_vector"].astype(float).values - prob_vector = np.append( - prob_vector, spec_dict["columns"][col]["miss_probability"]) - - prob_vector /= prob_vector.sum() - - if prob_vector is None: - prob_vector = orig_vals["probability_vector"].values - - pos_labels_temp = [f"{col}__{x}" for x in range(len(orig_vals[col].values))] - pos_labels_inc_column.extend(pos_labels_temp) - pos_label_to_orig_tuple.update( - dict(zip( - pos_labels_temp, tuple(zip(orig_vals[col].values, prob_vector)) - )) - ) - - # 0: age__0, etc. using the ORIGINAL lookup which has all the relationships - id_to_pos_label = {v:k for k, v in original_lookup.items()} - - # if we don't check for the user removed values here, the next line - # will error out with an obscure Key not found message. - if len(original_lookup) != len(pos_label_to_orig_tuple): - raise ValueError(textwrap.dedent(""" - The number of values in user linked columns doesn't match original data. - If you would like to remove values, set their probability to zero. - """)) - - # 0: 'hb_code__aliased_code' - rev_labels = {k: pos_label_to_orig_tuple[v] for k, v in id_to_pos_label.items()} - - # finally, split the tuple dictionary into two separate ones: - label_lookup = {k:v[0] for k, v in rev_labels.items()} - proba_lookup = {k:v[1] for k, v in rev_labels.items()} - - return label_lookup, proba_lookup +''' +Module isolating methods and classes to find, process and generate +user-defined linked columns where the relationships are coded in a +lookup + matrix. For hierarchical linkage see the hierarchical module, +''' + +# Standard library imports +import sys +import textwrap +from functools import partial +from multiprocessing import Pool + +# External imports +import numpy as np +import pandas as pd + +# Exhibit imports +from ..constants import MISSING_DATA_STR +from ..sql import create_temp_table, query_exhibit_database + +def save_predefined_linked_cols_to_db(df, id): + """ + Derive and save everything that's required to generate + user defined linked columns on demand from a future spec + + Parameters + ---------- + df : pd.DataFrame + original dataframe with just the categorical columns; + we assume that linked columns defined by the user are + categorical. Maybe need a special case for time? + id : str + taken from metadata[id] + + Returns + ------- + nothing + """ + + prefixed_df = add_prefix(df) + orig_label_to_pos_label = {} # age__0-9 : age__0, etc. 
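    # (illustrative note, not from the original patch) the two lookups built
    # below chain each prefixed original value to an integer id in two hops,
    # e.g. "age__0-9" -> "age__0" -> 0 and "age__10-19" -> "age__1" -> 1; the
    # integer ids are what end up in the SQLite matrix and lookup tables.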
+ pos_labels_inc_column = [] # age__0, age__1, etc. + sep = "__" + + for col in prefixed_df.columns: + + col_vals = sorted(prefixed_df[col].unique()) + + # add Missing data by hand if not already there OR + # pop and reinsert at the end to align with the spec! + # make sure the values are sorted AFTER we remove the existing + # Missing data, but BEFORE we reinsert it. + col_miss_val = f"{col}{sep}{MISSING_DATA_STR}" + + # don't forget that we need to test equality element-wise, hence conversion + # to an array from; lists don't compare in the same way. + if col_miss_val in col_vals: + col_vals = sorted(np.delete(col_vals, np.array(col_vals) == col_miss_val)) + + col_vals = np.append(col_vals, col_miss_val) + + pos_labels_temp = [ + f"{col}{sep}{x}" for x in range(len(col_vals)) + ] + + pos_labels_inc_column.extend(pos_labels_temp) + + orig_label_to_pos_label.update( + {k:v for v, k in zip(pos_labels_temp, col_vals)} + ) + + # age__0 : 0, etc. + pos_label_to_id = dict( + zip(pos_labels_inc_column, range(len(pos_labels_inc_column))) + ) + + # convert the original, prefixed values first to positional labels + # and then just to numerical IDs + temp_df = (prefixed_df + .map(lambda x: orig_label_to_pos_label.get(x, x)) + .map(lambda x: pos_label_to_id.get(x, x))) + + label_matrix = np.unique(temp_df.values, axis=0).astype(np.intc) + + # make sure column names don't have spaces + col_names = [x.replace(" ", "$") for x in prefixed_df.columns] + + # save the label matrix to SQLite db + create_temp_table( + table_name=f"temp_{id}_matrix", + col_names=col_names, + data=label_matrix, + ) + + # save the lookup to SQLite db; note that numerical_ids are + # upcast to strings by numpy when creating the array! + create_temp_table( + table_name=f"temp_{id}_lookup", + col_names=["pos_label", "num_label"], + data=list(pos_label_to_id.items()), + ) + +def add_prefix(df, sep="__"): + """ + Add column name as prefix to the column values + + Parameters + ---------- + df : pd.DataFrame + df must have purely categorical columns - no checks are made + sep : str, optional + separator must be consistent between add_prefix and remove_prefix + by default "__" + + Returns + ------- + new DataFrame where values are prefixed with column name + """ + + data_dict = {} + + for col in df.columns: + # cast to str in case we're dealing with integer-based categorical columns, like age + df_col_str = df[col].fillna(MISSING_DATA_STR).astype(str) + data_dict[col] = np.add(f"{col}{sep}", df_col_str.values) + + return pd.DataFrame(data_dict) + +def generate_user_linked_anon_df( + spec_dict, linked_cols, num_rows, starting_col_matrix=None): + ''' + Main function to generated user-defined linked columns. 
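
For reference, a minimal standalone sketch of the applymap-to-map rename that most
of the edits in this module track (made-up values, assuming pandas >= 2.1, where
DataFrame.applymap was renamed to DataFrame.map):

    import pandas as pd

    frame = pd.DataFrame({"age": ["0-9", "10-19"], "sex": ["F", "M"]})

    # element-wise transform; applymap still works in pandas 2.x but raises
    # a FutureWarning, so call sites are switched to DataFrame.map
    encoded = frame.map(lambda value: f"val__{value}")
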
+ + Parameters + ---------- + spec_dict : dictionary + specification plus internal keys, like _rng + linked_cols : list + there can be only one user-linked group (0, [linked_col_1, linked_col_2, ]) + num_rows : int + number of rows to generate + starting_col_matrix : np.Array shaped (num_rows, len(linked_cols)) + the matrix is either filled with None values or pre-populated if the function + is run multiple times (like when regenerating values after applying custom + actions like make_same) + + Returns + ------- + Data Frame with linked columns + ''' + + table_id = spec_dict["metadata"]["id"] + rng = spec_dict["_rng"] + lookup, matrix = get_lookup_and_matrix_from_db(table_id) + new_label_lookup, proba_lookup = build_new_lookups(spec_dict, linked_cols, lookup) + # DANGER WHEN REVERSING THE DICT - SAME VALUES IN MULTIPLE COLUMNS WILL BE LOST + rev_label_lookup = {key:value for value, key in new_label_lookup.items()} + # linked columns dispersion list + lcd = [spec_dict["columns"][col]["dispersion"] for col in linked_cols] + + # if re-creating linked values from a pre-generated sequence, reverse the dict to + # get the numerical mapping as expected, also changing the dtype for performance. + + if starting_col_matrix is not None: + starting_col_matrix = ( + pd.DataFrame(starting_col_matrix).infer_objects(copy=False) + .fillna(MISSING_DATA_STR) + .map(lambda x: rev_label_lookup.get(x, x)).values.astype(np.int16) + ) + + else: + starting_col_matrix = np.full( + shape=(num_rows, len(linked_cols)), fill_value=-1) + + # multiprocessing only on unix + if sys.platform != "win32": + with Pool(processes=4) as pool: + + new_rows = pool.map( + partial(process_row, matrix, proba_lookup, lcd, rng), + starting_col_matrix + ) + else: #pragma: no cover + + new_rows = [] + + for i in range(num_rows): + new_row = process_row( + matrix, proba_lookup, lcd, rng, starting_col_matrix[i]) + new_rows.append(new_row) + + new_matrix = np.stack(new_rows) + + new_df = pd.DataFrame( + new_matrix, columns=linked_cols).map(lambda x: new_label_lookup.get(x, x)) + + return new_df + +def get_lookup_and_matrix_from_db(table_id): + ''' + The names of the two tables required for user defined linkage don't change: + one is lookup and another is matrix. + ''' + + lookup = dict(query_exhibit_database(f"temp_{table_id}_lookup").values) + matrix = query_exhibit_database(f"temp_{table_id}_matrix").values + + return lookup, matrix + +def process_row( + label_matrix, proba_lookup, lcd, rng, ref_array, acc_array=None, i=0): + ''' + Recursive function to generate new rows of data from the + existing linked matrix. It's possible the function will be + called multiple times to generate a column value if there + are no valid values that follow on from earlier values in the sequence. + + For example, if A => A1 => A11 and B => B2 => B12 then if the second + column has dispersion set to > 0, the row generation might go like this: + A => B2 (due to dispersion) => B12 (falling back to a valid 2-member sequence + rather than generating a random value because there isn't a A => B2 predefined + in the linkage matrix taken from the original data). + + Parameters + ---------- + label_matrix : np.array + array where shape[0] is the number_unique_combinations_of_all_linked_col_values + and shape[1] is the number of linked columns + proba_lookup : dictionary + dictionary where keys are encoded original values (0, 1, 2, etc.) 
and values + are their probabilities taken either from the specification or equalised from db + lcd : list + list with dispersion values for each column in linked_columns + rng : np.rng + shared RNG generator + ref_array : np.Array + array of either None values or pre-populated with existing df values + acc_array : np.Array + accummulated array that is being processed and returned + i : integer + a counter in case we need to reduce the sequence size to check for valid + combinations to determine the next valid value + + Returns + ------- + np.array of a single row with encoded column values + ''' + + if acc_array is None: + acc_array = np.array([]) + + arr_len = len(acc_array) + ref_arr_len = len(ref_array) + + if arr_len == label_matrix.shape[1]: + return acc_array + + # if there are no valid targets due to dispersion throwing in a non-valid target, + # rather than continue checking the full array (which will always fail to produce + # a valid next value), change the first position of the array being checked from 0 + # to counter i and increase until you exhaust the prior possibilities. The fallback + # is that there will always be valid targets for previous sequence length = 1 aka + # from one column to the next. + + _ref_array = np.where(ref_array == -1, label_matrix, ref_array) + mask = np.all(label_matrix[:, i:ref_arr_len] == _ref_array[:, i:], axis=1) + + valid_targets = np.unique(label_matrix[mask, arr_len]) + + if len(valid_targets) == 0: + + i = i + 1 + return process_row( + label_matrix, proba_lookup, lcd, rng, ref_array, acc_array, i) + + target_proba = np.array([proba_lookup[x] for x in valid_targets]) + + # typically, there will be more than 1 value in target_proba, but we have to guard against + # possibility of there being just one value, and if its probability is zero (Missing data) + # then summing it to 1 will result in NA (division by zero). As a workaround, set proba to + # 1 whenever it's the only possible value - since having it less than 1 doesn't make sense. + if len(target_proba) == 1: + target_proba = np.array([1]) + + # make sure the probabilities sum up to 1 + target_proba = target_proba * (1 / sum(target_proba)) + + # take dispersion from the spec + dispersion = lcd[arr_len] + + # default is to pick a random valid target + next_val = rng.choice(a=valid_targets, p=target_proba) + + # except when it's already pre-generated + if ref_array[arr_len] != -1: + next_val = ref_array[arr_len] + + # or dispersion is in effect; this part is expensive so only calculate if needed + elif dispersion and rng.random() < dispersion: + all_targets = np.unique(label_matrix[:, arr_len]) + non_valid_targets = np.setdiff1d(all_targets, valid_targets) + if len(non_valid_targets) > 0: + next_val = rng.choice(a=non_valid_targets) + + new_array = np.append(acc_array, next_val) + + # update the ref_array to capture the just generated value + if ref_array[arr_len] == -1: + ref_array[arr_len] = next_val + + return process_row(label_matrix, proba_lookup, lcd, rng, ref_array, new_array) + +def build_new_lookups(spec_dict, linked_cols, original_lookup): + ''' + Build two lookups: + - from the numerical id to its aliased value. {0: 'hb_code__S08000015', ...} + - from the numerical id to the probability value {0: 0.5} + + Be mindful of all the intermediate steps. The intermediate lookup is created + with the numerical ID to a tuple and then split into two. 
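
For reference, a rough standalone sketch (made-up values) of the
infer_objects(copy=False).fillna(...) chain applied to starting_col_matrix above;
the explicit infer_objects call mirrors the pandas 2.2 guidance around the
deprecation of silently downcasting object columns in fillna:

    import pandas as pd

    # an object-dtype frame of pre-generated values with gaps, roughly how a
    # starting column matrix might look before re-encoding (values are made up)
    raw = pd.DataFrame([["A", None], [None, "B1"]])

    # give object columns a concrete dtype where possible before filling the
    # gaps with the missing-data placeholder
    filled = raw.infer_objects(copy=False).fillna("Missing data")
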
+ + original_lookup is a positional to numerical_id, like so: + {'hb_code__0': 0} which is to say that the zero-th value in the list of + all hb_code values is aliased to the numerical id zero. + + Special case if original values are not stored in the spec, but instead have + been put into the DB + ''' + + pos_labels_inc_column = [] # age__0, age__1, etc. + pos_label_to_orig_tuple = {} # age__0: (age__0-9, 0.5), etc. + + for col in linked_cols: + + orig_vals = spec_dict["columns"][col]["original_values"] + prob_vector = None + + if not isinstance(orig_vals, pd.DataFrame): + + safe_col = col.replace(" ", "$") + table_id = spec_dict["metadata"]["id"] + orig_vals_db = query_exhibit_database(table_name=f"temp_{table_id}_{safe_col}") + orig_vals_sorted = ( + sorted([x for x in orig_vals_db[col] if x != MISSING_DATA_STR]) + + [MISSING_DATA_STR] + ) + + orig_vals = pd.DataFrame(data={col:orig_vals_sorted}) + + if "probability_vector" not in orig_vals_db.columns: + prob_vector = np.ones(orig_vals.shape[0]) + prob_vector[-1] = spec_dict["columns"][col]["miss_probability"] + else: + prob_vector = orig_vals_db["probability_vector"].astype(float).values + prob_vector = np.append( + prob_vector, spec_dict["columns"][col]["miss_probability"]) + + prob_vector /= prob_vector.sum() + + if prob_vector is None: + prob_vector = orig_vals["probability_vector"].values + + pos_labels_temp = [f"{col}__{x}" for x in range(len(orig_vals[col].values))] + pos_labels_inc_column.extend(pos_labels_temp) + pos_label_to_orig_tuple.update( + dict(zip( + pos_labels_temp, tuple(zip(orig_vals[col].values, prob_vector)) + )) + ) + + # 0: age__0, etc. using the ORIGINAL lookup which has all the relationships + id_to_pos_label = {v:k for k, v in original_lookup.items()} + + # if we don't check for the user removed values here, the next line + # will error out with an obscure Key not found message. + if len(original_lookup) != len(pos_label_to_orig_tuple): + raise ValueError(textwrap.dedent(""" + The number of values in user linked columns doesn't match original data. + If you would like to remove values, set their probability to zero. + """)) + + # 0: 'hb_code__aliased_code' + rev_labels = {k: pos_label_to_orig_tuple[v] for k, v in id_to_pos_label.items()} + + # finally, split the tuple dictionary into two separate ones: + label_lookup = {k:v[0] for k, v in rev_labels.items()} + proba_lookup = {k:v[1] for k, v in rev_labels.items()} + + return label_lookup, proba_lookup diff --git a/exhibit/core/linkage/tests/test_linkage_hierarchical.py b/exhibit/core/linkage/tests/test_linkage_hierarchical.py index 0cb3f8e..23a407f 100644 --- a/exhibit/core/linkage/tests/test_linkage_hierarchical.py +++ b/exhibit/core/linkage/tests/test_linkage_hierarchical.py @@ -506,7 +506,7 @@ def test_scenario_2_random_4_cols(self): #first test that high-level column (A) is correctly split ~20-80 self.assertAlmostEqual( 0.2/0.8, - result.groupby("A").size().agg(lambda x: x[0]/x[1]), + result.groupby("A").size().agg(lambda x: x.iloc[0]/x.iloc[1]), delta=0.1 ) @@ -648,7 +648,7 @@ def test_scenario_3_random(self): #between A0 and A1 + A2 (derived from children's probabilieis). self.assertAlmostEqual( 0.2/0.8, - result.groupby("A").size().agg(lambda x: x[0] / (x[1] + x[2])), + result.groupby("A").size().agg(lambda x: x.iloc[0] / (x.iloc[1] + x.iloc[2])), delta=0.1 ) @@ -709,7 +709,7 @@ def test_scenario_3_aliased(self): #between A0 and A1 + A2 (derived from children's probabilieis). 
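
For reference, a minimal standalone sketch (made-up data) of the pandas behaviour
behind the x[0] -> x.iloc[0] changes in these hunks; since pandas 2.1, passing an
integer key to Series[...] on a non-integer index is deprecated as a positional
lookup, so positional access goes through .iloc:

    import pandas as pd

    sizes = pd.DataFrame({"A": list("aab"), "val": [1, 2, 3]}).groupby("A").size()

    # sizes is indexed by the group labels ["a", "b"], so sizes[0] would rely on
    # the deprecated positional fallback; .iloc makes the intent explicit
    ratio = sizes.iloc[0] / sizes.iloc[1]
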
self.assertAlmostEqual( 0.2/0.8, - result.groupby("A").size().agg(lambda x: x[0] / (x[1] + x[2])), + result.groupby("A").size().agg(lambda x: x.iloc[0] / (x.iloc[1] + x.iloc[2])), delta=0.1 ) diff --git a/exhibit/core/tests/test_reference.py b/exhibit/core/tests/test_reference.py index 8071d0b..ac886e5 100644 --- a/exhibit/core/tests/test_reference.py +++ b/exhibit/core/tests/test_reference.py @@ -1,796 +1,796 @@ -''' -Reference tests for the Exhibit package -''' - -# Standard library imports -import unittest -from pathlib import Path -import tempfile -from os.path import join -from collections import namedtuple - -# External imports -from pandas.testing import assert_frame_equal -import pandas as pd -import numpy as np - -# Exhibit imports -from exhibit.core.utils import package_dir -from exhibit.db import db_util -from exhibit.core.constants import MISSING_DATA_STR -from exhibit.sample.sample import inpatients_anon, uuid_anon - -# Module under test -from exhibit.core import exhibit as tm - -def replace_nested_dict_values(d1, d2): - ''' - Recursive replacement of dictionary values in matching keys - or adding new ones. - ''' - - for key2 in d2: - if key2 in d1: - if isinstance(d1[key2], dict): - replace_nested_dict_values(d1[key2], d2[key2]) - else: - d1[key2] = d2[key2] - else: - d1[key2] = d2[key2] - -def temp_exhibit( - filename="inpatients.csv", - fromdata_namespace=None, - fromspec_namespace=None, - test_spec_dict=None, - return_spec=True, - return_df=True, - ): - ''' - A helper method to generate and read custom specifications - - Parameters - ---------- - filename : str or pd.DataFrame - the .csv to use as the base for spec / df generation - fromdata_namespace : dict - dictionary with testing values for creating a spec - fromspec_namespace : dict - dictionary with testing values for running generation command - test_spec_dict : dict - dictionary with testing values for user spec: used to update the spec - generated from filename csv or DataFrame - return_spec : boolean - sometimes you only want to generate the csv and don't need the spec, - like in performance benchmark testing - return_df : boolean - sometimes you only want to generate a spec; if return_df is False - then the second element in the return tuple is None - - Returns - ------- - A named tuples with spec dict and the generated dataframe - ''' - - returnTuple = namedtuple("TestRun", ["temp_spec", "temp_df"]) - temp_spec = None - temp_df = None - - if isinstance(filename, dict) or \ - (isinstance(filename, str) and filename[-3:] == "yml"): - source = "yml" - else: - source = "csv" - - # function has five paths: - # 1) given .csv filename (or DataFrame) produce just a spec - # 2) given .csv filename (or DataFrame) produce a spec and a demo .csv - # 3) given a .yml filename (or dict) produce a demo .csv - # 4) given a .yml filename (or dict) produce a spec - # 5) given a .yml filename (or dict) produce a spec and a demo .csv - - # it's important to only generate appropriate parts because they are measured - # separetely in performance benchmarking tests. 
- - # if source is data, we always produce a spec (can't have demo data without it) - if source == "csv": - - with tempfile.TemporaryDirectory() as td: - - temp_spec_name = "_.yml" - f_name = join(td, temp_spec_name) - - # for internal use when testing with a custom dataframe, not a static file - if isinstance(filename, pd.DataFrame): - default_data_path = filename - else: - default_data_path = Path(package_dir("sample", "_data", filename)) - - fromdata_defaults = { - "command" : "fromdata", - "source" : default_data_path, - "inline_limit" : 30, - "verbose" : True, - "output" : f_name, - "skip_columns" : [], - "equal_weights" : False, - "linked_columns" : None - } - - #Update namespaces - if fromdata_namespace: - fromdata_defaults.update(fromdata_namespace) - - xA = tm.Exhibit(**fromdata_defaults) - xA.read_data() - xA.generate_spec() - - if return_df: - - xA.write_spec() - fromspec_defaults = { - "command" : "fromspec", - "source" : Path(f_name), - "verbose" : True, - } - - if fromspec_namespace: - fromspec_defaults.update(fromspec_namespace) - - xA = tm.Exhibit(**fromspec_defaults) - xA.read_spec() - - if test_spec_dict: - replace_nested_dict_values(xA.spec_dict, test_spec_dict) - - if xA.validate_spec(): - xA.execute_spec() - - temp_df = xA.anon_df - - if return_spec: - temp_spec=xA.spec_dict - - if source == "yml": - - # for internal use when testing with a custom spec_dict, not a static file - if isinstance(filename, dict): - default_spec_path = filename - else: - default_spec_path = Path(package_dir("sample", "_spec", filename)) - - fromspec_defaults = { - "command" : "fromspec", - "source" : default_spec_path, - "verbose" : True, - } - - if fromspec_namespace: - fromspec_defaults.update(fromspec_namespace) - - xA = tm.Exhibit(**fromspec_defaults) - xA.read_spec() - - if test_spec_dict: - replace_nested_dict_values(xA.spec_dict, test_spec_dict) - - if return_spec: - temp_spec = xA.spec_dict - - if return_df: - - if xA.validate_spec(): - xA.execute_spec() - - temp_df = xA.anon_df - - return returnTuple(temp_spec, temp_df) - -class referenceTests(unittest.TestCase): - ''' - Main test suite; command line arguments are mocked - via patch context manager; internal intermediate functions - are mocked inside each test. 
- ''' - - @classmethod - def setUpClass(cls): - ''' - Create a list of tables to drop after reference tests finish - ''' - - cls._temp_tables = [] - - @classmethod - def tearDownClass(cls): - ''' - Clean up local exhibit.db from temp tables - ''' - - db_util.drop_tables(cls._temp_tables) - - def test_reference_prescribing_non_linked_anon_data(self): - ''' - What this reference test is covering: - - paired 1:1 anonymisation set (birds) - - designating paired columns as complete columns - - unlinking of columns - ''' - - expected_df = pd.read_csv( - package_dir( - "core", "tests", "_reference_data", - "prescribing_anon_non_linked.csv"), - parse_dates=["PaidDateMonth"] - ) - - test_dict = { - "metadata":{"number_of_rows":1500}, - "columns":{ - "HB2014":{ - "cross_join_all_unique_values": True - }, - "HB2014Name":{ - "cross_join_all_unique_values": True - }, - "BNFItemCode":{"anonymising_set":"birds"}, - "BNFItemDescription":{"anonymising_set":"birds"}, - "GPPracticeName":{"anonymising_set":"random"} - }, - "linked_columns":[] - } - - temp_spec, temp_df = temp_exhibit( - filename="prescribing.csv", - test_spec_dict=test_dict - ) - - #save ID to tidy up temp columns created as part of testing - self._temp_tables.append(temp_spec["metadata"]["id"]) - - #sort column names to make sure they are the same - temp_df.sort_index(axis=1, inplace=True) - expected_df.sort_index(axis=1, inplace=True) - - assert_frame_equal( - left=expected_df, - right=temp_df, - check_exact=False, - check_dtype=False, - check_categorical=False - ) - - def test_reference_prescribing_linked_mnt_anon_data(self): - ''' - What this reference test is covering: - - one of the linked columns is in the spec, another is in DB - - anonymisation is done using "mountains" set - - NumberOfPaidItems is generated from a shifted normal distribution - - Note that prescribing dataset has duplicate categorical rows - ''' - - expected_df = pd.read_csv( - package_dir( - "core", "tests", "_reference_data", - "prescribing_anon_mnt_linked.csv"), - parse_dates=["PaidDateMonth"] - ) - - test_dict = { - "columns":{ - "HB2014":{"anonymising_set":"mountains"}, - "HB2014Name":{"anonymising_set":"mountains"}, - "GPPracticeName":{"anonymising_set":"mountains"}, - "NumberOfPaidItems":{"distribution":"normal"} - } - } - - temp_spec, temp_df = temp_exhibit( - filename="prescribing.csv", - test_spec_dict=test_dict - ) - - #save ID to tidy up temp columns created as part of testing - self._temp_tables.append(temp_spec["metadata"]["id"]) - - #sort column names to make sure they are the same - temp_df.sort_index(axis=1, inplace=True) - expected_df.sort_index(axis=1, inplace=True) - - assert_frame_equal( - left=expected_df, - right=temp_df, - check_exact=False, - check_dtype=False, - check_categorical=False - ) - - def test_reference_inpatient_anon_data(self): - ''' - What this reference test is covering: - - duplicates are removed - - manually change labels in Sex column (Female to A, Male to B) - - manually added derived column (avlos) - - removed linked columns from spec - - removed Scotland from HBs and deleted loc columns - - changed the totals for stays (100 000) and los (200 000) - - changed basic constraint to los >= stays - - DB is not used at all so no need for ID - - Note that when basic constraints are added, generated totals can - be different from those set in the spec as target sum is enforced - BEFORE basic constraints are adjusted. 
- ''' - - args = { - "command" : "fromspec", - "source" : Path(package_dir("sample", "_spec", "inpatients_demo.yml")), - "skip_columns" : [], - "verbose" : True, - } - - xA = tm.Exhibit(**args) - xA.read_spec() - if xA.validate_spec(): - xA.execute_spec() - - table_id = xA.spec_dict["metadata"]["id"] - - #save ID to tidy up temp columns created as part of testing - self._temp_tables.append(table_id) - - #sort column names to make sure they are the same - inpatients_anon.sort_index(axis=1, inplace=True) - xA.anon_df.sort_index(axis=1, inplace=True) - - # there is a quirk of how int is cast on Windows and Unix: int32 vs int64 - # see SO answer: - # Why do Pandas integer `dtypes` not behave the same on Unix and Windows? - assert_frame_equal( - left=inpatients_anon, - right=xA.anon_df, - check_exact=False, - check_dtype=False, - check_categorical=False - ) - - def test_reference_inpatient_il10_random_data(self): - ''' - What this reference test is covering: - - number of unique values exceeds inline limit in all linked columns - - anonymisation method is "random" - - non-linked categorical column (Sex) has missing data - - linked columns share missing categorical data - - Because by default the spec includes the basic constraints of los >= avlos, - if avlos is null (0.065 probability in source data) then los will also be null - in ~130 records. - ''' - - source_data_path = Path(package_dir("sample", "_data", "inpatients.csv")) - - # dayfirst=True would trigger warnings when encountering dates in Y-m-d format - test_dataframe = pd.read_csv( - source_data_path, - parse_dates=["quarter_date"], - ) - - # Modify test_dataframe to suit test conditions - # Gives us 500/10225 ~ 5% chance of missing data - rng = np.random.default_rng(seed=0) - rand_idx = rng.choice( - range(test_dataframe.shape[0]), - size=500, - replace=False) - - linked_cols = ["hb_code", "hb_name", "loc_code", "loc_name"] - test_dataframe.loc[rand_idx, linked_cols] = (np.NaN, np.NaN, np.NaN, np.NaN) - - # Gives us ~10% chance of missing data - rand_idx2 = rng.choice( - range(test_dataframe.shape[0]), - size=1000, - replace=False) - - na_cols = ["sex"] - test_dataframe.loc[rand_idx2, na_cols] = np.NaN - - # modify CLI namespace - fromdata_namespace = { - "source" : test_dataframe, - "inline_limit": 10, - } - - # modify spec - test_spec_dict = { - "metadata": {"number_of_rows": 2000, "random_seed": 2}, - "columns" : {"sex": {"cross_join_all_unique_values" : True}} - } - - temp_spec, temp_df = temp_exhibit( - fromdata_namespace=fromdata_namespace, - test_spec_dict=test_spec_dict - ) - - inpatients_anon_il10 = pd.read_csv( - package_dir( - "core", "tests", "_reference_data", - "inpatients_anon_rnd_il10.csv"), - parse_dates=["quarter_date"] - ) - - #save ID to tidy up temp columns created as part of testing - table_id = temp_spec["metadata"]["id"] - self._temp_tables.append(table_id) - - assert_frame_equal( - left=inpatients_anon_il10, - right=temp_df, - check_exact=False, - check_dtype=False, - check_categorical=False - ) - - def test_reference_inpatient_il50_random_data(self): - ''' - What this reference test is covering: - - number of unique values is within inline limit in all columns - - anonymisation method is "random" - - linked columns share missing categorical data - - manually change date frequency from QS to M - ''' - - rng = np.random.default_rng(seed=0) - - source_data_path = Path(package_dir("sample", "_data", "inpatients.csv")) - - test_dataframe = pd.read_csv( - source_data_path, - parse_dates=["quarter_date"], - ) - - # 
Modify test_dataframe to suit test conditions - rand_idx = rng.choice( - range(test_dataframe.shape[0]), - size=500, - replace=False) - - linked_cols = ["hb_code", "hb_name", "loc_code", "loc_name"] - test_dataframe.loc[rand_idx, linked_cols] = (np.NaN, np.NaN, np.NaN, np.NaN) - - # modify CLI namespace - fromdata_namespace = { - "source" : test_dataframe, - "inline_limit": 50, - } - - # modify spec - test_spec_dict = { - "metadata": {"number_of_rows": 2000}, - "columns" : {"quarter_date": - {"from" : "2018-01-01", "frequency": "M"} - } - } - - temp_spec, temp_df = temp_exhibit( - fromdata_namespace=fromdata_namespace, - test_spec_dict=test_spec_dict - ) - - inpatients_anon_il50 = pd.read_csv( - package_dir( - "core", "tests", "_reference_data", - "inpatients_anon_rnd_il50.csv"), - parse_dates=["quarter_date"] - ) - - #save ID to tidy up temp columns created as part of testing - table_id = temp_spec["metadata"]["id"] - self._temp_tables.append(table_id) - - assert_frame_equal( - left=inpatients_anon_il50, - right=temp_df, - check_exact=False, - check_dtype=False, - check_categorical=False - ) - - def test_reference_inpatient_il10_mountains_data(self): - ''' - What this reference test is covering: - - number of unique values exceeds inline limit in all columns - - anonymisation method is hierarchical "mountains" - - anon columns are specified using dot notation - - sex is a "complete" categorical column, but there will be gaps - where missind data is generated in other columns - categorical - values are generated first, and then "blanked" based on miss_pct - - only the most granular linked column has missing values - - avlos is not derived and is calculated "blindly" - ''' - - source_data_path = Path(package_dir("sample", "_data", "inpatients.csv")) - - test_dataframe = pd.read_csv( - source_data_path, - parse_dates=["quarter_date"], - ) - - # Modify test_dataframe to suit test conditions - rng = np.random.default_rng(seed=0) - rand_idx = rng.choice( - range(test_dataframe.shape[0]), - size=500, - replace=False) - - linked_cols = ["loc_code", "loc_name"] - test_dataframe.loc[rand_idx, linked_cols] = (np.NaN, np.NaN) - - # modify CLI namespace - fromdata_namespace = { - "source" : test_dataframe, - "inline_limit": 10, - } - - # Modify test_dataframe to suit test conditions - test_spec_dict = { - "metadata": - {"number_of_rows": 2000}, - "columns": { - "sex" : - {"cross_join_all_unique_values": True} - , - "hb_code": - {"anonymising_set":"mountains.range"} - , - "hb_name": - {"anonymising_set":"mountains.range"} - , - "loc_code": - {"anonymising_set":"mountains.peak"} - , - "loc_name": - {"anonymising_set":"mountains.peak"} - }, - "constraints": { - "basic_constraints" : {} - } - } - - temp_spec, temp_df = temp_exhibit( - fromdata_namespace=fromdata_namespace, - test_spec_dict=test_spec_dict - ) - - inpatients_anon_mnt_il10 = pd.read_csv( - package_dir( - "core", "tests", "_reference_data", - "inpatients_anon_mnt_il10.csv"), - parse_dates=["quarter_date"] - ) - - #save ID to tidy up temp columns created as part of testing - table_id = temp_spec["metadata"]["id"] - self._temp_tables.append(table_id) - - assert_frame_equal( - left=inpatients_anon_mnt_il10, - right=temp_df, - check_exact=False, - check_dtype=False, - check_categorical=False - ) - - def test_reference_inpatient_il50_mountains_data(self): - ''' - What this reference test is covering: - - number of unique values is within inline limit in all columns - - anonymisation method is hierarchical "mountains" - - linked columns share 
missing categorical data - ''' - - source_data_path = Path(package_dir("sample", "_data", "inpatients.csv")) - - test_dataframe = pd.read_csv( - source_data_path, - parse_dates=["quarter_date"], - ) - - # Modify test_dataframe to suit test conditions - rng = np.random.default_rng(seed=0) - rand_idx = rng.choice( - range(test_dataframe.shape[0]), - size=500, - replace=False) - - linked_cols = ["hb_code", "hb_name", "loc_code", "loc_name"] - test_dataframe.loc[rand_idx, linked_cols] = (np.NaN, np.NaN, np.NaN, np.NaN) - - # modify CLI namespace - fromdata_namespace = { - "source" : test_dataframe, - "inline_limit": 50, - } - - # modify spec - test_spec_dict = { - "metadata": - {"number_of_rows": 2000}, - "columns": { - "hb_code": - {"anonymising_set":"mountains"} - , - "hb_name": - {"anonymising_set":"mountains"} - , - "loc_code": - {"anonymising_set":"mountains"} - , - "loc_name": - {"anonymising_set":"mountains"} - }, - } - - temp_spec, temp_df = temp_exhibit( - fromdata_namespace=fromdata_namespace, - test_spec_dict=test_spec_dict - ) - - inpatients_anon_mnt_il50 = pd.read_csv( - package_dir( - "core", "tests", "_reference_data", - "inpatients_anon_mnt_il50.csv"), - parse_dates=["quarter_date"] - ) - - #save ID to tidy up temp columns created as part of testing - table_id = temp_spec["metadata"]["id"] - self._temp_tables.append(table_id) - - assert_frame_equal( - left=inpatients_anon_mnt_il50, - right=temp_df, - check_exact=False, - check_dtype=False, - check_categorical=False - ) - - def test_reference_inpatient_modified_linked_columns_scenario_2(self): - ''' - What this reference test is covering: - - scenario 2 - - custom value in one of the linked columns - - number of linked columns in spec is less than in original SQL - ''' - - source_data_path = Path(package_dir("sample", "_data", "inpatients.csv")) - - test_dataframe = pd.read_csv( - source_data_path, - parse_dates=["quarter_date"], - ) - - # modify CLI namespace - fromdata_namespace = { - "source" : test_dataframe, - } - - # modify spec - test_spec_dict = { - "metadata": {"number_of_rows": 2000, "random_seed": 0}, - "columns": { - "hb_name" : { - "uniques" : 2, - "original_values" : pd.DataFrame(data={ - "hb_name": ["PHS A&A", "NHS Borders", MISSING_DATA_STR], - "paired_hb_code": ["S08000015", "S08000016", MISSING_DATA_STR], - "probability_vector" : [0.5, 0.5, 0], - "avlos": [0.5, 0.5, 0], - "los": [0.5, 0.5, 0], - "stays": [0.5, 0.5, 0]}) - } - } - } - - temp_spec, temp_df = temp_exhibit( - fromdata_namespace=fromdata_namespace, - test_spec_dict=test_spec_dict, - ) - - #save ID to tidy up temp columns created as part of testing - table_id = temp_spec["metadata"]["id"] - self._temp_tables.append(table_id) - - self.assertCountEqual( - temp_df["hb_name"].unique(), - ["PHS A&A", "NHS Borders"]) - - def test_reference_inpatient_modified_linked_columns_scenario_3(self): - ''' - What this reference test is covering: - - scenario 3 - - custom value in one of the linked columns - - number of linked columns in spec is less than in original SQL - ''' - - source_data_path = Path(package_dir("sample", "_data", "inpatients.csv")) - - test_dataframe = pd.read_csv( - source_data_path, - parse_dates=["quarter_date"], - ) - - # modify CLI namespace - fromdata_namespace = { - "source" : test_dataframe, - "inline_limit": 50 - } - - # modify spec - test_spec_dict = { - "metadata": {"number_of_rows": 2000, "random_seed": 0}, - "columns": { - "loc_name" : { - "uniques" : 5, - "original_values" : pd.DataFrame(data={ - "loc_name": list("ABCDE") + 
[MISSING_DATA_STR], - "paired_loc_code": list("ABCDE") + [MISSING_DATA_STR], - "probability_vector" : [0.2] * 5 + [0], - "avlos": [0.2] * 5 + [0], - "los": [0.2] * 5 + [0], - "stays": [0.2] * 5 + [0]}) - } - } - } - - temp_spec, temp_df = temp_exhibit( - fromdata_namespace=fromdata_namespace, - test_spec_dict=test_spec_dict, - ) - - #save ID to tidy up temp columns created as part of testing - table_id = temp_spec["metadata"]["id"] - self._temp_tables.append(table_id) - - self.assertCountEqual(temp_df["loc_name"].unique(), list("ABCDE")) - - def test_reference_uuid_data(self): - ''' - What this reference test is covering: - - uuid column type - - generate_as_sequence, make_same and sorting custom actions - - no db - ''' - - args = { - "command" : "fromspec", - "source" : Path(package_dir("sample", "_spec", "uuid_demo.yml")), - "skip_columns" : [], - "verbose" : True, - } - - xA = tm.Exhibit(**args) - xA.read_spec() - if xA.validate_spec(): - xA.execute_spec() - - #sort column names to make sure they are the same - uuid_anon.sort_index(axis=1, inplace=True) - xA.anon_df.sort_index(axis=1, inplace=True) - - # there is a quirk of how int is cast on Windows and Unix: int32 vs int64 - # see SO answer: - # Why do Pandas integer `dtypes` not behave the same on Unix and Windows? - assert_frame_equal( - left=uuid_anon, - right=xA.anon_df, - check_exact=False, - check_dtype=False, - check_categorical=False - ) - -if __name__ == "__main__" and __package__ is None: - #overwrite __package__ builtin as per PEP 366 - __package__ = "exhibit" - unittest.main(warnings="ignore") +''' +Reference tests for the Exhibit package +''' + +# Standard library imports +import unittest +from pathlib import Path +import tempfile +from os.path import join +from collections import namedtuple + +# External imports +from pandas.testing import assert_frame_equal +import pandas as pd +import numpy as np + +# Exhibit imports +from exhibit.core.utils import package_dir +from exhibit.db import db_util +from exhibit.core.constants import MISSING_DATA_STR +from exhibit.sample.sample import inpatients_anon, uuid_anon + +# Module under test +from exhibit.core import exhibit as tm + +def replace_nested_dict_values(d1, d2): + ''' + Recursive replacement of dictionary values in matching keys + or adding new ones. 
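
A quick usage sketch (made-up spec fragment); the first dictionary is updated in
place, with nested keys merged rather than overwritten wholesale:

    base = {"metadata": {"number_of_rows": 100}, "columns": {"sex": {"uniques": 2}}}
    override = {"metadata": {"number_of_rows": 2000}}

    replace_nested_dict_values(base, override)
    # base["metadata"]["number_of_rows"] is now 2000; "columns" is left untouched
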
+ ''' + + for key2 in d2: + if key2 in d1: + if isinstance(d1[key2], dict): + replace_nested_dict_values(d1[key2], d2[key2]) + else: + d1[key2] = d2[key2] + else: + d1[key2] = d2[key2] + +def temp_exhibit( + filename="inpatients.csv", + fromdata_namespace=None, + fromspec_namespace=None, + test_spec_dict=None, + return_spec=True, + return_df=True, + ): + ''' + A helper method to generate and read custom specifications + + Parameters + ---------- + filename : str or pd.DataFrame + the .csv to use as the base for spec / df generation + fromdata_namespace : dict + dictionary with testing values for creating a spec + fromspec_namespace : dict + dictionary with testing values for running generation command + test_spec_dict : dict + dictionary with testing values for user spec: used to update the spec + generated from filename csv or DataFrame + return_spec : boolean + sometimes you only want to generate the csv and don't need the spec, + like in performance benchmark testing + return_df : boolean + sometimes you only want to generate a spec; if return_df is False + then the second element in the return tuple is None + + Returns + ------- + A named tuples with spec dict and the generated dataframe + ''' + + returnTuple = namedtuple("TestRun", ["temp_spec", "temp_df"]) + temp_spec = None + temp_df = None + + if isinstance(filename, dict) or \ + (isinstance(filename, str) and filename[-3:] == "yml"): + source = "yml" + else: + source = "csv" + + # function has five paths: + # 1) given .csv filename (or DataFrame) produce just a spec + # 2) given .csv filename (or DataFrame) produce a spec and a demo .csv + # 3) given a .yml filename (or dict) produce a demo .csv + # 4) given a .yml filename (or dict) produce a spec + # 5) given a .yml filename (or dict) produce a spec and a demo .csv + + # it's important to only generate appropriate parts because they are measured + # separetely in performance benchmarking tests. 
+ + # if source is data, we always produce a spec (can't have demo data without it) + if source == "csv": + + with tempfile.TemporaryDirectory() as td: + + temp_spec_name = "_.yml" + f_name = join(td, temp_spec_name) + + # for internal use when testing with a custom dataframe, not a static file + if isinstance(filename, pd.DataFrame): + default_data_path = filename + else: + default_data_path = Path(package_dir("sample", "_data", filename)) + + fromdata_defaults = { + "command" : "fromdata", + "source" : default_data_path, + "inline_limit" : 30, + "verbose" : True, + "output" : f_name, + "skip_columns" : [], + "equal_weights" : False, + "linked_columns" : None + } + + #Update namespaces + if fromdata_namespace: + fromdata_defaults.update(fromdata_namespace) + + xA = tm.Exhibit(**fromdata_defaults) + xA.read_data() + xA.generate_spec() + + if return_df: + + xA.write_spec() + fromspec_defaults = { + "command" : "fromspec", + "source" : Path(f_name), + "verbose" : True, + } + + if fromspec_namespace: + fromspec_defaults.update(fromspec_namespace) + + xA = tm.Exhibit(**fromspec_defaults) + xA.read_spec() + + if test_spec_dict: + replace_nested_dict_values(xA.spec_dict, test_spec_dict) + + if xA.validate_spec(): + xA.execute_spec() + + temp_df = xA.anon_df + + if return_spec: + temp_spec=xA.spec_dict + + if source == "yml": + + # for internal use when testing with a custom spec_dict, not a static file + if isinstance(filename, dict): + default_spec_path = filename + else: + default_spec_path = Path(package_dir("sample", "_spec", filename)) + + fromspec_defaults = { + "command" : "fromspec", + "source" : default_spec_path, + "verbose" : True, + } + + if fromspec_namespace: + fromspec_defaults.update(fromspec_namespace) + + xA = tm.Exhibit(**fromspec_defaults) + xA.read_spec() + + if test_spec_dict: + replace_nested_dict_values(xA.spec_dict, test_spec_dict) + + if return_spec: + temp_spec = xA.spec_dict + + if return_df: + + if xA.validate_spec(): + xA.execute_spec() + + temp_df = xA.anon_df + + return returnTuple(temp_spec, temp_df) + +class referenceTests(unittest.TestCase): + ''' + Main test suite; command line arguments are mocked + via patch context manager; internal intermediate functions + are mocked inside each test. 
+ ''' + + @classmethod + def setUpClass(cls): + ''' + Create a list of tables to drop after reference tests finish + ''' + + cls._temp_tables = [] + + @classmethod + def tearDownClass(cls): + ''' + Clean up local exhibit.db from temp tables + ''' + + db_util.drop_tables(cls._temp_tables) + + def test_reference_prescribing_non_linked_anon_data(self): + ''' + What this reference test is covering: + - paired 1:1 anonymisation set (birds) + - designating paired columns as complete columns + - unlinking of columns + ''' + + expected_df = pd.read_csv( + package_dir( + "core", "tests", "_reference_data", + "prescribing_anon_non_linked.csv"), + parse_dates=["PaidDateMonth"] + ) + + test_dict = { + "metadata":{"number_of_rows":1500}, + "columns":{ + "HB2014":{ + "cross_join_all_unique_values": True + }, + "HB2014Name":{ + "cross_join_all_unique_values": True + }, + "BNFItemCode":{"anonymising_set":"birds"}, + "BNFItemDescription":{"anonymising_set":"birds"}, + "GPPracticeName":{"anonymising_set":"random"} + }, + "linked_columns":[] + } + + temp_spec, temp_df = temp_exhibit( + filename="prescribing.csv", + test_spec_dict=test_dict + ) + + #save ID to tidy up temp columns created as part of testing + self._temp_tables.append(temp_spec["metadata"]["id"]) + + #sort column names to make sure they are the same + temp_df.sort_index(axis=1, inplace=True) + expected_df.sort_index(axis=1, inplace=True) + + assert_frame_equal( + left=expected_df, + right=temp_df, + check_exact=False, + check_dtype=False, + check_categorical=False + ) + + def test_reference_prescribing_linked_mnt_anon_data(self): + ''' + What this reference test is covering: + - one of the linked columns is in the spec, another is in DB + - anonymisation is done using "mountains" set + - NumberOfPaidItems is generated from a shifted normal distribution + + Note that prescribing dataset has duplicate categorical rows + ''' + + expected_df = pd.read_csv( + package_dir( + "core", "tests", "_reference_data", + "prescribing_anon_mnt_linked.csv"), + parse_dates=["PaidDateMonth"] + ) + + test_dict = { + "columns":{ + "HB2014":{"anonymising_set":"mountains"}, + "HB2014Name":{"anonymising_set":"mountains"}, + "GPPracticeName":{"anonymising_set":"mountains"}, + "NumberOfPaidItems":{"distribution":"normal"} + } + } + + temp_spec, temp_df = temp_exhibit( + filename="prescribing.csv", + test_spec_dict=test_dict + ) + + #save ID to tidy up temp columns created as part of testing + self._temp_tables.append(temp_spec["metadata"]["id"]) + + #sort column names to make sure they are the same + temp_df.sort_index(axis=1, inplace=True) + expected_df.sort_index(axis=1, inplace=True) + + assert_frame_equal( + left=expected_df, + right=temp_df, + check_exact=False, + check_dtype=False, + check_categorical=False + ) + + def test_reference_inpatient_anon_data(self): + ''' + What this reference test is covering: + - duplicates are removed + - manually change labels in Sex column (Female to A, Male to B) + - manually added derived column (avlos) + - removed linked columns from spec + - removed Scotland from HBs and deleted loc columns + - changed the totals for stays (100 000) and los (200 000) + - changed basic constraint to los >= stays + - DB is not used at all so no need for ID + + Note that when basic constraints are added, generated totals can + be different from those set in the spec as target sum is enforced + BEFORE basic constraints are adjusted. 
+ ''' + + args = { + "command" : "fromspec", + "source" : Path(package_dir("sample", "_spec", "inpatients_demo.yml")), + "skip_columns" : [], + "verbose" : True, + } + + xA = tm.Exhibit(**args) + xA.read_spec() + if xA.validate_spec(): + xA.execute_spec() + + table_id = xA.spec_dict["metadata"]["id"] + + #save ID to tidy up temp columns created as part of testing + self._temp_tables.append(table_id) + + #sort column names to make sure they are the same + inpatients_anon.sort_index(axis=1, inplace=True) + xA.anon_df.sort_index(axis=1, inplace=True) + + # there is a quirk of how int is cast on Windows and Unix: int32 vs int64 + # see SO answer: + # Why do Pandas integer `dtypes` not behave the same on Unix and Windows? + assert_frame_equal( + left=inpatients_anon, + right=xA.anon_df, + check_exact=False, + check_dtype=False, + check_categorical=False + ) + + def test_reference_inpatient_il10_random_data(self): + ''' + What this reference test is covering: + - number of unique values exceeds inline limit in all linked columns + - anonymisation method is "random" + - non-linked categorical column (Sex) has missing data + - linked columns share missing categorical data + + Because by default the spec includes the basic constraints of los >= avlos, + if avlos is null (0.065 probability in source data) then los will also be null + in ~130 records. + ''' + + source_data_path = Path(package_dir("sample", "_data", "inpatients.csv")) + + # dayfirst=True would trigger warnings when encountering dates in Y-m-d format + test_dataframe = pd.read_csv( + source_data_path, + parse_dates=["quarter_date"], + ) + + # Modify test_dataframe to suit test conditions + # Gives us 500/10225 ~ 5% chance of missing data + rng = np.random.default_rng(seed=0) + rand_idx = rng.choice( + range(test_dataframe.shape[0]), + size=500, + replace=False) + + linked_cols = ["hb_code", "hb_name", "loc_code", "loc_name"] + test_dataframe.loc[rand_idx, linked_cols] = (np.NaN, np.NaN, np.NaN, np.NaN) + + # Gives us ~10% chance of missing data + rand_idx2 = rng.choice( + range(test_dataframe.shape[0]), + size=1000, + replace=False) + + na_cols = ["sex"] + test_dataframe.loc[rand_idx2, na_cols] = np.NaN + + # modify CLI namespace + fromdata_namespace = { + "source" : test_dataframe, + "inline_limit": 10, + } + + # modify spec + test_spec_dict = { + "metadata": {"number_of_rows": 2000, "random_seed": 2}, + "columns" : {"sex": {"cross_join_all_unique_values" : True}} + } + + temp_spec, temp_df = temp_exhibit( + fromdata_namespace=fromdata_namespace, + test_spec_dict=test_spec_dict + ) + + inpatients_anon_il10 = pd.read_csv( + package_dir( + "core", "tests", "_reference_data", + "inpatients_anon_rnd_il10.csv"), + parse_dates=["quarter_date"] + ) + + #save ID to tidy up temp columns created as part of testing + table_id = temp_spec["metadata"]["id"] + self._temp_tables.append(table_id) + + assert_frame_equal( + left=inpatients_anon_il10, + right=temp_df, + check_exact=False, + check_dtype=False, + check_categorical=False + ) + + def test_reference_inpatient_il50_random_data(self): + ''' + What this reference test is covering: + - number of unique values is within inline limit in all columns + - anonymisation method is "random" + - linked columns share missing categorical data + - manually change date frequency from QS to M + ''' + + rng = np.random.default_rng(seed=0) + + source_data_path = Path(package_dir("sample", "_data", "inpatients.csv")) + + test_dataframe = pd.read_csv( + source_data_path, + parse_dates=["quarter_date"], + ) + + # 
Modify test_dataframe to suit test conditions + rand_idx = rng.choice( + range(test_dataframe.shape[0]), + size=500, + replace=False) + + linked_cols = ["hb_code", "hb_name", "loc_code", "loc_name"] + test_dataframe.loc[rand_idx, linked_cols] = (np.NaN, np.NaN, np.NaN, np.NaN) + + # modify CLI namespace + fromdata_namespace = { + "source" : test_dataframe, + "inline_limit": 50, + } + + # modify spec + test_spec_dict = { + "metadata": {"number_of_rows": 2000}, + "columns" : {"quarter_date": + {"from" : "2018-01-01", "frequency": "ME"} + } + } + + temp_spec, temp_df = temp_exhibit( + fromdata_namespace=fromdata_namespace, + test_spec_dict=test_spec_dict + ) + + inpatients_anon_il50 = pd.read_csv( + package_dir( + "core", "tests", "_reference_data", + "inpatients_anon_rnd_il50.csv"), + parse_dates=["quarter_date"] + ) + + #save ID to tidy up temp columns created as part of testing + table_id = temp_spec["metadata"]["id"] + self._temp_tables.append(table_id) + + assert_frame_equal( + left=inpatients_anon_il50, + right=temp_df, + check_exact=False, + check_dtype=False, + check_categorical=False + ) + + def test_reference_inpatient_il10_mountains_data(self): + ''' + What this reference test is covering: + - number of unique values exceeds inline limit in all columns + - anonymisation method is hierarchical "mountains" + - anon columns are specified using dot notation + - sex is a "complete" categorical column, but there will be gaps + where missind data is generated in other columns - categorical + values are generated first, and then "blanked" based on miss_pct + - only the most granular linked column has missing values + - avlos is not derived and is calculated "blindly" + ''' + + source_data_path = Path(package_dir("sample", "_data", "inpatients.csv")) + + test_dataframe = pd.read_csv( + source_data_path, + parse_dates=["quarter_date"], + ) + + # Modify test_dataframe to suit test conditions + rng = np.random.default_rng(seed=0) + rand_idx = rng.choice( + range(test_dataframe.shape[0]), + size=500, + replace=False) + + linked_cols = ["loc_code", "loc_name"] + test_dataframe.loc[rand_idx, linked_cols] = (np.NaN, np.NaN) + + # modify CLI namespace + fromdata_namespace = { + "source" : test_dataframe, + "inline_limit": 10, + } + + # Modify test_dataframe to suit test conditions + test_spec_dict = { + "metadata": + {"number_of_rows": 2000}, + "columns": { + "sex" : + {"cross_join_all_unique_values": True} + , + "hb_code": + {"anonymising_set":"mountains.range"} + , + "hb_name": + {"anonymising_set":"mountains.range"} + , + "loc_code": + {"anonymising_set":"mountains.peak"} + , + "loc_name": + {"anonymising_set":"mountains.peak"} + }, + "constraints": { + "basic_constraints" : {} + } + } + + temp_spec, temp_df = temp_exhibit( + fromdata_namespace=fromdata_namespace, + test_spec_dict=test_spec_dict + ) + + inpatients_anon_mnt_il10 = pd.read_csv( + package_dir( + "core", "tests", "_reference_data", + "inpatients_anon_mnt_il10.csv"), + parse_dates=["quarter_date"] + ) + + #save ID to tidy up temp columns created as part of testing + table_id = temp_spec["metadata"]["id"] + self._temp_tables.append(table_id) + + assert_frame_equal( + left=inpatients_anon_mnt_il10, + right=temp_df, + check_exact=False, + check_dtype=False, + check_categorical=False + ) + + def test_reference_inpatient_il50_mountains_data(self): + ''' + What this reference test is covering: + - number of unique values is within inline limit in all columns + - anonymisation method is hierarchical "mountains" + - linked columns share 
missing categorical data + ''' + + source_data_path = Path(package_dir("sample", "_data", "inpatients.csv")) + + test_dataframe = pd.read_csv( + source_data_path, + parse_dates=["quarter_date"], + ) + + # Modify test_dataframe to suit test conditions + rng = np.random.default_rng(seed=0) + rand_idx = rng.choice( + range(test_dataframe.shape[0]), + size=500, + replace=False) + + linked_cols = ["hb_code", "hb_name", "loc_code", "loc_name"] + test_dataframe.loc[rand_idx, linked_cols] = (np.NaN, np.NaN, np.NaN, np.NaN) + + # modify CLI namespace + fromdata_namespace = { + "source" : test_dataframe, + "inline_limit": 50, + } + + # modify spec + test_spec_dict = { + "metadata": + {"number_of_rows": 2000}, + "columns": { + "hb_code": + {"anonymising_set":"mountains"} + , + "hb_name": + {"anonymising_set":"mountains"} + , + "loc_code": + {"anonymising_set":"mountains"} + , + "loc_name": + {"anonymising_set":"mountains"} + }, + } + + temp_spec, temp_df = temp_exhibit( + fromdata_namespace=fromdata_namespace, + test_spec_dict=test_spec_dict + ) + + inpatients_anon_mnt_il50 = pd.read_csv( + package_dir( + "core", "tests", "_reference_data", + "inpatients_anon_mnt_il50.csv"), + parse_dates=["quarter_date"] + ) + + #save ID to tidy up temp columns created as part of testing + table_id = temp_spec["metadata"]["id"] + self._temp_tables.append(table_id) + + assert_frame_equal( + left=inpatients_anon_mnt_il50, + right=temp_df, + check_exact=False, + check_dtype=False, + check_categorical=False + ) + + def test_reference_inpatient_modified_linked_columns_scenario_2(self): + ''' + What this reference test is covering: + - scenario 2 + - custom value in one of the linked columns + - number of linked columns in spec is less than in original SQL + ''' + + source_data_path = Path(package_dir("sample", "_data", "inpatients.csv")) + + test_dataframe = pd.read_csv( + source_data_path, + parse_dates=["quarter_date"], + ) + + # modify CLI namespace + fromdata_namespace = { + "source" : test_dataframe, + } + + # modify spec + test_spec_dict = { + "metadata": {"number_of_rows": 2000, "random_seed": 0}, + "columns": { + "hb_name" : { + "uniques" : 2, + "original_values" : pd.DataFrame(data={ + "hb_name": ["PHS A&A", "NHS Borders", MISSING_DATA_STR], + "paired_hb_code": ["S08000015", "S08000016", MISSING_DATA_STR], + "probability_vector" : [0.5, 0.5, 0], + "avlos": [0.5, 0.5, 0], + "los": [0.5, 0.5, 0], + "stays": [0.5, 0.5, 0]}) + } + } + } + + temp_spec, temp_df = temp_exhibit( + fromdata_namespace=fromdata_namespace, + test_spec_dict=test_spec_dict, + ) + + #save ID to tidy up temp columns created as part of testing + table_id = temp_spec["metadata"]["id"] + self._temp_tables.append(table_id) + + self.assertCountEqual( + temp_df["hb_name"].unique(), + ["PHS A&A", "NHS Borders"]) + + def test_reference_inpatient_modified_linked_columns_scenario_3(self): + ''' + What this reference test is covering: + - scenario 3 + - custom value in one of the linked columns + - number of linked columns in spec is less than in original SQL + ''' + + source_data_path = Path(package_dir("sample", "_data", "inpatients.csv")) + + test_dataframe = pd.read_csv( + source_data_path, + parse_dates=["quarter_date"], + ) + + # modify CLI namespace + fromdata_namespace = { + "source" : test_dataframe, + "inline_limit": 50 + } + + # modify spec + test_spec_dict = { + "metadata": {"number_of_rows": 2000, "random_seed": 0}, + "columns": { + "loc_name" : { + "uniques" : 5, + "original_values" : pd.DataFrame(data={ + "loc_name": list("ABCDE") + 
[MISSING_DATA_STR], + "paired_loc_code": list("ABCDE") + [MISSING_DATA_STR], + "probability_vector" : [0.2] * 5 + [0], + "avlos": [0.2] * 5 + [0], + "los": [0.2] * 5 + [0], + "stays": [0.2] * 5 + [0]}) + } + } + } + + temp_spec, temp_df = temp_exhibit( + fromdata_namespace=fromdata_namespace, + test_spec_dict=test_spec_dict, + ) + + #save ID to tidy up temp columns created as part of testing + table_id = temp_spec["metadata"]["id"] + self._temp_tables.append(table_id) + + self.assertCountEqual(temp_df["loc_name"].unique(), list("ABCDE")) + + def test_reference_uuid_data(self): + ''' + What this reference test is covering: + - uuid column type + - generate_as_sequence, make_same and sorting custom actions + - no db + ''' + + args = { + "command" : "fromspec", + "source" : Path(package_dir("sample", "_spec", "uuid_demo.yml")), + "skip_columns" : [], + "verbose" : True, + } + + xA = tm.Exhibit(**args) + xA.read_spec() + if xA.validate_spec(): + xA.execute_spec() + + #sort column names to make sure they are the same + uuid_anon.sort_index(axis=1, inplace=True) + xA.anon_df.sort_index(axis=1, inplace=True) + + # there is a quirk of how int is cast on Windows and Unix: int32 vs int64 + # see SO answer: + # Why do Pandas integer `dtypes` not behave the same on Unix and Windows? + assert_frame_equal( + left=uuid_anon, + right=xA.anon_df, + check_exact=False, + check_dtype=False, + check_categorical=False + ) + +if __name__ == "__main__" and __package__ is None: + #overwrite __package__ builtin as per PEP 366 + __package__ = "exhibit" + unittest.main(warnings="ignore") diff --git a/exhibit/core/tests/test_spec.py b/exhibit/core/tests/test_spec.py index f5230d7..1a3a72a 100644 --- a/exhibit/core/tests/test_spec.py +++ b/exhibit/core/tests/test_spec.py @@ -69,7 +69,7 @@ def test_column_order_in_spec_is_correctly_based_on_types(self): "ints" : range(5), "floats": np.linspace(0, 1, num=5), "bools" : [True, True, True, True, False], - "dates" : pd.date_range(start="1/1/2018", periods=5, freq="M"), + "dates" : pd.date_range(start="1/1/2018", periods=5, freq="ME"), "cats" : list("ABCDE") }) diff --git a/exhibit/core/tests/test_utils.py b/exhibit/core/tests/test_utils.py index 4e7f1b3..52585a0 100644 --- a/exhibit/core/tests/test_utils.py +++ b/exhibit/core/tests/test_utils.py @@ -87,13 +87,13 @@ def test_date_frequency_guesser(self): returns correct values. 
''' - test_frequencies = ["D", "M", "MS", "Q", "QS", "BA-MAR"] + test_frequencies = ["D", "ME", "MS", "QE", "QS", "BYE-MAR"] test_cases = [pd.Series(pd.date_range(start="2015/01/01", periods=12, freq=f)) for f in test_frequencies] result = [tm.guess_date_frequency(x) for x in test_cases] - expected = ["D", "M", "MS", "Q", "QS", "YS"] + expected = ["D", "ME", "MS", "QE", "QS", "YS"] self.assertEqual(result, expected) @@ -154,7 +154,8 @@ def test_float_or_int(self): ''' test_series_1 = pd.Series([1, 2, 3, 4, 5, 0.0]) - test_series_2 = pd.Series([1, pd.NA, 2, 3]) + # default dtype for the below range is object rather than int64 + test_series_2 = pd.Series([1, pd.NA, 2, 3], dtype="Int64") test_series_3 = pd.Series([0.1, 0.2, 3, 4]) self.assertTrue(tm.float_or_int(test_series_1), "integer") diff --git a/exhibit/core/utils.py b/exhibit/core/utils.py index 95a7dae..5323cdc 100644 --- a/exhibit/core/utils.py +++ b/exhibit/core/utils.py @@ -208,10 +208,10 @@ def guess_date_frequency(timeseries): for period_range, period_alias in aliases.items(): if first_period in period_range: - # decide whether it's period start (QS) or end (Q) + # decide whether it's period start or end (M/Q/YE) if period_alias in ["MS", "QS"]: if not (timeseries.dt.day == 1).all(): - return period_alias[0] + return period_alias[0] + "E" return period_alias return None #pragma: no cover
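
For reference, a minimal standalone sketch of the frequency-alias rename that the
test and guess_date_frequency changes above track (assuming pandas >= 2.2, where
the period-end aliases gained an "E" suffix and the old single-letter forms are
deprecated):

    import pandas as pd

    # month-end and quarter-end ranges under the new aliases; the old "M" / "Q"
    # aliases still work in pandas 2.2 but emit a FutureWarning
    month_end = pd.date_range(start="2018-01-01", periods=5, freq="ME")
    quarter_end = pd.date_range(start="2018-01-01", periods=4, freq="QE")

    # period-start aliases such as "MS" and "QS" are unchanged, which is why the
    # utils change only appends "E" for the period-end case
    month_start = pd.date_range(start="2018-01-01", periods=5, freq="MS")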