From 283ef501632fec1b7f683d60221dc5ff6f04f0f3 Mon Sep 17 00:00:00 2001 From: gherka Date: Mon, 2 Sep 2024 15:35:28 +0100 Subject: [PATCH] Fixed errors and deprecation warnings from Pandas 2.2.2 --- exhibit/core/constraints.py | 18 +- exhibit/core/exhibit.py | 7 +- exhibit/core/generate/categorical.py | 1324 +++++++------- exhibit/core/generate/missing.py | 690 +++---- exhibit/core/generate/tests/test_derived.py | 206 +-- exhibit/core/generate/tests/test_missing.py | 1236 ++++++------- exhibit/core/linkage/hierarchical.py | 4 +- exhibit/core/linkage/matrix.py | 778 ++++---- .../tests/test_linkage_hierarchical.py | 6 +- exhibit/core/tests/test_reference.py | 1592 ++++++++--------- exhibit/core/tests/test_spec.py | 2 +- exhibit/core/tests/test_utils.py | 7 +- exhibit/core/utils.py | 4 +- 13 files changed, 2943 insertions(+), 2931 deletions(-) diff --git a/exhibit/core/constraints.py b/exhibit/core/constraints.py index 42ebe6f..1588cf3 100644 --- a/exhibit/core/constraints.py +++ b/exhibit/core/constraints.py @@ -210,11 +210,20 @@ def process_custom_constraints(self, custom_constraints): _kwargs = kwargs_dict.get(action, {}) _kwargs.update(spec_action_kwargs) - # overwrite the original DF row IDs with the adjusted ones - output_df.loc[cc_filter_idx] = action_func( + # because the result of the action can be a different dtype compared + # to the original (like int to float, particularly involving NULLs) + # we need to capture the resultant dtype first, and then cast the + # original df to match it to avoid Pandas errors. + action_df = action_func( output_df, cc_filter_idx, target_str, cc_partitions, **_kwargs) + + action_dtypes = action_df.dtypes + output_df = output_df.astype(action_dtypes) + + # overwrite the original DF row IDs with the adjusted ones + output_df.loc[cc_filter_idx] = action_df return output_df def adjust_dataframe_to_fit_constraint(self, anon_df, basic_constraint): @@ -1231,12 +1240,13 @@ def shift_distribution( final_result.append(new_series) continue - + + # return the DF, matching the dtypes of the original (relevant for dates) new_df = pd.concat( final_result + [df.loc[filter_idx, [x for x in df.columns if x not in target_cols]]], axis=1 - ).reindex(columns=df.columns) + ).reindex(columns=df.columns).astype(df.dtypes) return new_df diff --git a/exhibit/core/exhibit.py b/exhibit/core/exhibit.py index 72f4012..0b0a397 100644 --- a/exhibit/core/exhibit.py +++ b/exhibit/core/exhibit.py @@ -344,9 +344,10 @@ def execute_spec(self): ) if col in geo_action_targets: - # add placeholders to avoid errors when generating missing data + # add float placeholders to avoid errors when generating missing data geo_cols = [f"{col}_latitude", f"{col}_longitude"] - anon_df[geo_cols] = 0 + # use 0.0 to ensure column dtype is float so that we could null them later + anon_df[geo_cols] = 0.0 continue h3_table_name = self.spec_dict["columns"][col]["h3_table"] @@ -444,7 +445,7 @@ def execute_spec(self): anon_df[derived_col] = generate_derived_column(anon_df, derived_def) break # change the missing data placeholder back to NAs - anon_df.loc[:, cat_cols] = anon_df.loc[:, cat_cols].applymap( + anon_df.loc[:, cat_cols] = anon_df.loc[:, cat_cols].map( lambda x: np.nan if x == MISSING_DATA_STR else x) #8) GENERATE DERIVED COLUMNS IF ANY ARE SPECIFIED diff --git a/exhibit/core/generate/categorical.py b/exhibit/core/generate/categorical.py index 861e673..7be717b 100644 --- a/exhibit/core/generate/categorical.py +++ b/exhibit/core/generate/categorical.py @@ -1,662 +1,662 @@ -''' -Methods to generate 
categorical columns / values -''' - -# Standard library imports -from collections import namedtuple -from itertools import chain -import warnings - -# External library imports -import pandas as pd -import numpy as np -from sql_metadata import Parser -from pandas.api.types import is_numeric_dtype, is_datetime64_dtype - -# Exhibit imports -from ..constants import ORIGINAL_VALUES_REGEX, ORIGINAL_VALUES_PAIRED -from ..utils import get_attr_values, shuffle_data -from ..sql import query_exhibit_database, check_table_exists, execute_sql, create_temp_table -from ..linkage.hierarchical import generate_linked_anon_df -from ..linkage.matrix import generate_user_linked_anon_df -from .regex import generate_regex_column - -# EXPORTABLE METHODS -# ================== -class CategoricalDataGenerator: - ''' - Although this class is pretty bare, it still helps avoid passing - the same variables through functions and also mirrors the setup - for generation of linked data. - - One area that potentially needs looking at is if the user makes - manual changes to column values that were initially put into SQL - (where uniques > inline_limit) - for now, this works only for linked data. - ''' - - def __init__(self, spec_dict, core_rows, anon_df=None): - ''' - This class is covering the entire spec_dict as far as the - generation of non-numerical data is concerned. - ''' - - self.spec_dict = spec_dict - self.rng = spec_dict["_rng"] - self.num_rows = core_rows - self.fixed_anon_sets = ["random", "mountains", "patients", "birds", "dates"] - # we need UUID dataset (if it exists) for possible conditional SQL that - # references already-generated columns in the spec - self.generated_dfs = [] - self.anon_df = anon_df - - (self.all_cols, - self.complete_cols, - self.paired_cols, - self.skipped_cols) = self._get_column_types() - - def generate(self): - ''' - Brings together all the components of non-numerical data generation. 
- - Returns - ------- - A dataframe with all categorical columns - ''' - - #1) GENERATE LINKED DFs FROM EACH LINKED COLUMNS GROUP - for linked_group in (self.spec_dict.get("linked_columns") or []): - - # zero-numbered linked group is reserved for user-defined groupings - if linked_group[0] == 0: - - u_linked_df = generate_user_linked_anon_df( - spec_dict=self.spec_dict, - linked_cols=linked_group[1], - num_rows=self.num_rows - ) - - self.generated_dfs.append(u_linked_df) - - else: - - linked_df = generate_linked_anon_df( - spec_dict=self.spec_dict, - linked_group=linked_group, - num_rows=self.num_rows) - - self.generated_dfs.append(linked_df) - - #2) GENERATE NON-LINKED DFs - for col in [col for col in self.all_cols if col not in self.skipped_cols]: - s = self._generate_anon_series(col) - self.generated_dfs.append(s) - - #3) CONCAT GENERATED DFs AND SERIES - temp_anon_df = pd.concat(self.generated_dfs, axis=1) - - #4) GENERATE SERIES WITH "COMPLETE", CROSS-JOINED COLUMNS - complete_series = [] - - # Complete series can sort the data again - for col in self.complete_cols: - s = self._generate_complete_series(col) - #paired columns return None - if not s is None: - complete_series.append(s) - - #5) OUTER JOIN - temp_anon_df["key"] = 1 - - for s in complete_series: - - temp_anon_df = pd.merge( - temp_anon_df, - pd.DataFrame(s).assign(key=1), - how="outer", - on="key" - ) - - #6) TIDY UP - anon_df = temp_anon_df.drop("key", axis=1) - - return anon_df - - def _generate_timeseries(self, col_name, complete=False): - ''' - Basic generator of randomised / complete timeseries data - - Parameters: - ---------- - col_name : str - time column to generate (type checks are made upstream) - complete : boolean - if timeseries is meant to be "complete", return full series - without picking N=num_rows random values from the pool - - Returns: - -------- - pd.Series - ''' - - # see which date parameters we have access to - start = self.spec_dict["columns"][col_name].get("from", None) - end = self.spec_dict["columns"][col_name].get("to", None) - - # frequency and periods are always required - freq = self.spec_dict["columns"][col_name]["frequency"] - periods = self.spec_dict["columns"][col_name]["uniques"] - - # if we have both start and end, we generate all values in-between and pick the - # dates at random to match the number of periods, without repeats - if start is not None and end is not None: - - all_pos_dates = pd.date_range(start=start, end=end, freq=freq) - # when the number of requested periods is greater than the total possible - # range between from and to, given the frequency, we issue a warning, then - # omit the date_to and generate N=periods unique dates from date_from. - if len(all_pos_dates) < periods: - warnings.warn( - f"The number of unique dates at frequency {freq} between {start} " - f"and {end} is smaller than the number of requested periods" - f"({periods}). 
The date_to parameter will be ignored.", - RuntimeWarning - ) - all_pos_dates = pd.date_range(start=start, periods=periods, freq=freq) - - all_pos_dates = self.rng.choice(all_pos_dates, periods, replace=False) - - else: - # one of the start / end is None - all_pos_dates = pd.date_range( - start=start, end=end, periods=periods, freq=freq) - - if complete: - return pd.Series(all_pos_dates, name=col_name) - - random_dates = self.rng.choice(all_pos_dates, self.num_rows) - - return shuffle_data(pd.Series(random_dates, name=col_name)) - - def _generate_anon_series(self, col_name): - ''' - Generate basic categorical series anonymised according to user input. - - Note that in all cases except external tables, the final series is shuffled - and index reset. Series generated from external tables are an exception because - their values are linked to columns that have already been generated. - - The code can take different paths depending on these things: - - whether a the anonymising method is set to random or a custom set - - whether the number of unique values exceeds the threshold - - whether the column has any paired columns - - The paths differ primarily in terms of where the data sits: as part - of the spec in original_values or in exhibit DB. - - Things are further complicated if users want to use a single column - from an anonymising table, like mountains.peak - - Parameters: - ----------- - col_name : str - column name to process & anonymise - - Returns: - ------- - Pandas Series object or a Dataframe - ''' - - col_attrs = self.spec_dict["columns"][col_name] - col_type = col_attrs["type"] - - # capture categorical-only information, with fallback for date columns - paired_cols = col_attrs.get("paired_columns", None) - orig_vals = col_attrs.get("original_values", None) - target_uniques = col_attrs.get("uniques", None) - - # typically, only categorical columns will have an anonymising set, but time - # columns can use it for SQL to pull conditional values from external table - # ignoring the standard date genderation parameters, like from / to. 
- anon_set = col_attrs.get("anonymising_set", None) - - # Users can pass custom functions to generate categorical / date columns - if callable(anon_set): - return self._generate_using_custom_function(col_name, anon_set) - - # check if the anonymising set is a SQL statement starting with SELECT - # note that for dates, all other parameters, like from / to will be ignored - if anon_set is not None and anon_set.strip().upper()[:6] == "SELECT": - return self._generate_using_external_table(col_name, anon_set) - - # normal date columns generated using from / to / number of uniques - if col_type == "date": - return self._generate_timeseries(col_name, complete=False) - - # generate values based on a regular expression specified in the anonymising_set - if isinstance(orig_vals, str) and orig_vals == ORIGINAL_VALUES_REGEX: - return generate_regex_column( - anon_set, col_name, self.num_rows, target_uniques) - - # values were stored in SQL; randomise based on uniform distribution - if col_attrs["uniques"] > self.spec_dict["metadata"]["inline_limit"]: - return self._generate_from_sql(col_name, col_attrs) - - # we have access to original_values and the paths are dependant on anon_set - # take every row except last which is reserved for Missing data - col_df = col_attrs["original_values"].iloc[:-1, :] - col_prob = np.array(col_df["probability_vector"]).astype(float) - - if col_prob.sum() != 1: - col_prob /= col_prob.sum() - - if anon_set == "random": - - col_values = col_df[col_name].to_list() - - original_series = pd.Series( - data=self.rng.choice(a=col_values, size=self.num_rows, p=col_prob), - name=col_name) - - if paired_cols: - paired_df = ( - col_df[[col_name] + [f"paired_{x}" for x in paired_cols]] - .rename(columns=lambda x: x.replace("paired_", "")) - ) - - return shuffle_data( - pd.merge(original_series, paired_df, how="left", on=col_name)) - - return shuffle_data(original_series) - - # finally, if we have original_values, but anon_set is not random - # we pick the N distinct values from the anonymysing set, replace - # the original values + paired column values in the original_values - # DATAFRAME, making sure the changes happen in-place which means - # that downstream, the weights table will be built based on the - # modified "original_values" dataframe. - - sql_df = self._generate_from_sql(col_name, col_attrs, complete=True) - - # includes Missing data row as opposed to col_df which doesn't - orig_df = col_attrs["original_values"] - - # missing data is the last row - repl = sql_df[col_name].unique() - aliases = dict(zip(orig_df[col_name].values[:-1], repl)) - aliased_df = orig_df.applymap(lambda x: aliases.get(x, x)) - self.spec_dict["columns"][col_name]["original_values"] = aliased_df - - # we ignore Missing data probability when we originally create the variable - idx = self.rng.choice(a=len(sql_df), p=col_prob, size=self.num_rows) - anon_list = [sql_df.iloc[x, :].values for x in idx] - anon_df = pd.DataFrame(columns=sql_df.columns, data=anon_list) - - return shuffle_data(anon_df) - - def _generate_from_sql(self, col_name, col_attrs, complete=False, db_path=None): - ''' - Whatever the anonymising method, if a column has more unique values than - allowed by the inline_limit parameter, it will be put into SQLite3 db. 
- ''' - - anon_set = col_attrs["anonymising_set"] - uniques = col_attrs["uniques"] - paired_cols = col_attrs["paired_columns"] or [] - - #1) QUERY SQL TO GET VALUES USED TO BUILD THE DATAFRAME - if anon_set == "random": - - safe_col_name = col_name.replace(" ", "$") - table_name = f"temp_{self.spec_dict['metadata']['id']}_{safe_col_name}" - sql_df = query_exhibit_database( - table_name, exclude_missing=True, db_path=db_path) - - else: - table_name, *sql_column = anon_set.split(".") - sql_df = query_exhibit_database(table_name, sql_column, uniques) - - # if sql df is an anonymising set with different column names, like mountaints, - # we want to rename them to the actual column names used in the spec; - # alternatively, if the sql df is a lookup and column there match the spec, we - # make sure to take those columns that match. - if set([col_name] + paired_cols).issubset(set(sql_df.columns)): - sql_df = sql_df[[col_name] + paired_cols] - - # rename sql_df columns to be same as original + paired; zip is - # only going to pair up columns up to the shorter list! - sql_df.rename( - columns=dict(zip( - sql_df.columns, - [col_name] + paired_cols - )), - inplace=True - ) - - #2) GENERATE ANONYMISED ROWS - if complete: - anon_df = sql_df.drop(columns="probability_vector", errors="ignore") - else: - if "probability_vector" in sql_df.columns: - probs = sql_df["probability_vector"].astype(float).values - probs = probs / probs.sum() - sql_df.drop(columns="probability_vector", inplace=True) - idx = self.rng.choice(a=len(sql_df), p=probs, size=self.num_rows) - else: - idx = self.rng.choice(len(sql_df), self.num_rows) - - anon_list = [sql_df.iloc[x, :].values for x in idx] - anon_df = pd.DataFrame(columns=sql_df.columns, data=anon_list) - - #3) HANDLE MISSING PAIRED COLUMNS IN SQL - # if the column has paired columns and a non-random anonymising set, - # the anonymising set must also provide the paired columns or the same - # values will be used for the original + paired columns - missing_paired_cols = set(paired_cols) - set(sql_df.columns[1:]) - - if missing_paired_cols: - missing_df = pd.DataFrame( - data=zip(*[anon_df[col_name]] * len(missing_paired_cols)), - # sets are no longer allowed as column names - columns=list(missing_paired_cols) - ) - - anon_df = pd.concat([anon_df, missing_df], axis=1) - - return shuffle_data(anon_df) - - def _generate_complete_series(self, col_name): - ''' - This function doesn't take num_rows argument because - we are always generating the full number of rows - for this column as specified in the spec. - - Function path depends on the column type: date or categorical - - Returns - ------- - pd.Series for non-paired columns and pd.DataFrame for pairs - - For now, the function doesn't support columns where values are - stored in the DB because the number of their uniques exceeds - category threshold or if they are anonymised using a set from DB. - ''' - - col_attrs = self.spec_dict["columns"][col_name] - - if col_attrs["type"] == "date": - - return self._generate_timeseries(col_name, complete=True) - - # if paired column, skip, and add pairs as part of parent column's processing - if col_name in self.paired_cols: - return None - - # if column has paired columns, return a dataframe with it + paired cols - paired_cols = col_attrs["paired_columns"] - - # all cat. 
columns have a missing data placeholder as -1 row so we exclude it - if paired_cols: - paired_complete_df = ( - col_attrs["original_values"].iloc[:-1, 0:len(paired_cols)+1]) - paired_complete_df.rename( - columns=lambda x: x.replace("paired_", ""), inplace=True) - - return paired_complete_df - - return pd.Series(col_attrs["original_values"].iloc[:-1, 0], name=col_name) - - def _get_column_types(self): - ''' - Convenience function to categorise columns into 4 types: - - nested linked columns (generated separately as part of linkage.py) - - complete columns - all values are used - - columns where original values are paired with a "main" column - - All of the above are treated in a special way either in a separate - generation routine (like linked columns) or are generated as a - by-product of another routine (like paired columns). Columns that remain, - are generated in a "normal" way as part of this module. - - Returns - ------- - namedtuple("Columns", ["all", "complete", "paired", "skipped"]) - ''' - - Columns = namedtuple("Columns", ["all", "complete", "paired", "skipped"]) - - # there might be cases when you want to generate just the date columns or just - # the categorical columns so they might be missing from the metadata section - all_cols = ( - (self.spec_dict["metadata"].get("categorical_columns", [])) + - (self.spec_dict["metadata"].get("date_columns", [])) - ) - - nested_linked_cols = [ - sublist for n, sublist in (self.spec_dict.get("linked_columns") or []) - ] - - complete_cols = [c for c, v in get_attr_values( - self.spec_dict, - "cross_join_all_unique_values", - col_names=True, - types=["categorical", "date"]) if v] - - list_of_orig_val_tuples = get_attr_values( - self.spec_dict, - "original_values", - col_names=True, - types=["categorical", "date"]) - - paired_cols = [ - k for k, v in list_of_orig_val_tuples if str(v) == ORIGINAL_VALUES_PAIRED] - - skipped_cols = ( - list(chain.from_iterable(nested_linked_cols)) + - complete_cols + - paired_cols - ) - - column_types = Columns(all_cols, complete_cols, paired_cols, skipped_cols) - - return column_types - - def _generate_using_external_table(self, col_name, anon_set): - ''' - We assume that the aliased column is the one you want to pick the values from - and the rest of the columns in the select statement are going to be the join - keys. - ''' - - parser = Parser(anon_set) - sql_tables = parser.tables - aliased_columns = parser.columns_aliases_names - source_table_id = self.spec_dict["metadata"]["id"] - - if len(aliased_columns) != 1 or aliased_columns[0] != col_name: - raise RuntimeError( - f"Please make sure the SQL SELECT statement in {col_name}'s " - f"anonymising_set includes exactly one aliased column named {col_name}." - ) - - # "join" columns are all non-aliased columns from the source table - # "join" here refers to joining back the data from the SQL statment to the - # original source data, not any join columns that are part of the JOIN section - # of SQL proper. 
- - join_columns = [] - for qualified_column in parser.columns_dict["select"]: - table, column = qualified_column.split(".") - if table == f"temp_{source_table_id}" and column != col_name: - join_columns.append(column) - - # "source" table aka existing table is always put into exhibit DB, but if - # SQL is trying to reference an external table, we should check if it exists - ext_tables = [ - t for t in sql_tables if t not in ["temp_original_values", f"temp_{source_table_id}"] - ] - - # check the "external" table is in exhibit.db - for ext_table in ext_tables: - if not check_table_exists(ext_table): - raise RuntimeError( - f"Please make sure that {ext_table} used in the anonymising_set SQL" - f" for column {col_name} exists in the Exhibit database." - ) - - # insert the dataframe generated so far into the DB; we make sure to drop - # duplicates in case user didn't specify DISTINCT in his SQL query; - # the anon_df would typically be from UUIDs that are generated before - # categorical columns. - - # self.anon_df is what is generated BEFORE categorical columns, e.g UUID columns - if self.anon_df is None or self.anon_df.empty: - # self.generated_dfs has cat. columns generated BEFORE this particular column - if not self.generated_dfs: #pragma: no cover - existing_data = pd.DataFrame() - else: - existing_data = pd.concat(self.generated_dfs, axis=1) - else: - existing_data = pd.concat(self.generated_dfs + [self.anon_df], axis=1) - - # for convenience, we can reference original_values as a table - this could be - # original_values as they appear in the SPEC or in the SQL (not implemented yet) - if "temp_original_values" in sql_tables: - ov_df = self.spec_dict["columns"][col_name]["original_values"][[col_name]] - create_temp_table( - table_name="temp_original_values", - col_names=[col_name], - data=ov_df - ) - - # ensure the data going into DB is processed identically for join keys - for col in join_columns: - if is_numeric_dtype(existing_data[col]): - existing_data[col] = existing_data[col].astype(float) - elif is_datetime64_dtype(existing_data[col]): - existing_data[col] = existing_data[col].dt.strftime("%Y-%m-%d") - else: - existing_data[col] = existing_data[col].astype(str).str.strip() - - # dropping duplicates is a filter operation (even though it returns new data) - # unless we make an explicit copy of the de-duplicated dataframe, Pandas will - # trigger SettingWithCopy warning when trying to change any values. - existing_data_distinct = existing_data.drop_duplicates(subset=join_columns).copy() - existing_data_cols = list(existing_data.columns) - - # this function converts list of tuples into a dataframe anyway - create_temp_table( - table_name=f"temp_{source_table_id}", - col_names=existing_data_cols, - data=existing_data_distinct - ) - - # run the SQL from anon_set; note that the type of SQL query we'll likely see - # will be a cross-join (e.g. dates) so any speed optimisations would be welcome - result = execute_sql(anon_set) - - # create the dataframe with SQL data - sql_df = pd.DataFrame(data=result, columns=join_columns + aliased_columns) - - # ensure that the column of interest (the one we're potentially matching to original - # values) is typed to string - and not datetime or int, coming out of SQL. We will - # convert to datetime at the end, if that's what the type in the spec is. - sql_df[col_name] = sql_df[col_name].astype("str") - - # get the probabilities for the selected column in the external table - # at the level of the join key - use a hash for the combination of columns! 
- - # Rather than use existing probabilities from the spec, treat them as a weight - # and apply them to the conditional, per-join key probabilities from external - # table. - probas = {} - orig_vals = None - - try: - orig_vals = self.spec_dict["columns"][col_name]["original_values"] - if isinstance(orig_vals, pd.DataFrame): - orig_vals = orig_vals.set_index(col_name) - # if we don't have original_values in the column spec, it's a date - except KeyError: - pass - - groups = sql_df.groupby(join_columns) - for i, group in groups: - - total_count = len(group) - proba_arr = (group - .value_counts() - .apply(lambda x: 0 if x == 0 else max(0.001, x / total_count)) - .reset_index(level=col_name) - .to_numpy(dtype="str") - ) - a, p = np.split(proba_arr, 2, axis=1) - a = a.flatten() - p = p.flatten().astype(float) - - if orig_vals is not None: - for j, val in enumerate(a): - if val in orig_vals.index: - p_weight = float(orig_vals.loc[val, "probability_vector"]) - p[j] = p[j] * p_weight - - # enusre p sums up to 1 - p = p * (1 / sum(p)) - probas[i[0]] = (a, p) - - # take the data generated so far and generate appropriate values based on key - groups = existing_data.groupby(join_columns).groups - temp_result = [] - - for group_key, group_index in groups.items(): - # if the key is missing, then the SQL filtered out the data for that key - # having a COALESCE in SQL would fix it, but in case it's also missing, - # we try to catch this edge case in code as well. - try: - new_data = self.rng.choice( - a=probas[group_key][0], p=probas[group_key][1], size=len(group_index)) - except KeyError: #pragma: no cover - new_data = [np.nan] * len(group_index) - - temp_result.append(pd.Series(data=new_data, index=group_index, name=col_name)) - - final_result = pd.concat(temp_result) - - # ensure we return the correct type for date columns - col_type = self.spec_dict["columns"][col_name]["type"] - if col_type == "date": - final_result = final_result.astype("datetime64[ns]") - - return final_result - - def _generate_using_custom_function(self, col_name, anon_set): - ''' - _summary_ - - Parameters - ---------- - col_name : _type_ - _description_ - anon_set : _type_ - _description_ - ''' - # self.anon_df is what is generated BEFORE categorical columns, e.g UUID columns - if self.anon_df is None or self.anon_df.empty: - # self.generated_dfs has cat. 
columns generated BEFORE this particular column - if not self.generated_dfs: - existing_data = pd.DataFrame() - else: - existing_data = pd.concat(self.generated_dfs, axis=1) - else: #pragma: no cover - existing_data = pd.concat(self.generated_dfs + [self.anon_df], axis=1) - - if existing_data.empty: - result = pd.Series( - data=[anon_set(pd.Series) for _ in range(self.num_rows)], - name=col_name - ) - return result - - result = existing_data.apply(anon_set, axis=1) - result.name = col_name - - return result +''' +Methods to generate categorical columns / values +''' + +# Standard library imports +from collections import namedtuple +from itertools import chain +import warnings + +# External library imports +import pandas as pd +import numpy as np +from sql_metadata import Parser +from pandas.api.types import is_numeric_dtype, is_datetime64_dtype + +# Exhibit imports +from ..constants import ORIGINAL_VALUES_REGEX, ORIGINAL_VALUES_PAIRED +from ..utils import get_attr_values, shuffle_data +from ..sql import query_exhibit_database, check_table_exists, execute_sql, create_temp_table +from ..linkage.hierarchical import generate_linked_anon_df +from ..linkage.matrix import generate_user_linked_anon_df +from .regex import generate_regex_column + +# EXPORTABLE METHODS +# ================== +class CategoricalDataGenerator: + ''' + Although this class is pretty bare, it still helps avoid passing + the same variables through functions and also mirrors the setup + for generation of linked data. + + One area that potentially needs looking at is if the user makes + manual changes to column values that were initially put into SQL + (where uniques > inline_limit) - for now, this works only for linked data. + ''' + + def __init__(self, spec_dict, core_rows, anon_df=None): + ''' + This class is covering the entire spec_dict as far as the + generation of non-numerical data is concerned. + ''' + + self.spec_dict = spec_dict + self.rng = spec_dict["_rng"] + self.num_rows = core_rows + self.fixed_anon_sets = ["random", "mountains", "patients", "birds", "dates"] + # we need UUID dataset (if it exists) for possible conditional SQL that + # references already-generated columns in the spec + self.generated_dfs = [] + self.anon_df = anon_df + + (self.all_cols, + self.complete_cols, + self.paired_cols, + self.skipped_cols) = self._get_column_types() + + def generate(self): + ''' + Brings together all the components of non-numerical data generation. 
+ + Returns + ------- + A dataframe with all categorical columns + ''' + + #1) GENERATE LINKED DFs FROM EACH LINKED COLUMNS GROUP + for linked_group in (self.spec_dict.get("linked_columns") or []): + + # zero-numbered linked group is reserved for user-defined groupings + if linked_group[0] == 0: + + u_linked_df = generate_user_linked_anon_df( + spec_dict=self.spec_dict, + linked_cols=linked_group[1], + num_rows=self.num_rows + ) + + self.generated_dfs.append(u_linked_df) + + else: + + linked_df = generate_linked_anon_df( + spec_dict=self.spec_dict, + linked_group=linked_group, + num_rows=self.num_rows) + + self.generated_dfs.append(linked_df) + + #2) GENERATE NON-LINKED DFs + for col in [col for col in self.all_cols if col not in self.skipped_cols]: + s = self._generate_anon_series(col) + self.generated_dfs.append(s) + + #3) CONCAT GENERATED DFs AND SERIES + temp_anon_df = pd.concat(self.generated_dfs, axis=1) + + #4) GENERATE SERIES WITH "COMPLETE", CROSS-JOINED COLUMNS + complete_series = [] + + # Complete series can sort the data again + for col in self.complete_cols: + s = self._generate_complete_series(col) + #paired columns return None + if not s is None: + complete_series.append(s) + + #5) OUTER JOIN + temp_anon_df["key"] = 1 + + for s in complete_series: + + temp_anon_df = pd.merge( + temp_anon_df, + pd.DataFrame(s).assign(key=1), + how="outer", + on="key" + ) + + #6) TIDY UP + anon_df = temp_anon_df.drop("key", axis=1) + + return anon_df + + def _generate_timeseries(self, col_name, complete=False): + ''' + Basic generator of randomised / complete timeseries data + + Parameters: + ---------- + col_name : str + time column to generate (type checks are made upstream) + complete : boolean + if timeseries is meant to be "complete", return full series + without picking N=num_rows random values from the pool + + Returns: + -------- + pd.Series + ''' + + # see which date parameters we have access to + start = self.spec_dict["columns"][col_name].get("from", None) + end = self.spec_dict["columns"][col_name].get("to", None) + + # frequency and periods are always required + freq = self.spec_dict["columns"][col_name]["frequency"] + periods = self.spec_dict["columns"][col_name]["uniques"] + + # if we have both start and end, we generate all values in-between and pick the + # dates at random to match the number of periods, without repeats + if start is not None and end is not None: + + all_pos_dates = pd.date_range(start=start, end=end, freq=freq) + # when the number of requested periods is greater than the total possible + # range between from and to, given the frequency, we issue a warning, then + # omit the date_to and generate N=periods unique dates from date_from. + if len(all_pos_dates) < periods: + warnings.warn( + f"The number of unique dates at frequency {freq} between {start} " + f"and {end} is smaller than the number of requested periods" + f"({periods}). 
The date_to parameter will be ignored.", + RuntimeWarning + ) + all_pos_dates = pd.date_range(start=start, periods=periods, freq=freq) + + all_pos_dates = self.rng.choice(all_pos_dates, periods, replace=False) + + else: + # one of the start / end is None + all_pos_dates = pd.date_range( + start=start, end=end, periods=periods, freq=freq) + + if complete: + return pd.Series(all_pos_dates, name=col_name) + + random_dates = self.rng.choice(all_pos_dates, self.num_rows) + + return shuffle_data(pd.Series(random_dates, name=col_name)) + + def _generate_anon_series(self, col_name): + ''' + Generate basic categorical series anonymised according to user input. + + Note that in all cases except external tables, the final series is shuffled + and index reset. Series generated from external tables are an exception because + their values are linked to columns that have already been generated. + + The code can take different paths depending on these things: + - whether a the anonymising method is set to random or a custom set + - whether the number of unique values exceeds the threshold + - whether the column has any paired columns + + The paths differ primarily in terms of where the data sits: as part + of the spec in original_values or in exhibit DB. + + Things are further complicated if users want to use a single column + from an anonymising table, like mountains.peak + + Parameters: + ----------- + col_name : str + column name to process & anonymise + + Returns: + ------- + Pandas Series object or a Dataframe + ''' + + col_attrs = self.spec_dict["columns"][col_name] + col_type = col_attrs["type"] + + # capture categorical-only information, with fallback for date columns + paired_cols = col_attrs.get("paired_columns", None) + orig_vals = col_attrs.get("original_values", None) + target_uniques = col_attrs.get("uniques", None) + + # typically, only categorical columns will have an anonymising set, but time + # columns can use it for SQL to pull conditional values from external table + # ignoring the standard date genderation parameters, like from / to. 
+ anon_set = col_attrs.get("anonymising_set", None) + + # Users can pass custom functions to generate categorical / date columns + if callable(anon_set): + return self._generate_using_custom_function(col_name, anon_set) + + # check if the anonymising set is a SQL statement starting with SELECT + # note that for dates, all other parameters, like from / to will be ignored + if anon_set is not None and anon_set.strip().upper()[:6] == "SELECT": + return self._generate_using_external_table(col_name, anon_set) + + # normal date columns generated using from / to / number of uniques + if col_type == "date": + return self._generate_timeseries(col_name, complete=False) + + # generate values based on a regular expression specified in the anonymising_set + if isinstance(orig_vals, str) and orig_vals == ORIGINAL_VALUES_REGEX: + return generate_regex_column( + anon_set, col_name, self.num_rows, target_uniques) + + # values were stored in SQL; randomise based on uniform distribution + if col_attrs["uniques"] > self.spec_dict["metadata"]["inline_limit"]: + return self._generate_from_sql(col_name, col_attrs) + + # we have access to original_values and the paths are dependant on anon_set + # take every row except last which is reserved for Missing data + col_df = col_attrs["original_values"].iloc[:-1, :] + col_prob = np.array(col_df["probability_vector"]).astype(float) + + if col_prob.sum() != 1: + col_prob /= col_prob.sum() + + if anon_set == "random": + + col_values = col_df[col_name].to_list() + + original_series = pd.Series( + data=self.rng.choice(a=col_values, size=self.num_rows, p=col_prob), + name=col_name) + + if paired_cols: + paired_df = ( + col_df[[col_name] + [f"paired_{x}" for x in paired_cols]] + .rename(columns=lambda x: x.replace("paired_", "")) + ) + + return shuffle_data( + pd.merge(original_series, paired_df, how="left", on=col_name)) + + return shuffle_data(original_series) + + # finally, if we have original_values, but anon_set is not random + # we pick the N distinct values from the anonymysing set, replace + # the original values + paired column values in the original_values + # DATAFRAME, making sure the changes happen in-place which means + # that downstream, the weights table will be built based on the + # modified "original_values" dataframe. + + sql_df = self._generate_from_sql(col_name, col_attrs, complete=True) + + # includes Missing data row as opposed to col_df which doesn't + orig_df = col_attrs["original_values"] + + # missing data is the last row + repl = sql_df[col_name].unique() + aliases = dict(zip(orig_df[col_name].values[:-1], repl)) + aliased_df = orig_df.map(lambda x: aliases.get(x, x)) + self.spec_dict["columns"][col_name]["original_values"] = aliased_df + + # we ignore Missing data probability when we originally create the variable + idx = self.rng.choice(a=len(sql_df), p=col_prob, size=self.num_rows) + anon_list = [sql_df.iloc[x, :].values for x in idx] + anon_df = pd.DataFrame(columns=sql_df.columns, data=anon_list) + + return shuffle_data(anon_df) + + def _generate_from_sql(self, col_name, col_attrs, complete=False, db_path=None): + ''' + Whatever the anonymising method, if a column has more unique values than + allowed by the inline_limit parameter, it will be put into SQLite3 db. 
+ ''' + + anon_set = col_attrs["anonymising_set"] + uniques = col_attrs["uniques"] + paired_cols = col_attrs["paired_columns"] or [] + + #1) QUERY SQL TO GET VALUES USED TO BUILD THE DATAFRAME + if anon_set == "random": + + safe_col_name = col_name.replace(" ", "$") + table_name = f"temp_{self.spec_dict['metadata']['id']}_{safe_col_name}" + sql_df = query_exhibit_database( + table_name, exclude_missing=True, db_path=db_path) + + else: + table_name, *sql_column = anon_set.split(".") + sql_df = query_exhibit_database(table_name, sql_column, uniques) + + # if sql df is an anonymising set with different column names, like mountaints, + # we want to rename them to the actual column names used in the spec; + # alternatively, if the sql df is a lookup and column there match the spec, we + # make sure to take those columns that match. + if set([col_name] + paired_cols).issubset(set(sql_df.columns)): + sql_df = sql_df[[col_name] + paired_cols] + + # rename sql_df columns to be same as original + paired; zip is + # only going to pair up columns up to the shorter list! + sql_df.rename( + columns=dict(zip( + sql_df.columns, + [col_name] + paired_cols + )), + inplace=True + ) + + #2) GENERATE ANONYMISED ROWS + if complete: + anon_df = sql_df.drop(columns="probability_vector", errors="ignore") + else: + if "probability_vector" in sql_df.columns: + probs = sql_df["probability_vector"].astype(float).values + probs = probs / probs.sum() + sql_df.drop(columns="probability_vector", inplace=True) + idx = self.rng.choice(a=len(sql_df), p=probs, size=self.num_rows) + else: + idx = self.rng.choice(len(sql_df), self.num_rows) + + anon_list = [sql_df.iloc[x, :].values for x in idx] + anon_df = pd.DataFrame(columns=sql_df.columns, data=anon_list) + + #3) HANDLE MISSING PAIRED COLUMNS IN SQL + # if the column has paired columns and a non-random anonymising set, + # the anonymising set must also provide the paired columns or the same + # values will be used for the original + paired columns + missing_paired_cols = set(paired_cols) - set(sql_df.columns[1:]) + + if missing_paired_cols: + missing_df = pd.DataFrame( + data=zip(*[anon_df[col_name]] * len(missing_paired_cols)), + # sets are no longer allowed as column names + columns=list(missing_paired_cols) + ) + + anon_df = pd.concat([anon_df, missing_df], axis=1) + + return shuffle_data(anon_df) + + def _generate_complete_series(self, col_name): + ''' + This function doesn't take num_rows argument because + we are always generating the full number of rows + for this column as specified in the spec. + + Function path depends on the column type: date or categorical + + Returns + ------- + pd.Series for non-paired columns and pd.DataFrame for pairs + + For now, the function doesn't support columns where values are + stored in the DB because the number of their uniques exceeds + category threshold or if they are anonymised using a set from DB. + ''' + + col_attrs = self.spec_dict["columns"][col_name] + + if col_attrs["type"] == "date": + + return self._generate_timeseries(col_name, complete=True) + + # if paired column, skip, and add pairs as part of parent column's processing + if col_name in self.paired_cols: + return None + + # if column has paired columns, return a dataframe with it + paired cols + paired_cols = col_attrs["paired_columns"] + + # all cat. 
columns have a missing data placeholder as -1 row so we exclude it + if paired_cols: + paired_complete_df = ( + col_attrs["original_values"].iloc[:-1, 0:len(paired_cols)+1]) + paired_complete_df.rename( + columns=lambda x: x.replace("paired_", ""), inplace=True) + + return paired_complete_df + + return pd.Series(col_attrs["original_values"].iloc[:-1, 0], name=col_name) + + def _get_column_types(self): + ''' + Convenience function to categorise columns into 4 types: + - nested linked columns (generated separately as part of linkage.py) + - complete columns - all values are used + - columns where original values are paired with a "main" column + + All of the above are treated in a special way either in a separate + generation routine (like linked columns) or are generated as a + by-product of another routine (like paired columns). Columns that remain, + are generated in a "normal" way as part of this module. + + Returns + ------- + namedtuple("Columns", ["all", "complete", "paired", "skipped"]) + ''' + + Columns = namedtuple("Columns", ["all", "complete", "paired", "skipped"]) + + # there might be cases when you want to generate just the date columns or just + # the categorical columns so they might be missing from the metadata section + all_cols = ( + (self.spec_dict["metadata"].get("categorical_columns", [])) + + (self.spec_dict["metadata"].get("date_columns", [])) + ) + + nested_linked_cols = [ + sublist for n, sublist in (self.spec_dict.get("linked_columns") or []) + ] + + complete_cols = [c for c, v in get_attr_values( + self.spec_dict, + "cross_join_all_unique_values", + col_names=True, + types=["categorical", "date"]) if v] + + list_of_orig_val_tuples = get_attr_values( + self.spec_dict, + "original_values", + col_names=True, + types=["categorical", "date"]) + + paired_cols = [ + k for k, v in list_of_orig_val_tuples if str(v) == ORIGINAL_VALUES_PAIRED] + + skipped_cols = ( + list(chain.from_iterable(nested_linked_cols)) + + complete_cols + + paired_cols + ) + + column_types = Columns(all_cols, complete_cols, paired_cols, skipped_cols) + + return column_types + + def _generate_using_external_table(self, col_name, anon_set): + ''' + We assume that the aliased column is the one you want to pick the values from + and the rest of the columns in the select statement are going to be the join + keys. + ''' + + parser = Parser(anon_set) + sql_tables = parser.tables + aliased_columns = parser.columns_aliases_names + source_table_id = self.spec_dict["metadata"]["id"] + + if len(aliased_columns) != 1 or aliased_columns[0] != col_name: + raise RuntimeError( + f"Please make sure the SQL SELECT statement in {col_name}'s " + f"anonymising_set includes exactly one aliased column named {col_name}." + ) + + # "join" columns are all non-aliased columns from the source table + # "join" here refers to joining back the data from the SQL statment to the + # original source data, not any join columns that are part of the JOIN section + # of SQL proper. 
+ + join_columns = [] + for qualified_column in parser.columns_dict["select"]: + table, column = qualified_column.split(".") + if table == f"temp_{source_table_id}" and column != col_name: + join_columns.append(column) + + # "source" table aka existing table is always put into exhibit DB, but if + # SQL is trying to reference an external table, we should check if it exists + ext_tables = [ + t for t in sql_tables if t not in ["temp_original_values", f"temp_{source_table_id}"] + ] + + # check the "external" table is in exhibit.db + for ext_table in ext_tables: + if not check_table_exists(ext_table): + raise RuntimeError( + f"Please make sure that {ext_table} used in the anonymising_set SQL" + f" for column {col_name} exists in the Exhibit database." + ) + + # insert the dataframe generated so far into the DB; we make sure to drop + # duplicates in case user didn't specify DISTINCT in his SQL query; + # the anon_df would typically be from UUIDs that are generated before + # categorical columns. + + # self.anon_df is what is generated BEFORE categorical columns, e.g UUID columns + if self.anon_df is None or self.anon_df.empty: + # self.generated_dfs has cat. columns generated BEFORE this particular column + if not self.generated_dfs: #pragma: no cover + existing_data = pd.DataFrame() + else: + existing_data = pd.concat(self.generated_dfs, axis=1) + else: + existing_data = pd.concat(self.generated_dfs + [self.anon_df], axis=1) + + # for convenience, we can reference original_values as a table - this could be + # original_values as they appear in the SPEC or in the SQL (not implemented yet) + if "temp_original_values" in sql_tables: + ov_df = self.spec_dict["columns"][col_name]["original_values"][[col_name]] + create_temp_table( + table_name="temp_original_values", + col_names=[col_name], + data=ov_df + ) + + # ensure the data going into DB is processed identically for join keys + for col in join_columns: + if is_numeric_dtype(existing_data[col]): + existing_data[col] = existing_data[col].astype(float) + elif is_datetime64_dtype(existing_data[col]): + existing_data[col] = existing_data[col].dt.strftime("%Y-%m-%d") + else: + existing_data[col] = existing_data[col].astype(str).str.strip() + + # dropping duplicates is a filter operation (even though it returns new data) + # unless we make an explicit copy of the de-duplicated dataframe, Pandas will + # trigger SettingWithCopy warning when trying to change any values. + existing_data_distinct = existing_data.drop_duplicates(subset=join_columns).copy() + existing_data_cols = list(existing_data.columns) + + # this function converts list of tuples into a dataframe anyway + create_temp_table( + table_name=f"temp_{source_table_id}", + col_names=existing_data_cols, + data=existing_data_distinct + ) + + # run the SQL from anon_set; note that the type of SQL query we'll likely see + # will be a cross-join (e.g. dates) so any speed optimisations would be welcome + result = execute_sql(anon_set) + + # create the dataframe with SQL data + sql_df = pd.DataFrame(data=result, columns=join_columns + aliased_columns) + + # ensure that the column of interest (the one we're potentially matching to original + # values) is typed to string - and not datetime or int, coming out of SQL. We will + # convert to datetime at the end, if that's what the type in the spec is. + sql_df[col_name] = sql_df[col_name].astype("str") + + # get the probabilities for the selected column in the external table + # at the level of the join key - use a hash for the combination of columns! 
+ + # Rather than use existing probabilities from the spec, treat them as a weight + # and apply them to the conditional, per-join key probabilities from external + # table. + probas = {} + orig_vals = None + + try: + orig_vals = self.spec_dict["columns"][col_name]["original_values"] + if isinstance(orig_vals, pd.DataFrame): + orig_vals = orig_vals.set_index(col_name) + # if we don't have original_values in the column spec, it's a date + except KeyError: + pass + + groups = sql_df.groupby(join_columns) + for i, group in groups: + + total_count = len(group) + proba_arr = (group + .value_counts() + .apply(lambda x: 0 if x == 0 else max(0.001, x / total_count)) + .reset_index(level=col_name) + .to_numpy(dtype="str") + ) + a, p = np.split(proba_arr, 2, axis=1) + a = a.flatten() + p = p.flatten().astype(float) + + if orig_vals is not None: + for j, val in enumerate(a): + if val in orig_vals.index: + p_weight = float(orig_vals.loc[val, "probability_vector"]) + p[j] = p[j] * p_weight + + # enusre p sums up to 1 + p = p * (1 / sum(p)) + probas[i[0]] = (a, p) + + # take the data generated so far and generate appropriate values based on key + groups = existing_data.groupby(join_columns).groups + temp_result = [] + + for group_key, group_index in groups.items(): + # if the key is missing, then the SQL filtered out the data for that key + # having a COALESCE in SQL would fix it, but in case it's also missing, + # we try to catch this edge case in code as well. + try: + new_data = self.rng.choice( + a=probas[group_key][0], p=probas[group_key][1], size=len(group_index)) + except KeyError: #pragma: no cover + new_data = [np.nan] * len(group_index) + + temp_result.append(pd.Series(data=new_data, index=group_index, name=col_name)) + + final_result = pd.concat(temp_result) + + # ensure we return the correct type for date columns + col_type = self.spec_dict["columns"][col_name]["type"] + if col_type == "date": + final_result = final_result.astype("datetime64[ns]") + + return final_result + + def _generate_using_custom_function(self, col_name, anon_set): + ''' + _summary_ + + Parameters + ---------- + col_name : _type_ + _description_ + anon_set : _type_ + _description_ + ''' + # self.anon_df is what is generated BEFORE categorical columns, e.g UUID columns + if self.anon_df is None or self.anon_df.empty: + # self.generated_dfs has cat. 
columns generated BEFORE this particular column + if not self.generated_dfs: + existing_data = pd.DataFrame() + else: + existing_data = pd.concat(self.generated_dfs, axis=1) + else: #pragma: no cover + existing_data = pd.concat(self.generated_dfs + [self.anon_df], axis=1) + + if existing_data.empty: + result = pd.Series( + data=[anon_set(pd.Series) for _ in range(self.num_rows)], + name=col_name + ) + return result + + result = existing_data.apply(anon_set, axis=1) + result.name = col_name + + return result diff --git a/exhibit/core/generate/missing.py b/exhibit/core/generate/missing.py index 76eb76c..8575c50 100644 --- a/exhibit/core/generate/missing.py +++ b/exhibit/core/generate/missing.py @@ -1,345 +1,345 @@ -''' -Methods to generate / deal with missing data -''' - -# Standard library imports -from itertools import groupby - -# External library imports -import numpy as np -import pandas as pd - -# Exhibit -from ..constants import MISSING_DATA_STR -from ..constraints import clean_up_constraint_string, get_constraint_mask -from ..utils import get_attr_values -from .continuous import generate_cont_val, scale_continuous_column - -# EXPORTABLE METHODS & CLASSES -# ============================ - -class MissingDataGenerator: - ''' - The class will copy the nearly complete anonimised dataframe - which has implications on the RAM footprint of the package - ''' - - def __init__(self, spec_dict, data): - ''' - Doc string - ''' - - self.spec_dict = spec_dict - self.data = data - self.dtypes = data.dtypes - self.nan_data = data - self.wt = spec_dict.get("weights_table", None) - - # only copy the data if there are conditional constraints meaning - # we can't be sure the required columns HADN'T HAD data already made - # missing in an earlier step. - if spec_dict["constraints"]["custom_constraints"]: - self.nan_data = data.copy() - - def add_missing_data(self): - ''' - Returns the original data, modified in place to include nan values - - Since Missing data (categorical) has its own weights, if we're adding - any Missing data to the dataframe, we must re-generate the contunious - variables to make sure we use the Missing data weights and not the original. - - We also need to re-scale each continuous column where we either added a nan - or where the categorical columns had Missing data added to them. - - 1) Find cells to exclude - there can't be nans in them - 2) Find linked and paired columns - nulls are propagated from the root column - 3) Add nulls to the remaining columns, always mindful of the indices from 1) - ''' - - missing_link_cols = self._find_columns_with_linked_missing_data() - geospatial_cols = [c for c, _ in get_attr_values( - self.spec_dict, "type", col_names=True, types=["geospatial"])] - - standalone_cols = ( - set(self.spec_dict["columns"].keys()) - - {col for col_set in missing_link_cols for col in col_set} - - set(self.spec_dict.get("derived_columns", {}).keys()) - - set(self.spec_dict["metadata"].get("uuid_columns", set()) or set()) - - set(geospatial_cols) - ) - - #1) Generate nulls in standalone columns, including continuous - # make sure to change the seed for each standalone column to avoid creating - # relationships where NA in Column A is also NA is column B if both have the - # same miss_probability. 
- for i, col_name in enumerate(sorted(standalone_cols)): - - # reset the generator for each column - rng = np.random.default_rng(seed=i) - - # special case for user linked columns which can have "Missing data" already - # if it appeared in the source for the linkage along with its probability. - # hierarchical linkage is not affected because having multiple NAN CAs for - # different HBs, for example, means the linkage is no longer hierarchical - # and doesn't map 1 to many and is instead many to many. - - if any(self.nan_data[col_name] == MISSING_DATA_STR): - self.nan_data[col_name] = ( - self.nan_data[col_name].map( - lambda x: pd.NA if x == MISSING_DATA_STR else x)) - continue - - miss_pct = self.spec_dict["columns"][col_name]["miss_probability"] - rands = rng.random(size=self.nan_data.shape[0]) # pylint: disable=no-member - col_type = self.spec_dict["columns"][col_name]["type"] - miss_value = pd.NaT if col_type == "date" else np.NaN - repl_column = self.nan_data[col_name] - - # numpy default type detection messes up date columns in Pandas - if col_type == "date": - repl_column = np.array(self.nan_data[col_name], dtype=object) - - self.nan_data[col_name] = np.where( - rands < miss_pct, - miss_value, repl_column) - - if col_type == "continuous": - precision = self.spec_dict["columns"][col_name].get("precision", None) - if precision == "integer": - self.nan_data[col_name] = ( - self.nan_data[col_name].astype("float").round().astype("Int64")) - - #2) Generate nulls in linked and paired columns - for cols in missing_link_cols: - - # reset the generator for each column (keeping the seed to maintain links) - rng = np.random.default_rng(seed=0) - - # miss probability will be the same for all columns in cols - miss_pct = self.spec_dict["columns"][next(iter(cols))]["miss_probability"] - # rands is shared for all columns in cols - rands = rng.random(size=self.nan_data.shape[0]) # pylint: disable=no-member - - self.nan_data.loc[:, list(cols)] = np.where( - (rands < miss_pct)[..., None], - (np.NaN, ) * len(cols), - self.nan_data.loc[:, list(cols)] - ) - - #3) Generate nulls in geospacial columns (lat / long) - # Similar to linked / paired, keeping the random seed the same between - # lat and long, only changing if more than one column to generate. - - for col in geospatial_cols: - geo_cols = [f"{col}_latitude", f"{col}_longitude"] - rng = np.random.default_rng(seed=0) - miss_pct = self.spec_dict["columns"][col]["miss_probability"] - rands = rng.random(size=self.nan_data.shape[0]) - - self.nan_data.loc[:, geo_cols] = np.where( - (rands < miss_pct)[..., None], - (np.NaN, ) * len(geo_cols), - self.nan_data.loc[:, geo_cols] - ) - - #4) Generate nulls in indices explicitly defined in custom_constraints - make_null_idx = self._find_make_null_idx() - - for idx, col_name in make_null_idx: - self.nan_data.loc[idx, col_name] = np.NaN - - #5) Re-introduce the saved no_nulls rows from the original data - not_null_idx = self._find_not_null_idx() - for idx, col_name in not_null_idx: - self.nan_data.loc[idx, col_name] = self.data.loc[idx, col_name] - - #6) Replace NA with missing data placeholder for categorical columns and - # re-generate continuous variables for those rows according to proper weights - # only go through this step if there are nulls in categorical columns - # and the spec_dict includes numerical columns that would be affected - # otherwise, return early. 
- cat_cols = list(self.spec_dict["metadata"]["categorical_columns"]) - num_cols = ( - set(self.spec_dict["metadata"]["numerical_columns"]) - - set(self.spec_dict.get("derived_columns", {}).keys())) - - if not (any(self.nan_data[cat_cols].isna()) and num_cols): - return self.nan_data.astype(self.dtypes) - - cat_mask = self.nan_data[cat_cols].isna().any(axis=1) - self.nan_data[cat_cols] = self.nan_data[cat_cols].fillna(MISSING_DATA_STR) - - for num_col in num_cols: - - # reset the generator for each column - rng = np.random.default_rng(seed=0) - - # Extract relevant num col variables from the user spec - num_col_dict = self.spec_dict["columns"][num_col] - - dist = num_col_dict["distribution"] - dist_params = num_col_dict["distribution_parameters"] - precision = num_col_dict["precision"] - - # if it's already NA, don't re-generate; it's NA for a reason! - num_mask = self.nan_data[num_col].isna() - mask = cat_mask & ~num_mask - - # it's possible to have the left side be Int64 type and the right side - # to be float64 (newly generated, unscaled); assigning different types - # doesn't work so we'll delay assignment and scale / cast type first! - unscaled_new_series = self.nan_data.loc[mask, cat_cols].apply( - func=generate_cont_val, - axis=1, - weights_table=self.wt, - num_col=num_col, - rng=rng, - dist=dist, - dist_params=dist_params - ) - - # rescale the masked section, but make sure to change target_sum! - # take a copy of the dist_params as full target_sum is used elsewhere - new_dist_params = dist_params.copy() - - if dist_params.get("target_sum", None) is not None: - old_sum = self.nan_data.loc[~mask, num_col].sum() - new_dist_params["target_sum"] = dist_params["target_sum"] - old_sum - - scaled_new_series = scale_continuous_column( - series=unscaled_new_series, - precision=precision, - **new_dist_params - ) - - # for some reason assigning a series back, rather than values - # creates nulls in certain rows, but not others; maybe Pandas bug. - # when the array is empty, Pandas generates a ValueError - if len(scaled_new_series) != 0: - self.nan_data.loc[mask, num_col] = scaled_new_series.values - - # replace Missing data back with np.nan - # since we're applying the function across all columns, including numerical, - # these can contain pd.NA which is a "special" type that will error out if - # trying to evaluate it against a string. Replace with a standard np.NAN. - self.nan_data = self.nan_data.applymap( - lambda x: np.nan if pd.isna(x) or x == MISSING_DATA_STR else x) - - return self.nan_data.astype(self.dtypes) - - def _find_columns_with_linked_missing_data(self): - ''' - Returns a list of column groupings where a missing value in one - means always a missing value in all in the grouping. The requirement - for that is that the missing_probability attribute of the spec is the - same for all such linked / paired columns. 
- - Returns a list with sets of columns - ''' - - result = [] - processed_pairs = set() - miss_probs = get_attr_values( - self.spec_dict, "miss_probability", col_names=True, types="categorical") - - for col, attrs in self.spec_dict["columns"].items(): - - if col in processed_pairs or attrs["type"] != "categorical": - continue - - pairs = set() - - # paired columns first - if attrs["paired_columns"]: - - pairs.update([col] + attrs["paired_columns"]) - - # linked groups - for i, linked_group in (self.spec_dict["linked_columns"] or []): - # zero numbered linked group is reserved for user defined linkage - if i == 0: - continue - - if col in linked_group: - pairs.update(linked_group) - - processed_pairs.update(pairs) - - # check that miss_probabilities are the same for all paired columns - miss_probs = sorted( - miss_probs, key=lambda x, pairs=pairs: x.col_name in pairs) - groups = groupby(miss_probs, lambda x, pairs=pairs: x.col_name in pairs) - - for key, group in groups: - - if key and len({v for k, v in group}) == 1: - - result.append(pairs) - - return result - - - def _find_make_null_idx(self): - ''' - The reason for keeping this and _find_not_null_idx separate is that - they are needed at different points in time - not_null_idx happens AFTER - all other sources of nan-generation have been exhausted and we're using - the data WITH nans to determine indices to pick up real values from the - original data passed in to the generator. - ''' - - cc = self.spec_dict["constraints"]["custom_constraints"] or {} - - make_null_idx = [] - - for _, constraint in cc.items(): - - cc_filter = constraint.get("filter", None) - cc_targets = constraint.get("targets", {}) - clean_cc_filter = clean_up_constraint_string(cc_filter) - cc_mask = get_constraint_mask(self.nan_data, clean_cc_filter) - - for target_str, action_str in cc_targets.items(): - - if "make_null" in action_str: - - target_cols = [x.strip() for x in target_str.split(",")] - - for target in target_cols: - - make_null_idx.append( - (self.nan_data.loc[cc_mask].index, target) - ) - - return make_null_idx - - def _find_not_null_idx(self): - ''' - Doc string - ''' - - cc = self.spec_dict["constraints"]["custom_constraints"] or {} - - not_null_idx = [] - - for _, constraint in cc.items(): - - cc_filter = constraint.get("filter", None) - cc_targets = constraint.get("targets", {}) - clean_cc_filter = clean_up_constraint_string(cc_filter) - cc_mask = get_constraint_mask(self.nan_data, clean_cc_filter) - - for target_str, action_str in cc_targets.items(): - - if "make_not_null" in action_str: - - target_cols = [x.strip() for x in target_str.split(",")] - - for target in target_cols: - - not_null_idx.append( - (self.nan_data.loc[cc_mask].index, target) - ) - - return not_null_idx +''' +Methods to generate / deal with missing data +''' + +# Standard library imports +from itertools import groupby + +# External library imports +import numpy as np +import pandas as pd + +# Exhibit +from ..constants import MISSING_DATA_STR +from ..constraints import clean_up_constraint_string, get_constraint_mask +from ..utils import get_attr_values +from .continuous import generate_cont_val, scale_continuous_column + +# EXPORTABLE METHODS & CLASSES +# ============================ + +class MissingDataGenerator: + ''' + The class will copy the nearly complete anonimised dataframe + which has implications on the RAM footprint of the package + ''' + + def __init__(self, spec_dict, data): + ''' + Doc string + ''' + + self.spec_dict = spec_dict + self.data = data + self.dtypes = 
data.dtypes + self.nan_data = data + self.wt = spec_dict.get("weights_table", None) + + # only copy the data if there are conditional constraints meaning + # we can't be sure the required columns HADN'T HAD data already made + # missing in an earlier step. + if spec_dict["constraints"]["custom_constraints"]: + self.nan_data = data.copy() + + def add_missing_data(self): + ''' + Returns the original data, modified in place to include nan values + + Since Missing data (categorical) has its own weights, if we're adding + any Missing data to the dataframe, we must re-generate the contunious + variables to make sure we use the Missing data weights and not the original. + + We also need to re-scale each continuous column where we either added a nan + or where the categorical columns had Missing data added to them. + + 1) Find cells to exclude - there can't be nans in them + 2) Find linked and paired columns - nulls are propagated from the root column + 3) Add nulls to the remaining columns, always mindful of the indices from 1) + ''' + + missing_link_cols = self._find_columns_with_linked_missing_data() + geospatial_cols = [c for c, _ in get_attr_values( + self.spec_dict, "type", col_names=True, types=["geospatial"])] + + standalone_cols = ( + set(self.spec_dict["columns"].keys()) - + {col for col_set in missing_link_cols for col in col_set} - + set(self.spec_dict.get("derived_columns", {}).keys()) - + set(self.spec_dict["metadata"].get("uuid_columns", set()) or set()) - + set(geospatial_cols) + ) + + #1) Generate nulls in standalone columns, including continuous + # make sure to change the seed for each standalone column to avoid creating + # relationships where NA in Column A is also NA is column B if both have the + # same miss_probability. + for i, col_name in enumerate(sorted(standalone_cols)): + + # reset the generator for each column + rng = np.random.default_rng(seed=i) + + # special case for user linked columns which can have "Missing data" already + # if it appeared in the source for the linkage along with its probability. + # hierarchical linkage is not affected because having multiple NAN CAs for + # different HBs, for example, means the linkage is no longer hierarchical + # and doesn't map 1 to many and is instead many to many. 
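Not part of the patch: a minimal sketch with made-up numbers of why the standalone-column loop above reseeds the generator for every column. Reusing one seed across columns with equal miss_probability would give each column the same NaN mask and quietly link them.

import numpy as np

n_rows, miss_pct = 1000, 0.3

# same seed for two columns -> identical masks -> accidental linkage
shared_a = np.random.default_rng(seed=0).random(n_rows) < miss_pct
shared_b = np.random.default_rng(seed=0).random(n_rows) < miss_pct
print(np.array_equal(shared_a, shared_b))   # True

# a distinct seed per column (as in the loop above) -> independent masks
indep_a = np.random.default_rng(seed=0).random(n_rows) < miss_pct
indep_b = np.random.default_rng(seed=1).random(n_rows) < miss_pct
print(np.array_equal(indep_a, indep_b))     # False in practice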
+ + if any(self.nan_data[col_name] == MISSING_DATA_STR): + self.nan_data[col_name] = ( + self.nan_data[col_name].map( + lambda x: pd.NA if x == MISSING_DATA_STR else x)) + continue + + miss_pct = self.spec_dict["columns"][col_name]["miss_probability"] + rands = rng.random(size=self.nan_data.shape[0]) # pylint: disable=no-member + col_type = self.spec_dict["columns"][col_name]["type"] + miss_value = pd.NaT if col_type == "date" else np.NaN + repl_column = self.nan_data[col_name] + + # numpy default type detection messes up date columns in Pandas + if col_type == "date": + repl_column = np.array(self.nan_data[col_name], dtype=object) + + self.nan_data[col_name] = np.where( + rands < miss_pct, + miss_value, repl_column) + + if col_type == "continuous": + precision = self.spec_dict["columns"][col_name].get("precision", None) + if precision == "integer": + self.nan_data[col_name] = ( + self.nan_data[col_name].astype("float").round().astype("Int64")) + + #2) Generate nulls in linked and paired columns + for cols in missing_link_cols: + + # reset the generator for each column (keeping the seed to maintain links) + rng = np.random.default_rng(seed=0) + + # miss probability will be the same for all columns in cols + miss_pct = self.spec_dict["columns"][next(iter(cols))]["miss_probability"] + # rands is shared for all columns in cols + rands = rng.random(size=self.nan_data.shape[0]) # pylint: disable=no-member + + self.nan_data.loc[:, list(cols)] = np.where( + (rands < miss_pct)[..., None], + (np.NaN, ) * len(cols), + self.nan_data.loc[:, list(cols)] + ) + + #3) Generate nulls in geospacial columns (lat / long) + # Similar to linked / paired, keeping the random seed the same between + # lat and long, only changing if more than one column to generate. + + for col in geospatial_cols: + geo_cols = [f"{col}_latitude", f"{col}_longitude"] + rng = np.random.default_rng(seed=0) + miss_pct = self.spec_dict["columns"][col]["miss_probability"] + rands = rng.random(size=self.nan_data.shape[0]) + + self.nan_data.loc[:, geo_cols] = np.where( + (rands < miss_pct)[..., None], + (np.NaN, ) * len(geo_cols), + self.nan_data.loc[:, geo_cols] + ) + + #4) Generate nulls in indices explicitly defined in custom_constraints + make_null_idx = self._find_make_null_idx() + + for idx, col_name in make_null_idx: + self.nan_data.loc[idx, col_name] = np.NaN + + #5) Re-introduce the saved no_nulls rows from the original data + not_null_idx = self._find_not_null_idx() + for idx, col_name in not_null_idx: + self.nan_data.loc[idx, col_name] = self.data.loc[idx, col_name] + + #6) Replace NA with missing data placeholder for categorical columns and + # re-generate continuous variables for those rows according to proper weights + # only go through this step if there are nulls in categorical columns + # and the spec_dict includes numerical columns that would be affected + # otherwise, return early. 
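The step-6 code that follows regenerates continuous values only for rows that gained Missing data and rescales just that slice, so the column still meets the overall target_sum. A toy illustration of the bookkeeping with made-up numbers; the actual scaling is delegated to scale_continuous_column.

target_sum = 200
untouched_sum = 150                     # sum over the rows kept as-is (~mask)
remaining = target_sum - untouched_sum  # what the regenerated slice must add up to

unscaled = [2.0, 3.0, 5.0]              # freshly generated, unscaled values
scale = remaining / sum(unscaled)
scaled = [round(v * scale) for v in unscaled]
print(scaled, sum(scaled))              # [10, 15, 25] 50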
+ cat_cols = list(self.spec_dict["metadata"]["categorical_columns"]) + num_cols = ( + set(self.spec_dict["metadata"]["numerical_columns"]) - + set(self.spec_dict.get("derived_columns", {}).keys())) + + if not (any(self.nan_data[cat_cols].isna()) and num_cols): + return self.nan_data.astype(self.dtypes) + + cat_mask = self.nan_data[cat_cols].isna().any(axis=1) + self.nan_data[cat_cols] = self.nan_data[cat_cols].fillna(MISSING_DATA_STR) + + for num_col in num_cols: + + # reset the generator for each column + rng = np.random.default_rng(seed=0) + + # Extract relevant num col variables from the user spec + num_col_dict = self.spec_dict["columns"][num_col] + + dist = num_col_dict["distribution"] + dist_params = num_col_dict["distribution_parameters"] + precision = num_col_dict["precision"] + + # if it's already NA, don't re-generate; it's NA for a reason! + num_mask = self.nan_data[num_col].isna() + mask = cat_mask & ~num_mask + + # it's possible to have the left side be Int64 type and the right side + # to be float64 (newly generated, unscaled); assigning different types + # doesn't work so we'll delay assignment and scale / cast type first! + unscaled_new_series = self.nan_data.loc[mask, cat_cols].apply( + func=generate_cont_val, + axis=1, + weights_table=self.wt, + num_col=num_col, + rng=rng, + dist=dist, + dist_params=dist_params + ) + + # rescale the masked section, but make sure to change target_sum! + # take a copy of the dist_params as full target_sum is used elsewhere + new_dist_params = dist_params.copy() + + if dist_params.get("target_sum", None) is not None: + old_sum = self.nan_data.loc[~mask, num_col].sum() + new_dist_params["target_sum"] = dist_params["target_sum"] - old_sum + + scaled_new_series = scale_continuous_column( + series=unscaled_new_series, + precision=precision, + **new_dist_params + ) + + # for some reason assigning a series back, rather than values + # creates nulls in certain rows, but not others; maybe Pandas bug. + # when the array is empty, Pandas generates a ValueError + if len(scaled_new_series) != 0: + self.nan_data.loc[mask, num_col] = scaled_new_series.values + + # replace Missing data back with np.nan + # since we're applying the function across all columns, including numerical, + # these can contain pd.NA which is a "special" type that will error out if + # trying to evaluate it against a string. Replace with a standard np.NAN. + self.nan_data = self.nan_data.map( + lambda x: np.nan if pd.isna(x) or x == MISSING_DATA_STR else x) + + return self.nan_data.astype(self.dtypes) + + def _find_columns_with_linked_missing_data(self): + ''' + Returns a list of column groupings where a missing value in one + means always a missing value in all in the grouping. The requirement + for that is that the missing_probability attribute of the spec is the + same for all such linked / paired columns. 
+ + Returns a list with sets of columns + ''' + + result = [] + processed_pairs = set() + miss_probs = get_attr_values( + self.spec_dict, "miss_probability", col_names=True, types="categorical") + + for col, attrs in self.spec_dict["columns"].items(): + + if col in processed_pairs or attrs["type"] != "categorical": + continue + + pairs = set() + + # paired columns first + if attrs["paired_columns"]: + + pairs.update([col] + attrs["paired_columns"]) + + # linked groups + for i, linked_group in (self.spec_dict["linked_columns"] or []): + # zero numbered linked group is reserved for user defined linkage + if i == 0: + continue + + if col in linked_group: + pairs.update(linked_group) + + processed_pairs.update(pairs) + + # check that miss_probabilities are the same for all paired columns + miss_probs = sorted( + miss_probs, key=lambda x, pairs=pairs: x.col_name in pairs) + groups = groupby(miss_probs, lambda x, pairs=pairs: x.col_name in pairs) + + for key, group in groups: + + if key and len({v for k, v in group}) == 1: + + result.append(pairs) + + return result + + + def _find_make_null_idx(self): + ''' + The reason for keeping this and _find_not_null_idx separate is that + they are needed at different points in time - not_null_idx happens AFTER + all other sources of nan-generation have been exhausted and we're using + the data WITH nans to determine indices to pick up real values from the + original data passed in to the generator. + ''' + + cc = self.spec_dict["constraints"]["custom_constraints"] or {} + + make_null_idx = [] + + for _, constraint in cc.items(): + + cc_filter = constraint.get("filter", None) + cc_targets = constraint.get("targets", {}) + clean_cc_filter = clean_up_constraint_string(cc_filter) + cc_mask = get_constraint_mask(self.nan_data, clean_cc_filter) + + for target_str, action_str in cc_targets.items(): + + if "make_null" in action_str: + + target_cols = [x.strip() for x in target_str.split(",")] + + for target in target_cols: + + make_null_idx.append( + (self.nan_data.loc[cc_mask].index, target) + ) + + return make_null_idx + + def _find_not_null_idx(self): + ''' + Doc string + ''' + + cc = self.spec_dict["constraints"]["custom_constraints"] or {} + + not_null_idx = [] + + for _, constraint in cc.items(): + + cc_filter = constraint.get("filter", None) + cc_targets = constraint.get("targets", {}) + clean_cc_filter = clean_up_constraint_string(cc_filter) + cc_mask = get_constraint_mask(self.nan_data, clean_cc_filter) + + for target_str, action_str in cc_targets.items(): + + if "make_not_null" in action_str: + + target_cols = [x.strip() for x in target_str.split(",")] + + for target in target_cols: + + not_null_idx.append( + (self.nan_data.loc[cc_mask].index, target) + ) + + return not_null_idx diff --git a/exhibit/core/generate/tests/test_derived.py b/exhibit/core/generate/tests/test_derived.py index a3be948..b41dec8 100644 --- a/exhibit/core/generate/tests/test_derived.py +++ b/exhibit/core/generate/tests/test_derived.py @@ -1,103 +1,103 @@ -''' -Test the generation of continuous columns & values -''' - -# Standard library imports -import unittest - -# External library imports -import pandas as pd -from pandas.testing import assert_series_equal -import numpy as np - -# Module under test -from exhibit.core.generate import derived as tm - -class derivedTests(unittest.TestCase): - ''' - Doc string - ''' - - def test_generate_derived_column_basic(self): - ''' - All of the work is done by pandas.eval() method; - we're just testing column names with whitespace are OK - ''' 
- - test_df = pd.DataFrame( - data=np.ones((5, 2)), - columns=["Hello World", "A"]) - - calc = "Hello World + A" - - self.assertEqual(tm.generate_derived_column(test_df, calc).sum(), 10) - - def test_generate_derived_column_groupby(self): - ''' - We want to allow users to create aggregated columns, like peer values. - Make sure that column names are enclosed in single spaces. - ''' - - test_df = pd.DataFrame( - data={ - "C1":["A", "A", "B", "B", "C", "C"], #locations - "C2":["spam", "eggs"] * 3, #groupby dimension(s) - "C3":[1, 10] * 3 #aggregation column - } - ) - - calc = "df.groupby('C2')['C3'].sum()" - - expected = pd.Series([3, 30, 3, 30, 3, 30], name="C3") - - assert_series_equal( - left=tm.generate_derived_column(test_df, calc), - right=expected, - check_dtype=False - ) - - def test_generate_derived_column_with_timestamp(self): - ''' - We want to allow users to create timestamps using generated columns with - hours, minutes and seconds. Bear in mind that missing values in all column - types are represented as np.nan. - ''' - - dates = pd.date_range( - start="01-01-2022", - periods=3, - freq="M", - ) - - test_df = pd.DataFrame( - data={ - "dates" : dates, - "hours" : pd.Categorical(["1", "2", np.nan]), - "minutes": [0, np.nan, 59], - "seconds": [0, 1, 10], - } - ) - - calc = "@create_timestamp(hours, minutes, seconds)" - - expected = pd.Series([ - "2022-01-31 01:00:00", - "2022-02-28 02:00:01", - "2022-03-31 00:59:10" - ]) - - # can add dates and timedelta timestamps easily - result = ( - test_df["dates"] + tm.generate_derived_column(test_df, calc) - ).astype(str) - - assert_series_equal( - left=result, - right=expected, - check_dtype=False - ) - -if __name__ == "__main__" and __package__ is None: - #overwrite __package__ builtin as per PEP 366 - __package__ = "exhibit" - unittest.main(warnings="ignore") +''' +Test the generation of continuous columns & values +''' + +# Standard library imports +import unittest + +# External library imports +import pandas as pd +from pandas.testing import assert_series_equal +import numpy as np + +# Module under test +from exhibit.core.generate import derived as tm + +class derivedTests(unittest.TestCase): + ''' + Doc string + ''' + + def test_generate_derived_column_basic(self): + ''' + All of the work is done by pandas.eval() method; + we're just testing column names with whitespace are OK + ''' + + test_df = pd.DataFrame( + data=np.ones((5, 2)), + columns=["Hello World", "A"]) + + calc = "Hello World + A" + + self.assertEqual(tm.generate_derived_column(test_df, calc).sum(), 10) + + def test_generate_derived_column_groupby(self): + ''' + We want to allow users to create aggregated columns, like peer values. + Make sure that column names are enclosed in single spaces. + ''' + + test_df = pd.DataFrame( + data={ + "C1":["A", "A", "B", "B", "C", "C"], #locations + "C2":["spam", "eggs"] * 3, #groupby dimension(s) + "C3":[1, 10] * 3 #aggregation column + } + ) + + calc = "df.groupby('C2')['C3'].sum()" + + expected = pd.Series([3, 30, 3, 30, 3, 30], name="C3") + + assert_series_equal( + left=tm.generate_derived_column(test_df, calc), + right=expected, + check_dtype=False + ) + + def test_generate_derived_column_with_timestamp(self): + ''' + We want to allow users to create timestamps using generated columns with + hours, minutes and seconds. Bear in mind that missing values in all column + types are represented as np.nan. 
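For context on the dates fixture below: Pandas 2.2 deprecates the month-end frequency alias "M" for date_range in favour of "ME", which is why the test now passes freq="ME". A quick check outside the patch:

import pandas as pd

# month-end stamps; on Pandas 2.2+ the old freq="M" still works for now but
# emits a FutureWarning pointing to "ME"
print(pd.date_range(start="01-01-2022", periods=3, freq="ME"))
# 2022-01-31, 2022-02-28, 2022-03-31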
+ ''' + + dates = pd.date_range( + start="01-01-2022", + periods=3, + freq="ME", + ) + + test_df = pd.DataFrame( + data={ + "dates" : dates, + "hours" : pd.Categorical(["1", "2", np.nan]), + "minutes": [0, np.nan, 59], + "seconds": [0, 1, 10], + } + ) + + calc = "@create_timestamp(hours, minutes, seconds)" + + expected = pd.Series([ + "2022-01-31 01:00:00", + "2022-02-28 02:00:01", + "2022-03-31 00:59:10" + ]) + + # can add dates and timedelta timestamps easily + result = ( + test_df["dates"] + tm.generate_derived_column(test_df, calc) + ).astype(str) + + assert_series_equal( + left=result, + right=expected, + check_dtype=False + ) + +if __name__ == "__main__" and __package__ is None: + #overwrite __package__ builtin as per PEP 366 + __package__ = "exhibit" + unittest.main(warnings="ignore") diff --git a/exhibit/core/generate/tests/test_missing.py b/exhibit/core/generate/tests/test_missing.py index 002b3aa..10b4262 100644 --- a/exhibit/core/generate/tests/test_missing.py +++ b/exhibit/core/generate/tests/test_missing.py @@ -1,618 +1,618 @@ -''' -Test the handling & generation of missing values -''' - -# Standard library imports -import unittest -from collections import namedtuple -from unittest.mock import Mock, patch - -# External library imports -import pandas as pd -import numpy as np -from pandas.testing import assert_frame_equal, assert_series_equal - -# Exhibit imports -from exhibit.db import db_util -from exhibit.core.constants import MISSING_DATA_STR, ORIGINAL_VALUES_PAIRED -from exhibit.core.tests.test_reference import temp_exhibit - -# Module under test -from exhibit.core.generate import missing as tm - -class missingDataTests(unittest.TestCase): - ''' - Doc string - ''' - - @classmethod - def tearDownClass(cls): - ''' - Clean up local exhibit.db from temp tables - ''' - - db_util.purge_temp_tables() - - def test_feeding_data_to_missing_generator(self): - ''' - Doc string - ''' - - test_df = pd.DataFrame() - - path = "exhibit.core.generate.missing.MissingDataGenerator.__init__" - with patch(path) as mock_init: - mock_init.return_value = None - generatorMock = tm.MissingDataGenerator(Mock(), Mock()) - - setattr(generatorMock, "data", test_df) - - self.assertTrue( - isinstance(generatorMock.data, - pd.DataFrame)) - - def test_never_null_indices_are_identified(self): - ''' - Some cells can't ever have nulls due to custom constraints. - Filter and Partition fields are optional when defining custom - constraints. 
- ''' - - test_dict = { - "constraints" : { - "custom_constraints": { - "cc1" : { - "filter" : "~Test.isna()", - "targets" : { - "Num" : "make_not_null" - } - }, - } - } - } - - test_data = pd.DataFrame(data={ - "Test" : [1, 2, 3, pd.NA, 5], - "Num" : [1, 2, 3, 4, 5] - }) - - test_gen = tm.MissingDataGenerator(test_dict, test_data) - - not_null_idx = test_gen._find_not_null_idx() - - result = not_null_idx[0] - - assert_series_equal( - test_data.loc[result], - test_data.loc[[0, 1, 2, 4], "Num"]) - - def test_paired_columns_with_missing_data_identified(self): - ''' - Doc string - ''' - - test_dict = { - "columns" : { - "A" : { - "type" : "categorical", - "paired_columns" : ["B"], - "miss_probability": 0.5, - "original_values" : ORIGINAL_VALUES_PAIRED - }, - "B" : { - "type" : "categorical", - "paired_columns" : ["A"], - "miss_probability" : 0.5, - "original_values" : pd.DataFrame() - }, - "C" : { - "type" : "categorical", - "paired_columns" : ["D"], - "miss_probability" : 0.6, - "original_values" : pd.DataFrame() - }, - "D" : { - "type" : "categorical", - "paired_columns" : ["C"], - "miss_probability" : 0.7, - "original_values" : ORIGINAL_VALUES_PAIRED - } - }, - "constraints" : { - "custom_constraints" : {}, - - }, - "linked_columns" : [] - } - - expected = [ - {"A", "B"}, - ] - - test_gen = tm.MissingDataGenerator(test_dict, Mock()) - result = test_gen._find_columns_with_linked_missing_data() - - self.assertCountEqual(expected, result) - - def test_linked_columns_with_missing_data_identified(self): - ''' - Doc string - ''' - - test_dict = { - "columns" : { - "A" : { - "type" : "categorical", - "paired_columns" : [], - "miss_probability": 0.5, - "original_values" : pd.DataFrame() - }, - "B" : { - "type" : "categorical", - "paired_columns" : [], - "miss_probability" : 0.5, - "original_values" : pd.DataFrame() - }, - "C" : { - "type" : "categorical", - "paired_columns" : [], - "miss_probability" : 0.6, - "original_values" : pd.DataFrame() - }, - "D" : { - "type" : "categorical", - "paired_columns" : [], - "miss_probability" : 0.5, - "original_values" : pd.DataFrame() - } - }, - "constraints" : { - "custom_constraints" : {}, - }, - "linked_columns" : [ - (1, ["A", "B"]), - (2, ["C", "D"]) - ] - } - - expected = [ - {"A", "B"}, - ] - - test_gen = tm.MissingDataGenerator(test_dict, Mock()) - result = test_gen._find_columns_with_linked_missing_data() - - self.assertCountEqual(expected, result) - - def test_linked_and_paired_columns_with_missing_data_identified(self): - ''' - Doc string - ''' - - test_dict = { - "columns" : { - "A" : { - "type" : "categorical", - "paired_columns" : ["B"], - "miss_probability": 0.5, - "original_values" : pd.DataFrame() - }, - "B" : { - "type" : "categorical", - "paired_columns" : ["A"], - "miss_probability" : 0.5, - "original_values" : ORIGINAL_VALUES_PAIRED - }, - "C" : { - "type" : "categorical", - "paired_columns" : [], - "miss_probability" : 0.5, - "original_values" : pd.DataFrame() - } - }, - "constraints" : { - "custom_constraints" : {}, - }, - "linked_columns" : [ - (0, ["A", "C"]), - ] - } - - expected = [ - {"A", "B", "C"}, - ] - - test_gen = tm.MissingDataGenerator(test_dict, Mock()) - result = test_gen._find_columns_with_linked_missing_data() - - self.assertTrue(expected[0], result[0]) - - def test_make_null_constraint_in_columns(self): - ''' - When we're adding nulls to categorical columns, the non-null - numerical values must be re-calulcated and re-scaled because - Missing data (null identifier in categorical columns) can have - vastly different 
weights compared to the old values. However, - we shouldn't rescaled the whole column anew, just the added values. - ''' - - Weights = namedtuple("Weights", ["weight", "equal_weight"]) - - #demo weights table - weights_df = pd.DataFrame( - data=[ - ("C", "A", "spam", Weights(0.5, 0.5)), - ("C", "A", "eggs", Weights(0.5, 0.5)), - ("C", "B", "bacon", Weights(0.5, 0.5)), - ("C", "A", MISSING_DATA_STR, Weights(0.5, 0.5)), - ("C", "B", MISSING_DATA_STR, Weights(0.5, 0.5)), - ], - columns=["num_col", "cat_col", "cat_value", "weights"]) - - #reformat into dictionary - weights = ( - weights_df - .set_index(["num_col", "cat_col", "cat_value"]) - .to_dict(orient="index") - ) - - test_dict = { - "_rng" : np.random.default_rng(seed=0), - "metadata" : { - "categorical_columns": ["A", "B"], - "numerical_columns" : ["C"] - }, - "columns": { - "A": { - "type" : "categorical", - "paired_columns" : [], - "miss_probability": 0, - "original_values" : pd.DataFrame() - - }, - "B": { - "type" : "categorical", - "paired_columns" : [], - "miss_probability": 0, - "original_values" : pd.DataFrame() - - }, - "C": { - "type" : "continuous", - "precision" : "integer", - "distribution" : "weighted_uniform", - "distribution_parameters": { - "dispersion": 0, - "target_sum" : 10, - }, - "miss_probability": 0 - }, - - }, - "constraints" : { - "custom_constraints": { - "cc1" : { - "filter" : "A == 'spam'", - "targets" : { - "B" : "make_null" - } - } - } - }, - "linked_columns" : [], - "weights_table" : weights, - "weights_table_target_cols": ["A", "B"] - } - - test_data = pd.DataFrame(data={ - "A" : ["spam", "spam", "eggs", "eggs"], - "B" : ["bacon"] * 4, - "C" : [10, 20, 4, 4], - }) - - expected = pd.DataFrame(data={ - "A" : ["spam", "spam", "eggs", "eggs"], - "B" : [pd.NA, pd.NA, "bacon", "bacon"], - "C" : [1, 1, 4, 4], - }) - - test_gen = tm.MissingDataGenerator(test_dict, test_data) - result = test_gen.add_missing_data() - - assert_frame_equal(result, expected, check_dtype=False) - - def test_not_null_constraint_in_columns(self): - ''' - Doc string - ''' - - test_dict = { - "_rng" : np.random.default_rng(seed=0), - "metadata" : { - "categorical_columns": ["C", "D"], - "numerical_columns" : [] - }, - "columns": { - "C": { - "type" : "categorical", - "paired_columns" : [], - "miss_probability": 0.2, - "original_values" : pd.DataFrame() - - }, - "D": { - "type" : "categorical", - "paired_columns" : [], - "miss_probability": 0.5, - "original_values" : pd.DataFrame() - - } - }, - "constraints" : { - "custom_constraints": { - "cc1" : { - "filter" : "~C.isna()", - "targets" : { - "D" : "make_not_null" - } - } - } - }, - "linked_columns" : [] - } - - test_data = pd.DataFrame(data={ - "C" : np.random.random(1000), #pylint: disable=no-member - "D" : np.random.random(1000), #pylint: disable=no-member - - }) - - test_gen = tm.MissingDataGenerator(test_dict, test_data) - result = test_gen.add_missing_data() - - self.assertTrue(result["C"].isna().any()) - self.assertTrue(result["D"].isna().any()) - self.assertFalse(result.loc[~result["C"].isna(), "D"].isna().any()) - - def test_paired_columns_are_respected_for_missing_data(self): - ''' - Doc string - ''' - - test_dict = { - "_rng" : np.random.default_rng(seed=0), - "metadata" : { - "categorical_columns": ["A", "B"], - "numerical_columns" : [] - }, - "columns": { - "A": { - "type" : "categorical", - "paired_columns" : ["B"], - "miss_probability": 0.5, - "original_values" : pd.DataFrame() - }, - "B": { - "type" : "categorical", - "paired_columns" : ["A"], - "miss_probability": 0.5, - 
"original_values" : ORIGINAL_VALUES_PAIRED - - }, - }, - "constraints" : { - "custom_constraints" : {}, - }, - "linked_columns" : [], - } - - test_data = pd.DataFrame(data={ - "A" : np.random.random(1000), #pylint: disable=no-member - "B" : np.random.random(1000), #pylint: disable=no-member - }) - - test_gen = tm.MissingDataGenerator(test_dict, test_data) - result = test_gen.add_missing_data() - - self.assertTrue(result["A"].isna().any()) - self.assertTrue(result["B"].isna().any()) - assert_series_equal(result["B"].isna(), result["A"].isna(), check_names=False) - - def test_missing_data_added_to_standalone_categorical_column(self): - ''' - Doc string - ''' - - test_dict = { - "_rng" : np.random.default_rng(seed=0), - "metadata" : { - "categorical_columns": ["A", "B"], - "numerical_columns" : [] - }, - "columns": { - "A": { - "type" : "categorical", - "paired_columns" : [], - "miss_probability": 1, - "original_values" : pd.DataFrame() - }, - "B": { - "type" : "categorical", - "paired_columns" : [], - "miss_probability": 0, - "original_values" : pd.DataFrame() - - }, - }, - "constraints" : { - "custom_constraints" : {} - }, - "linked_columns" : [], - } - - test_data = pd.DataFrame(data={ - "A" : list("ABCDE"), - "B" : list("ABCDE") - }) - - expected = pd.DataFrame(data={ - "A" : [pd.NA] * 5, - "B" : list("ABCDE") - }) - - test_gen = tm.MissingDataGenerator(test_dict, test_data) - result = test_gen.add_missing_data() - - assert_frame_equal(expected, result, check_dtype=False) - - def test_continuous_column_adjusted_to_categorical_missing_data(self): - ''' - Remember that continuous columns depend on values in categorical columns - in the same row for their weights, including for Missing data values. - Adding Missing data also changes the target_sum of the continuous column - so we need to re-scale the whole column after adding missing data either - to it or to the categorical columns. - - We rely on np.random to generate reasonable number of NAs with 0.5 prob, - but that can sometimes fail so we ensure that the seed is constant. 
- ''' - - Weights = namedtuple("Weights", ["weight", "equal_weight"]) - - #demo weights table - weights_df = pd.DataFrame( - data=[ - ("C2", "C1", "A", Weights(0.1, 0.5)), - ("C2", "C1", "B", Weights(0.9, 0.5)), - ("C2", "C1", MISSING_DATA_STR, Weights(0.2, 0.5)), - ], - columns=["num_col", "cat_col", "cat_value", "weights"]) - - #reformat into dictionary - weights = ( - weights_df - .set_index(["num_col", "cat_col", "cat_value"]) - .to_dict(orient="index") - ) - - test_dict = { - "_rng" : np.random.default_rng(seed=0), - "metadata": { - "categorical_columns": [ - "C1" - ], - "numerical_columns" : [ - "C2" - ] - }, - "columns": { - "C1": { - "type" : "categorical", - "paired_columns" : [], - "miss_probability": 0.5, - "original_values" : pd.DataFrame() - }, - "C2": { - "type" : "continuous", - "precision" : "integer", - "distribution" : "weighted_uniform", - "distribution_parameters": { - "uniform_base_value" : 100, - "dispersion": 0, - "target_sum" : 200, # factor of two - }, - "miss_probability": 0 - }, - }, - "constraints" : { - "custom_constraints" : {} - }, - "linked_columns" : [], - "weights_table" : weights, - "weights_table_target_cols": ["C1"] - } - - test_data = pd.DataFrame(data={ - "C1" : ["A", "A", "A", "B", "B"] * 20, - "C2" : [1] * 100 - }) - - - test_gen = tm.MissingDataGenerator(test_dict, test_data) - result = test_gen.add_missing_data() - - self.assertTrue(result["C1"].isna().any()) - self.assertEqual(result["C2"].sum(), 200) - - def test_user_linked_columns_having_missing_data(self): - ''' - Because user linked columns can have complex relationships, we - need to make sure missing data is handled correctly. - ''' - - test_df = pd.DataFrame(data={ - "A": ["spam", "spam", "eggs", "eggs", "spam"], - "B": ["bacon", "spamspam", np.nan, "parrot", "bacon"], - "C": range(5) - }) - - test_dict = { - "metadata" : { - "number_of_rows" : 1000 - } - } - - fromdata_test = { - "linked_columns" : ["A", "B"] - } - - _, df = temp_exhibit( - filename=test_df, fromdata_namespace=fromdata_test, - test_spec_dict=test_dict, return_spec=False) - - self.assertTrue(df.query("A == 'eggs'")["B"].isna().any()) - - def test_categorical_numerical_missing_data_with_make_null_cc(self): - ''' - Typing issues (categorical vs object) can cause bugs when we have categorical columns, - a make_null custom constraint, a filter casting categorical column to integers (which - assumes object, not categorical - because you can't cast categorical to int if there - is a Missing data categorical value - without removing unused categories first) AND - a numerical column. Commenting out the numerical column used to pass the test, and - uncommenting it used to fail it - which is wrong. - - Without extra checks, AGE.astype('int') will fail if AGE is dtype="category" because - it'll have numbers as strings (which can be cast to int) and "invisible" Missing data - which can't. 
- ''' - - test_df = pd.DataFrame(data={ - "AGE": ["1", "2", "3", "4", "4"], - "NULLED" : list("ABCAB"), - "NUMS": range(5) - }) - - test_dict = { - "metadata" : { - "number_of_rows" : 10, - "categorical_columns": ["AGE", "NULLED"], - "numerical_columns" : ["NUMS"] - }, - "constraints" : { - "custom_constraints" : { - "test_nulls" : { - "filter" : "AGE.astype('int') > 1", - "targets" : {"NULLED" : "make_null"} - } - } - } - } - - _, df = temp_exhibit(filename=test_df, test_spec_dict=test_dict, return_spec=False) - - self.assertTrue(df.NULLED.isna().any()) - -if __name__ == "__main__" and __package__ is None: - #overwrite __package__ builtin as per PEP 366 - __package__ = "exhibit" - unittest.main(warnings="ignore") +''' +Test the handling & generation of missing values +''' + +# Standard library imports +import unittest +from collections import namedtuple +from unittest.mock import Mock, patch + +# External library imports +import pandas as pd +import numpy as np +from pandas.testing import assert_frame_equal, assert_series_equal + +# Exhibit imports +from exhibit.db import db_util +from exhibit.core.constants import MISSING_DATA_STR, ORIGINAL_VALUES_PAIRED +from exhibit.core.tests.test_reference import temp_exhibit + +# Module under test +from exhibit.core.generate import missing as tm + +class missingDataTests(unittest.TestCase): + ''' + Doc string + ''' + + @classmethod + def tearDownClass(cls): + ''' + Clean up local exhibit.db from temp tables + ''' + + db_util.purge_temp_tables() + + def test_feeding_data_to_missing_generator(self): + ''' + Doc string + ''' + + test_df = pd.DataFrame() + + path = "exhibit.core.generate.missing.MissingDataGenerator.__init__" + with patch(path) as mock_init: + mock_init.return_value = None + generatorMock = tm.MissingDataGenerator(Mock(), Mock()) + + setattr(generatorMock, "data", test_df) + + self.assertTrue( + isinstance(generatorMock.data, + pd.DataFrame)) + + def test_never_null_indices_are_identified(self): + ''' + Some cells can't ever have nulls due to custom constraints. + Filter and Partition fields are optional when defining custom + constraints. 
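Conceptually (the real work is done by clean_up_constraint_string and get_constraint_mask), the "~Test.isna()" filter in the test below is evaluated against the dataframe to produce a boolean mask, and the index of the matching rows marks the Num cells that must never be nulled. A rough sketch using the same toy data:

import pandas as pd

test_data = pd.DataFrame({"Test": [1, 2, 3, pd.NA, 5], "Num": [1, 2, 3, 4, 5]})

mask = ~test_data["Test"].isna()           # rows where Test is present
protected_idx = test_data.loc[mask].index  # Num keeps its value at these rows
print(list(protected_idx))                 # [0, 1, 2, 4]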
+ ''' + + test_dict = { + "constraints" : { + "custom_constraints": { + "cc1" : { + "filter" : "~Test.isna()", + "targets" : { + "Num" : "make_not_null" + } + }, + } + } + } + + test_data = pd.DataFrame(data={ + "Test" : [1, 2, 3, pd.NA, 5], + "Num" : [1, 2, 3, 4, 5] + }) + + test_gen = tm.MissingDataGenerator(test_dict, test_data) + + not_null_idx = test_gen._find_not_null_idx() + + result = not_null_idx[0] + + assert_series_equal( + test_data.loc[result], + test_data.loc[[0, 1, 2, 4], "Num"]) + + def test_paired_columns_with_missing_data_identified(self): + ''' + Doc string + ''' + + test_dict = { + "columns" : { + "A" : { + "type" : "categorical", + "paired_columns" : ["B"], + "miss_probability": 0.5, + "original_values" : ORIGINAL_VALUES_PAIRED + }, + "B" : { + "type" : "categorical", + "paired_columns" : ["A"], + "miss_probability" : 0.5, + "original_values" : pd.DataFrame() + }, + "C" : { + "type" : "categorical", + "paired_columns" : ["D"], + "miss_probability" : 0.6, + "original_values" : pd.DataFrame() + }, + "D" : { + "type" : "categorical", + "paired_columns" : ["C"], + "miss_probability" : 0.7, + "original_values" : ORIGINAL_VALUES_PAIRED + } + }, + "constraints" : { + "custom_constraints" : {}, + + }, + "linked_columns" : [] + } + + expected = [ + {"A", "B"}, + ] + + test_gen = tm.MissingDataGenerator(test_dict, Mock()) + result = test_gen._find_columns_with_linked_missing_data() + + self.assertCountEqual(expected, result) + + def test_linked_columns_with_missing_data_identified(self): + ''' + Doc string + ''' + + test_dict = { + "columns" : { + "A" : { + "type" : "categorical", + "paired_columns" : [], + "miss_probability": 0.5, + "original_values" : pd.DataFrame() + }, + "B" : { + "type" : "categorical", + "paired_columns" : [], + "miss_probability" : 0.5, + "original_values" : pd.DataFrame() + }, + "C" : { + "type" : "categorical", + "paired_columns" : [], + "miss_probability" : 0.6, + "original_values" : pd.DataFrame() + }, + "D" : { + "type" : "categorical", + "paired_columns" : [], + "miss_probability" : 0.5, + "original_values" : pd.DataFrame() + } + }, + "constraints" : { + "custom_constraints" : {}, + }, + "linked_columns" : [ + (1, ["A", "B"]), + (2, ["C", "D"]) + ] + } + + expected = [ + {"A", "B"}, + ] + + test_gen = tm.MissingDataGenerator(test_dict, Mock()) + result = test_gen._find_columns_with_linked_missing_data() + + self.assertCountEqual(expected, result) + + def test_linked_and_paired_columns_with_missing_data_identified(self): + ''' + Doc string + ''' + + test_dict = { + "columns" : { + "A" : { + "type" : "categorical", + "paired_columns" : ["B"], + "miss_probability": 0.5, + "original_values" : pd.DataFrame() + }, + "B" : { + "type" : "categorical", + "paired_columns" : ["A"], + "miss_probability" : 0.5, + "original_values" : ORIGINAL_VALUES_PAIRED + }, + "C" : { + "type" : "categorical", + "paired_columns" : [], + "miss_probability" : 0.5, + "original_values" : pd.DataFrame() + } + }, + "constraints" : { + "custom_constraints" : {}, + }, + "linked_columns" : [ + (0, ["A", "C"]), + ] + } + + expected = [ + {"A", "B", "C"}, + ] + + test_gen = tm.MissingDataGenerator(test_dict, Mock()) + result = test_gen._find_columns_with_linked_missing_data() + + self.assertTrue(expected[0], result[0]) + + def test_make_null_constraint_in_columns(self): + ''' + When we're adding nulls to categorical columns, the non-null + numerical values must be re-calulcated and re-scaled because + Missing data (null identifier in categorical columns) can have + vastly different 
weights compared to the old values. However, + we shouldn't rescaled the whole column anew, just the added values. + ''' + + Weights = namedtuple("Weights", ["weight", "equal_weight"]) + + #demo weights table + weights_df = pd.DataFrame( + data=[ + ("C", "A", "spam", Weights(0.5, 0.5)), + ("C", "A", "eggs", Weights(0.5, 0.5)), + ("C", "B", "bacon", Weights(0.5, 0.5)), + ("C", "A", MISSING_DATA_STR, Weights(0.5, 0.5)), + ("C", "B", MISSING_DATA_STR, Weights(0.5, 0.5)), + ], + columns=["num_col", "cat_col", "cat_value", "weights"]) + + #reformat into dictionary + weights = ( + weights_df + .set_index(["num_col", "cat_col", "cat_value"]) + .to_dict(orient="index") + ) + + test_dict = { + "_rng" : np.random.default_rng(seed=0), + "metadata" : { + "categorical_columns": ["A", "B"], + "numerical_columns" : ["C"] + }, + "columns": { + "A": { + "type" : "categorical", + "paired_columns" : [], + "miss_probability": 0, + "original_values" : pd.DataFrame() + + }, + "B": { + "type" : "categorical", + "paired_columns" : [], + "miss_probability": 0, + "original_values" : pd.DataFrame() + + }, + "C": { + "type" : "continuous", + "precision" : "integer", + "distribution" : "weighted_uniform", + "distribution_parameters": { + "dispersion": 0, + "target_sum" : 10, + }, + "miss_probability": 0 + }, + + }, + "constraints" : { + "custom_constraints": { + "cc1" : { + "filter" : "A == 'spam'", + "targets" : { + "B" : "make_null" + } + } + } + }, + "linked_columns" : [], + "weights_table" : weights, + "weights_table_target_cols": ["A", "B"] + } + + test_data = pd.DataFrame(data={ + "A" : ["spam", "spam", "eggs", "eggs"], + "B" : ["bacon"] * 4, + "C" : [10, 20, 4, 4], + }) + + expected = pd.DataFrame(data={ + "A" : ["spam", "spam", "eggs", "eggs"], + "B" : [np.nan, np.nan, "bacon", "bacon"], + "C" : [1, 1, 4, 4], + }) + + test_gen = tm.MissingDataGenerator(test_dict, test_data) + result = test_gen.add_missing_data() + + assert_frame_equal(result, expected, check_dtype=False) + + def test_not_null_constraint_in_columns(self): + ''' + Doc string + ''' + + test_dict = { + "_rng" : np.random.default_rng(seed=0), + "metadata" : { + "categorical_columns": ["C", "D"], + "numerical_columns" : [] + }, + "columns": { + "C": { + "type" : "categorical", + "paired_columns" : [], + "miss_probability": 0.2, + "original_values" : pd.DataFrame() + + }, + "D": { + "type" : "categorical", + "paired_columns" : [], + "miss_probability": 0.5, + "original_values" : pd.DataFrame() + + } + }, + "constraints" : { + "custom_constraints": { + "cc1" : { + "filter" : "~C.isna()", + "targets" : { + "D" : "make_not_null" + } + } + } + }, + "linked_columns" : [] + } + + test_data = pd.DataFrame(data={ + "C" : np.random.random(1000), #pylint: disable=no-member + "D" : np.random.random(1000), #pylint: disable=no-member + + }) + + test_gen = tm.MissingDataGenerator(test_dict, test_data) + result = test_gen.add_missing_data() + + self.assertTrue(result["C"].isna().any()) + self.assertTrue(result["D"].isna().any()) + self.assertFalse(result.loc[~result["C"].isna(), "D"].isna().any()) + + def test_paired_columns_are_respected_for_missing_data(self): + ''' + Doc string + ''' + + test_dict = { + "_rng" : np.random.default_rng(seed=0), + "metadata" : { + "categorical_columns": ["A", "B"], + "numerical_columns" : [] + }, + "columns": { + "A": { + "type" : "categorical", + "paired_columns" : ["B"], + "miss_probability": 0.5, + "original_values" : pd.DataFrame() + }, + "B": { + "type" : "categorical", + "paired_columns" : ["A"], + "miss_probability": 0.5, + 
"original_values" : ORIGINAL_VALUES_PAIRED + + }, + }, + "constraints" : { + "custom_constraints" : {}, + }, + "linked_columns" : [], + } + + test_data = pd.DataFrame(data={ + "A" : np.random.random(1000), #pylint: disable=no-member + "B" : np.random.random(1000), #pylint: disable=no-member + }) + + test_gen = tm.MissingDataGenerator(test_dict, test_data) + result = test_gen.add_missing_data() + + self.assertTrue(result["A"].isna().any()) + self.assertTrue(result["B"].isna().any()) + assert_series_equal(result["B"].isna(), result["A"].isna(), check_names=False) + + def test_missing_data_added_to_standalone_categorical_column(self): + ''' + Doc string + ''' + + test_dict = { + "_rng" : np.random.default_rng(seed=0), + "metadata" : { + "categorical_columns": ["A", "B"], + "numerical_columns" : [] + }, + "columns": { + "A": { + "type" : "categorical", + "paired_columns" : [], + "miss_probability": 1, + "original_values" : pd.DataFrame() + }, + "B": { + "type" : "categorical", + "paired_columns" : [], + "miss_probability": 0, + "original_values" : pd.DataFrame() + + }, + }, + "constraints" : { + "custom_constraints" : {} + }, + "linked_columns" : [], + } + + test_data = pd.DataFrame(data={ + "A" : list("ABCDE"), + "B" : list("ABCDE") + }) + + expected = pd.DataFrame(data={ + "A" : [np.nan] * 5, + "B" : list("ABCDE") + }) + + test_gen = tm.MissingDataGenerator(test_dict, test_data) + result = test_gen.add_missing_data() + + assert_frame_equal(expected, result, check_dtype=False) + + def test_continuous_column_adjusted_to_categorical_missing_data(self): + ''' + Remember that continuous columns depend on values in categorical columns + in the same row for their weights, including for Missing data values. + Adding Missing data also changes the target_sum of the continuous column + so we need to re-scale the whole column after adding missing data either + to it or to the categorical columns. + + We rely on np.random to generate reasonable number of NAs with 0.5 prob, + but that can sometimes fail so we ensure that the seed is constant. 
+ ''' + + Weights = namedtuple("Weights", ["weight", "equal_weight"]) + + #demo weights table + weights_df = pd.DataFrame( + data=[ + ("C2", "C1", "A", Weights(0.1, 0.5)), + ("C2", "C1", "B", Weights(0.9, 0.5)), + ("C2", "C1", MISSING_DATA_STR, Weights(0.2, 0.5)), + ], + columns=["num_col", "cat_col", "cat_value", "weights"]) + + #reformat into dictionary + weights = ( + weights_df + .set_index(["num_col", "cat_col", "cat_value"]) + .to_dict(orient="index") + ) + + test_dict = { + "_rng" : np.random.default_rng(seed=0), + "metadata": { + "categorical_columns": [ + "C1" + ], + "numerical_columns" : [ + "C2" + ] + }, + "columns": { + "C1": { + "type" : "categorical", + "paired_columns" : [], + "miss_probability": 0.5, + "original_values" : pd.DataFrame() + }, + "C2": { + "type" : "continuous", + "precision" : "integer", + "distribution" : "weighted_uniform", + "distribution_parameters": { + "uniform_base_value" : 100, + "dispersion": 0, + "target_sum" : 200, # factor of two + }, + "miss_probability": 0 + }, + }, + "constraints" : { + "custom_constraints" : {} + }, + "linked_columns" : [], + "weights_table" : weights, + "weights_table_target_cols": ["C1"] + } + + test_data = pd.DataFrame(data={ + "C1" : ["A", "A", "A", "B", "B"] * 20, + "C2" : [1] * 100 + }) + + + test_gen = tm.MissingDataGenerator(test_dict, test_data) + result = test_gen.add_missing_data() + + self.assertTrue(result["C1"].isna().any()) + self.assertEqual(result["C2"].sum(), 200) + + def test_user_linked_columns_having_missing_data(self): + ''' + Because user linked columns can have complex relationships, we + need to make sure missing data is handled correctly. + ''' + + test_df = pd.DataFrame(data={ + "A": ["spam", "spam", "eggs", "eggs", "spam"], + "B": ["bacon", "spamspam", np.nan, "parrot", "bacon"], + "C": range(5) + }) + + test_dict = { + "metadata" : { + "number_of_rows" : 1000 + } + } + + fromdata_test = { + "linked_columns" : ["A", "B"] + } + + _, df = temp_exhibit( + filename=test_df, fromdata_namespace=fromdata_test, + test_spec_dict=test_dict, return_spec=False) + + self.assertTrue(df.query("A == 'eggs'")["B"].isna().any()) + + def test_categorical_numerical_missing_data_with_make_null_cc(self): + ''' + Typing issues (categorical vs object) can cause bugs when we have categorical columns, + a make_null custom constraint, a filter casting categorical column to integers (which + assumes object, not categorical - because you can't cast categorical to int if there + is a Missing data categorical value - without removing unused categories first) AND + a numerical column. Commenting out the numerical column used to pass the test, and + uncommenting it used to fail it - which is wrong. + + Without extra checks, AGE.astype('int') will fail if AGE is dtype="category" because + it'll have numbers as strings (which can be cast to int) and "invisible" Missing data + which can't. 
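A small illustration (hypothetical values, not from the patch) of the typing issue described above: once the Missing data placeholder sits in a categorical column, a plain astype('int') fails, so the placeholder rows have to be filtered out, and the unused category dropped, before casting.

import pandas as pd

age = pd.Series(["1", "2", "3", "Missing data"], dtype="category")

# age.astype("int") would raise here: "Missing data" cannot be parsed as an integer
numeric_age = (age[age != "Missing data"]
               .cat.remove_unused_categories()
               .astype("int"))
print(numeric_age.tolist())  # [1, 2, 3]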
+ ''' + + test_df = pd.DataFrame(data={ + "AGE": ["1", "2", "3", "4", "4"], + "NULLED" : list("ABCAB"), + "NUMS": range(5) + }) + + test_dict = { + "metadata" : { + "number_of_rows" : 10, + "categorical_columns": ["AGE", "NULLED"], + "numerical_columns" : ["NUMS"] + }, + "constraints" : { + "custom_constraints" : { + "test_nulls" : { + "filter" : "AGE.astype('int') > 1", + "targets" : {"NULLED" : "make_null"} + } + } + } + } + + _, df = temp_exhibit(filename=test_df, test_spec_dict=test_dict, return_spec=False) + + self.assertTrue(df.NULLED.isna().any()) + +if __name__ == "__main__" and __package__ is None: + #overwrite __package__ builtin as per PEP 366 + __package__ = "exhibit" + unittest.main(warnings="ignore") diff --git a/exhibit/core/linkage/hierarchical.py b/exhibit/core/linkage/hierarchical.py index 0e66171..22e1600 100644 --- a/exhibit/core/linkage/hierarchical.py +++ b/exhibit/core/linkage/hierarchical.py @@ -637,7 +637,7 @@ def scenario_2(self): orig_df = self.spec_dict["columns"][self.base_col]["original_values"] repl = self.sql_df[self.base_col].unique()[0:self.base_col_unique_count] aliases = dict(zip(orig_df[self.base_col].values[:-1], repl)) - aliased_df = orig_df.applymap(lambda x: aliases.get(x, x)) + aliased_df = orig_df.map(lambda x: aliases.get(x, x)) self.spec_dict["columns"][self.base_col]["original_values"] = aliased_df base_col_vals = aliased_df[self.base_col].iloc[:-1].unique() @@ -699,7 +699,7 @@ def scenario_3(self): orig_df = self.spec_dict["columns"][self.base_col]["original_values"] repl = self.sql_df[self.base_col].unique()[0:self.base_col_unique_count] aliases = dict(zip(orig_df[self.base_col].values[:-1], repl)) - aliased_df = orig_df.applymap(lambda x: aliases.get(x, x)) + aliased_df = orig_df.map(lambda x: aliases.get(x, x)) self.spec_dict["columns"][self.base_col]["original_values"] = aliased_df base_col_vals = aliased_df[self.base_col].iloc[:-1].unique() diff --git a/exhibit/core/linkage/matrix.py b/exhibit/core/linkage/matrix.py index 43c662a..66107dc 100644 --- a/exhibit/core/linkage/matrix.py +++ b/exhibit/core/linkage/matrix.py @@ -1,389 +1,389 @@ -''' -Module isolating methods and classes to find, process and generate -user-defined linked columns where the relationships are coded in a -lookup + matrix. For hierarchical linkage see the hierarchical module, -''' - -# Standard library imports -import sys -import textwrap -from functools import partial -from multiprocessing import Pool - -# External imports -import numpy as np -import pandas as pd - -# Exhibit imports -from ..constants import MISSING_DATA_STR -from ..sql import create_temp_table, query_exhibit_database - -def save_predefined_linked_cols_to_db(df, id): - """ - Derive and save everything that's required to generate - user defined linked columns on demand from a future spec - - Parameters - ---------- - df : pd.DataFrame - original dataframe with just the categorical columns; - we assume that linked columns defined by the user are - categorical. Maybe need a special case for time? - id : str - taken from metadata[id] - - Returns - ------- - nothing - """ - - prefixed_df = add_prefix(df) - orig_label_to_pos_label = {} # age__0-9 : age__0, etc. - pos_labels_inc_column = [] # age__0, age__1, etc. - sep = "__" - - for col in prefixed_df.columns: - - col_vals = sorted(prefixed_df[col].unique()) - - # add Missing data by hand if not already there OR - # pop and reinsert at the end to align with the spec! 
- # make sure the values are sorted AFTER we remove the existing - # Missing data, but BEFORE we reinsert it. - col_miss_val = f"{col}{sep}{MISSING_DATA_STR}" - - # don't forget that we need to test equality element-wise, hence conversion - # to an array from; lists don't compare in the same way. - if col_miss_val in col_vals: - col_vals = sorted(np.delete(col_vals, np.array(col_vals) == col_miss_val)) - - col_vals = np.append(col_vals, col_miss_val) - - pos_labels_temp = [ - f"{col}{sep}{x}" for x in range(len(col_vals)) - ] - - pos_labels_inc_column.extend(pos_labels_temp) - - orig_label_to_pos_label.update( - {k:v for v, k in zip(pos_labels_temp, col_vals)} - ) - - # age__0 : 0, etc. - pos_label_to_id = dict( - zip(pos_labels_inc_column, range(len(pos_labels_inc_column))) - ) - - # convert the original, prefixed values first to positional labels - # and then just to numerical IDs - temp_df = (prefixed_df - .applymap(lambda x: orig_label_to_pos_label.get(x, x)) - .applymap(lambda x: pos_label_to_id.get(x, x))) - - label_matrix = np.unique(temp_df.values, axis=0).astype(np.intc) - - # make sure column names don't have spaces - col_names = [x.replace(" ", "$") for x in prefixed_df.columns] - - # save the label matrix to SQLite db - create_temp_table( - table_name=f"temp_{id}_matrix", - col_names=col_names, - data=label_matrix, - ) - - # save the lookup to SQLite db; note that numerical_ids are - # upcast to strings by numpy when creating the array! - create_temp_table( - table_name=f"temp_{id}_lookup", - col_names=["pos_label", "num_label"], - data=list(pos_label_to_id.items()), - ) - -def add_prefix(df, sep="__"): - """ - Add column name as prefix to the column values - - Parameters - ---------- - df : pd.DataFrame - df must have purely categorical columns - no checks are made - sep : str, optional - separator must be consistent between add_prefix and remove_prefix - by default "__" - - Returns - ------- - new DataFrame where values are prefixed with column name - """ - - data_dict = {} - - for col in df.columns: - # cast to str in case we're dealing with integer-based categorical columns, like age - df_col_str = df[col].fillna(MISSING_DATA_STR).astype(str) - data_dict[col] = np.add(f"{col}{sep}", df_col_str.values) - - return pd.DataFrame(data_dict) - -def generate_user_linked_anon_df( - spec_dict, linked_cols, num_rows, starting_col_matrix=None): - ''' - Main function to generated user-defined linked columns. 
- - Parameters - ---------- - spec_dict : dictionary - specification plus internal keys, like _rng - linked_cols : list - there can be only one user-linked group (0, [linked_col_1, linked_col_2, ]) - num_rows : int - number of rows to generate - starting_col_matrix : np.Array shaped (num_rows, len(linked_cols)) - the matrix is either filled with None values or pre-populated if the function - is run multiple times (like when regenerating values after applying custom - actions like make_same) - - Returns - ------- - Data Frame with linked columns - ''' - - table_id = spec_dict["metadata"]["id"] - rng = spec_dict["_rng"] - lookup, matrix = get_lookup_and_matrix_from_db(table_id) - new_label_lookup, proba_lookup = build_new_lookups(spec_dict, linked_cols, lookup) - # DANGER WHEN REVERSING THE DICT - SAME VALUES IN MULTIPLE COLUMNS WILL BE LOST - rev_label_lookup = {key:value for value, key in new_label_lookup.items()} - # linked columns dispersion list - lcd = [spec_dict["columns"][col]["dispersion"] for col in linked_cols] - - # if re-creating linked values from a pre-generated sequence, reverse the dict to - # get the numerical mapping as expected, also changing the dtype for performance. - - if starting_col_matrix is not None: - starting_col_matrix = ( - pd.DataFrame(starting_col_matrix) - .fillna(MISSING_DATA_STR) - .applymap(lambda x: rev_label_lookup.get(x, x)).values.astype(np.int16) - ) - - else: - starting_col_matrix = np.full( - shape=(num_rows, len(linked_cols)), fill_value=-1) - - # multiprocessing only on unix - if sys.platform != "win32": - with Pool(processes=4) as pool: - - new_rows = pool.map( - partial(process_row, matrix, proba_lookup, lcd, rng), - starting_col_matrix - ) - else: #pragma: no cover - - new_rows = [] - - for i in range(num_rows): - new_row = process_row( - matrix, proba_lookup, lcd, rng, starting_col_matrix[i]) - new_rows.append(new_row) - - new_matrix = np.stack(new_rows) - - new_df = pd.DataFrame( - new_matrix, columns=linked_cols).applymap(lambda x: new_label_lookup.get(x, x)) - - return new_df - -def get_lookup_and_matrix_from_db(table_id): - ''' - The names of the two tables required for user defined linkage don't change: - one is lookup and another is matrix. - ''' - - lookup = dict(query_exhibit_database(f"temp_{table_id}_lookup").values) - matrix = query_exhibit_database(f"temp_{table_id}_matrix").values - - return lookup, matrix - -def process_row( - label_matrix, proba_lookup, lcd, rng, ref_array, acc_array=None, i=0): - ''' - Recursive function to generate new rows of data from the - existing linked matrix. It's possible the function will be - called multiple times to generate a column value if there - are no valid values that follow on from earlier values in the sequence. - - For example, if A => A1 => A11 and B => B2 => B12 then if the second - column has dispersion set to > 0, the row generation might go like this: - A => B2 (due to dispersion) => B12 (falling back to a valid 2-member sequence - rather than generating a random value because there isn't a A => B2 predefined - in the linkage matrix taken from the original data). - - Parameters - ---------- - label_matrix : np.array - array where shape[0] is the number_unique_combinations_of_all_linked_col_values - and shape[1] is the number of linked columns - proba_lookup : dictionary - dictionary where keys are encoded original values (0, 1, 2, etc.) 
and values - are their probabilities taken either from the specification or equalised from db - lcd : list - list with dispersion values for each column in linked_columns - rng : np.rng - shared RNG generator - ref_array : np.Array - array of either None values or pre-populated with existing df values - acc_array : np.Array - accummulated array that is being processed and returned - i : integer - a counter in case we need to reduce the sequence size to check for valid - combinations to determine the next valid value - - Returns - ------- - np.array of a single row with encoded column values - ''' - - if acc_array is None: - acc_array = np.array([]) - - arr_len = len(acc_array) - ref_arr_len = len(ref_array) - - if arr_len == label_matrix.shape[1]: - return acc_array - - # if there are no valid targets due to dispersion throwing in a non-valid target, - # rather than continue checking the full array (which will always fail to produce - # a valid next value), change the first position of the array being checked from 0 - # to counter i and increase until you exhaust the prior possibilities. The fallback - # is that there will always be valid targets for previous sequence length = 1 aka - # from one column to the next. - - _ref_array = np.where(ref_array == -1, label_matrix, ref_array) - mask = np.all(label_matrix[:, i:ref_arr_len] == _ref_array[:, i:], axis=1) - - valid_targets = np.unique(label_matrix[mask, arr_len]) - - if len(valid_targets) == 0: - - i = i + 1 - return process_row( - label_matrix, proba_lookup, lcd, rng, ref_array, acc_array, i) - - target_proba = np.array([proba_lookup[x] for x in valid_targets]) - - # typically, there will be more than 1 value in target_proba, but we have to guard against - # possibility of there being just one value, and if its probability is zero (Missing data) - # then summing it to 1 will result in NA (division by zero). As a workaround, set proba to - # 1 whenever it's the only possible value - since having it less than 1 doesn't make sense. - if len(target_proba) == 1: - target_proba = np.array([1]) - - # make sure the probabilities sum up to 1 - target_proba = target_proba * (1 / sum(target_proba)) - - # take dispersion from the spec - dispersion = lcd[arr_len] - - # default is to pick a random valid target - next_val = rng.choice(a=valid_targets, p=target_proba) - - # except when it's already pre-generated - if ref_array[arr_len] != -1: - next_val = ref_array[arr_len] - - # or dispersion is in effect; this part is expensive so only calculate if needed - elif dispersion and rng.random() < dispersion: - all_targets = np.unique(label_matrix[:, arr_len]) - non_valid_targets = np.setdiff1d(all_targets, valid_targets) - if len(non_valid_targets) > 0: - next_val = rng.choice(a=non_valid_targets) - - new_array = np.append(acc_array, next_val) - - # update the ref_array to capture the just generated value - if ref_array[arr_len] == -1: - ref_array[arr_len] = next_val - - return process_row(label_matrix, proba_lookup, lcd, rng, ref_array, new_array) - -def build_new_lookups(spec_dict, linked_cols, original_lookup): - ''' - Build two lookups: - - from the numerical id to its aliased value. {0: 'hb_code__S08000015', ...} - - from the numerical id to the probability value {0: 0.5} - - Be mindful of all the intermediate steps. The intermediate lookup is created - with the numerical ID to a tuple and then split into two. 
- - original_lookup is a positional to numerical_id, like so: - {'hb_code__0': 0} which is to say that the zero-th value in the list of - all hb_code values is aliased to the numerical id zero. - - Special case if original values are not stored in the spec, but instead have - been put into the DB - ''' - - pos_labels_inc_column = [] # age__0, age__1, etc. - pos_label_to_orig_tuple = {} # age__0: (age__0-9, 0.5), etc. - - for col in linked_cols: - - orig_vals = spec_dict["columns"][col]["original_values"] - prob_vector = None - - if not isinstance(orig_vals, pd.DataFrame): - - safe_col = col.replace(" ", "$") - table_id = spec_dict["metadata"]["id"] - orig_vals_db = query_exhibit_database(table_name=f"temp_{table_id}_{safe_col}") - orig_vals_sorted = ( - sorted([x for x in orig_vals_db[col] if x != MISSING_DATA_STR]) + - [MISSING_DATA_STR] - ) - - orig_vals = pd.DataFrame(data={col:orig_vals_sorted}) - - if "probability_vector" not in orig_vals_db.columns: - prob_vector = np.ones(orig_vals.shape[0]) - prob_vector[-1] = spec_dict["columns"][col]["miss_probability"] - else: - prob_vector = orig_vals_db["probability_vector"].astype(float).values - prob_vector = np.append( - prob_vector, spec_dict["columns"][col]["miss_probability"]) - - prob_vector /= prob_vector.sum() - - if prob_vector is None: - prob_vector = orig_vals["probability_vector"].values - - pos_labels_temp = [f"{col}__{x}" for x in range(len(orig_vals[col].values))] - pos_labels_inc_column.extend(pos_labels_temp) - pos_label_to_orig_tuple.update( - dict(zip( - pos_labels_temp, tuple(zip(orig_vals[col].values, prob_vector)) - )) - ) - - # 0: age__0, etc. using the ORIGINAL lookup which has all the relationships - id_to_pos_label = {v:k for k, v in original_lookup.items()} - - # if we don't check for the user removed values here, the next line - # will error out with an obscure Key not found message. - if len(original_lookup) != len(pos_label_to_orig_tuple): - raise ValueError(textwrap.dedent(""" - The number of values in user linked columns doesn't match original data. - If you would like to remove values, set their probability to zero. - """)) - - # 0: 'hb_code__aliased_code' - rev_labels = {k: pos_label_to_orig_tuple[v] for k, v in id_to_pos_label.items()} - - # finally, split the tuple dictionary into two separate ones: - label_lookup = {k:v[0] for k, v in rev_labels.items()} - proba_lookup = {k:v[1] for k, v in rev_labels.items()} - - return label_lookup, proba_lookup +''' +Module isolating methods and classes to find, process and generate +user-defined linked columns where the relationships are coded in a +lookup + matrix. For hierarchical linkage see the hierarchical module, +''' + +# Standard library imports +import sys +import textwrap +from functools import partial +from multiprocessing import Pool + +# External imports +import numpy as np +import pandas as pd + +# Exhibit imports +from ..constants import MISSING_DATA_STR +from ..sql import create_temp_table, query_exhibit_database + +def save_predefined_linked_cols_to_db(df, id): + """ + Derive and save everything that's required to generate + user defined linked columns on demand from a future spec + + Parameters + ---------- + df : pd.DataFrame + original dataframe with just the categorical columns; + we assume that linked columns defined by the user are + categorical. Maybe need a special case for time? + id : str + taken from metadata[id] + + Returns + ------- + nothing + """ + + prefixed_df = add_prefix(df) + orig_label_to_pos_label = {} # age__0-9 : age__0, etc. 
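    # (illustrative note, not from the original patch) the two lookups built
    # below chain each prefixed original value to an integer id in two hops,
    # e.g. "age__0-9" -> "age__0" -> 0 and "age__10-19" -> "age__1" -> 1; the
    # integer ids are what end up in the SQLite matrix and lookup tables.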
+ pos_labels_inc_column = [] # age__0, age__1, etc. + sep = "__" + + for col in prefixed_df.columns: + + col_vals = sorted(prefixed_df[col].unique()) + + # add Missing data by hand if not already there OR + # pop and reinsert at the end to align with the spec! + # make sure the values are sorted AFTER we remove the existing + # Missing data, but BEFORE we reinsert it. + col_miss_val = f"{col}{sep}{MISSING_DATA_STR}" + + # don't forget that we need to test equality element-wise, hence conversion + # to an array from; lists don't compare in the same way. + if col_miss_val in col_vals: + col_vals = sorted(np.delete(col_vals, np.array(col_vals) == col_miss_val)) + + col_vals = np.append(col_vals, col_miss_val) + + pos_labels_temp = [ + f"{col}{sep}{x}" for x in range(len(col_vals)) + ] + + pos_labels_inc_column.extend(pos_labels_temp) + + orig_label_to_pos_label.update( + {k:v for v, k in zip(pos_labels_temp, col_vals)} + ) + + # age__0 : 0, etc. + pos_label_to_id = dict( + zip(pos_labels_inc_column, range(len(pos_labels_inc_column))) + ) + + # convert the original, prefixed values first to positional labels + # and then just to numerical IDs + temp_df = (prefixed_df + .map(lambda x: orig_label_to_pos_label.get(x, x)) + .map(lambda x: pos_label_to_id.get(x, x))) + + label_matrix = np.unique(temp_df.values, axis=0).astype(np.intc) + + # make sure column names don't have spaces + col_names = [x.replace(" ", "$") for x in prefixed_df.columns] + + # save the label matrix to SQLite db + create_temp_table( + table_name=f"temp_{id}_matrix", + col_names=col_names, + data=label_matrix, + ) + + # save the lookup to SQLite db; note that numerical_ids are + # upcast to strings by numpy when creating the array! + create_temp_table( + table_name=f"temp_{id}_lookup", + col_names=["pos_label", "num_label"], + data=list(pos_label_to_id.items()), + ) + +def add_prefix(df, sep="__"): + """ + Add column name as prefix to the column values + + Parameters + ---------- + df : pd.DataFrame + df must have purely categorical columns - no checks are made + sep : str, optional + separator must be consistent between add_prefix and remove_prefix + by default "__" + + Returns + ------- + new DataFrame where values are prefixed with column name + """ + + data_dict = {} + + for col in df.columns: + # cast to str in case we're dealing with integer-based categorical columns, like age + df_col_str = df[col].fillna(MISSING_DATA_STR).astype(str) + data_dict[col] = np.add(f"{col}{sep}", df_col_str.values) + + return pd.DataFrame(data_dict) + +def generate_user_linked_anon_df( + spec_dict, linked_cols, num_rows, starting_col_matrix=None): + ''' + Main function to generated user-defined linked columns. 
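
For reference, a minimal standalone sketch of the applymap-to-map rename that most
of the edits in this module track (made-up values, assuming pandas >= 2.1, where
DataFrame.applymap was renamed to DataFrame.map):

    import pandas as pd

    frame = pd.DataFrame({"age": ["0-9", "10-19"], "sex": ["F", "M"]})

    # element-wise transform; applymap still works in pandas 2.x but raises
    # a FutureWarning, so call sites are switched to DataFrame.map
    encoded = frame.map(lambda value: f"val__{value}")
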
+ + Parameters + ---------- + spec_dict : dictionary + specification plus internal keys, like _rng + linked_cols : list + there can be only one user-linked group (0, [linked_col_1, linked_col_2, ]) + num_rows : int + number of rows to generate + starting_col_matrix : np.Array shaped (num_rows, len(linked_cols)) + the matrix is either filled with None values or pre-populated if the function + is run multiple times (like when regenerating values after applying custom + actions like make_same) + + Returns + ------- + Data Frame with linked columns + ''' + + table_id = spec_dict["metadata"]["id"] + rng = spec_dict["_rng"] + lookup, matrix = get_lookup_and_matrix_from_db(table_id) + new_label_lookup, proba_lookup = build_new_lookups(spec_dict, linked_cols, lookup) + # DANGER WHEN REVERSING THE DICT - SAME VALUES IN MULTIPLE COLUMNS WILL BE LOST + rev_label_lookup = {key:value for value, key in new_label_lookup.items()} + # linked columns dispersion list + lcd = [spec_dict["columns"][col]["dispersion"] for col in linked_cols] + + # if re-creating linked values from a pre-generated sequence, reverse the dict to + # get the numerical mapping as expected, also changing the dtype for performance. + + if starting_col_matrix is not None: + starting_col_matrix = ( + pd.DataFrame(starting_col_matrix).infer_objects(copy=False) + .fillna(MISSING_DATA_STR) + .map(lambda x: rev_label_lookup.get(x, x)).values.astype(np.int16) + ) + + else: + starting_col_matrix = np.full( + shape=(num_rows, len(linked_cols)), fill_value=-1) + + # multiprocessing only on unix + if sys.platform != "win32": + with Pool(processes=4) as pool: + + new_rows = pool.map( + partial(process_row, matrix, proba_lookup, lcd, rng), + starting_col_matrix + ) + else: #pragma: no cover + + new_rows = [] + + for i in range(num_rows): + new_row = process_row( + matrix, proba_lookup, lcd, rng, starting_col_matrix[i]) + new_rows.append(new_row) + + new_matrix = np.stack(new_rows) + + new_df = pd.DataFrame( + new_matrix, columns=linked_cols).map(lambda x: new_label_lookup.get(x, x)) + + return new_df + +def get_lookup_and_matrix_from_db(table_id): + ''' + The names of the two tables required for user defined linkage don't change: + one is lookup and another is matrix. + ''' + + lookup = dict(query_exhibit_database(f"temp_{table_id}_lookup").values) + matrix = query_exhibit_database(f"temp_{table_id}_matrix").values + + return lookup, matrix + +def process_row( + label_matrix, proba_lookup, lcd, rng, ref_array, acc_array=None, i=0): + ''' + Recursive function to generate new rows of data from the + existing linked matrix. It's possible the function will be + called multiple times to generate a column value if there + are no valid values that follow on from earlier values in the sequence. + + For example, if A => A1 => A11 and B => B2 => B12 then if the second + column has dispersion set to > 0, the row generation might go like this: + A => B2 (due to dispersion) => B12 (falling back to a valid 2-member sequence + rather than generating a random value because there isn't a A => B2 predefined + in the linkage matrix taken from the original data). + + Parameters + ---------- + label_matrix : np.array + array where shape[0] is the number_unique_combinations_of_all_linked_col_values + and shape[1] is the number of linked columns + proba_lookup : dictionary + dictionary where keys are encoded original values (0, 1, 2, etc.) 
and values + are their probabilities taken either from the specification or equalised from db + lcd : list + list with dispersion values for each column in linked_columns + rng : np.rng + shared RNG generator + ref_array : np.Array + array of either None values or pre-populated with existing df values + acc_array : np.Array + accummulated array that is being processed and returned + i : integer + a counter in case we need to reduce the sequence size to check for valid + combinations to determine the next valid value + + Returns + ------- + np.array of a single row with encoded column values + ''' + + if acc_array is None: + acc_array = np.array([]) + + arr_len = len(acc_array) + ref_arr_len = len(ref_array) + + if arr_len == label_matrix.shape[1]: + return acc_array + + # if there are no valid targets due to dispersion throwing in a non-valid target, + # rather than continue checking the full array (which will always fail to produce + # a valid next value), change the first position of the array being checked from 0 + # to counter i and increase until you exhaust the prior possibilities. The fallback + # is that there will always be valid targets for previous sequence length = 1 aka + # from one column to the next. + + _ref_array = np.where(ref_array == -1, label_matrix, ref_array) + mask = np.all(label_matrix[:, i:ref_arr_len] == _ref_array[:, i:], axis=1) + + valid_targets = np.unique(label_matrix[mask, arr_len]) + + if len(valid_targets) == 0: + + i = i + 1 + return process_row( + label_matrix, proba_lookup, lcd, rng, ref_array, acc_array, i) + + target_proba = np.array([proba_lookup[x] for x in valid_targets]) + + # typically, there will be more than 1 value in target_proba, but we have to guard against + # possibility of there being just one value, and if its probability is zero (Missing data) + # then summing it to 1 will result in NA (division by zero). As a workaround, set proba to + # 1 whenever it's the only possible value - since having it less than 1 doesn't make sense. + if len(target_proba) == 1: + target_proba = np.array([1]) + + # make sure the probabilities sum up to 1 + target_proba = target_proba * (1 / sum(target_proba)) + + # take dispersion from the spec + dispersion = lcd[arr_len] + + # default is to pick a random valid target + next_val = rng.choice(a=valid_targets, p=target_proba) + + # except when it's already pre-generated + if ref_array[arr_len] != -1: + next_val = ref_array[arr_len] + + # or dispersion is in effect; this part is expensive so only calculate if needed + elif dispersion and rng.random() < dispersion: + all_targets = np.unique(label_matrix[:, arr_len]) + non_valid_targets = np.setdiff1d(all_targets, valid_targets) + if len(non_valid_targets) > 0: + next_val = rng.choice(a=non_valid_targets) + + new_array = np.append(acc_array, next_val) + + # update the ref_array to capture the just generated value + if ref_array[arr_len] == -1: + ref_array[arr_len] = next_val + + return process_row(label_matrix, proba_lookup, lcd, rng, ref_array, new_array) + +def build_new_lookups(spec_dict, linked_cols, original_lookup): + ''' + Build two lookups: + - from the numerical id to its aliased value. {0: 'hb_code__S08000015', ...} + - from the numerical id to the probability value {0: 0.5} + + Be mindful of all the intermediate steps. The intermediate lookup is created + with the numerical ID to a tuple and then split into two. 
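
For reference, a rough standalone sketch (made-up values) of the
infer_objects(copy=False).fillna(...) chain applied to starting_col_matrix above;
the explicit infer_objects call mirrors the pandas 2.2 guidance around the
deprecation of silently downcasting object columns in fillna:

    import pandas as pd

    # an object-dtype frame of pre-generated values with gaps, roughly how a
    # starting column matrix might look before re-encoding (values are made up)
    raw = pd.DataFrame([["A", None], [None, "B1"]])

    # give object columns a concrete dtype where possible before filling the
    # gaps with the missing-data placeholder
    filled = raw.infer_objects(copy=False).fillna("Missing data")
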
+ + original_lookup is a positional to numerical_id, like so: + {'hb_code__0': 0} which is to say that the zero-th value in the list of + all hb_code values is aliased to the numerical id zero. + + Special case if original values are not stored in the spec, but instead have + been put into the DB + ''' + + pos_labels_inc_column = [] # age__0, age__1, etc. + pos_label_to_orig_tuple = {} # age__0: (age__0-9, 0.5), etc. + + for col in linked_cols: + + orig_vals = spec_dict["columns"][col]["original_values"] + prob_vector = None + + if not isinstance(orig_vals, pd.DataFrame): + + safe_col = col.replace(" ", "$") + table_id = spec_dict["metadata"]["id"] + orig_vals_db = query_exhibit_database(table_name=f"temp_{table_id}_{safe_col}") + orig_vals_sorted = ( + sorted([x for x in orig_vals_db[col] if x != MISSING_DATA_STR]) + + [MISSING_DATA_STR] + ) + + orig_vals = pd.DataFrame(data={col:orig_vals_sorted}) + + if "probability_vector" not in orig_vals_db.columns: + prob_vector = np.ones(orig_vals.shape[0]) + prob_vector[-1] = spec_dict["columns"][col]["miss_probability"] + else: + prob_vector = orig_vals_db["probability_vector"].astype(float).values + prob_vector = np.append( + prob_vector, spec_dict["columns"][col]["miss_probability"]) + + prob_vector /= prob_vector.sum() + + if prob_vector is None: + prob_vector = orig_vals["probability_vector"].values + + pos_labels_temp = [f"{col}__{x}" for x in range(len(orig_vals[col].values))] + pos_labels_inc_column.extend(pos_labels_temp) + pos_label_to_orig_tuple.update( + dict(zip( + pos_labels_temp, tuple(zip(orig_vals[col].values, prob_vector)) + )) + ) + + # 0: age__0, etc. using the ORIGINAL lookup which has all the relationships + id_to_pos_label = {v:k for k, v in original_lookup.items()} + + # if we don't check for the user removed values here, the next line + # will error out with an obscure Key not found message. + if len(original_lookup) != len(pos_label_to_orig_tuple): + raise ValueError(textwrap.dedent(""" + The number of values in user linked columns doesn't match original data. + If you would like to remove values, set their probability to zero. + """)) + + # 0: 'hb_code__aliased_code' + rev_labels = {k: pos_label_to_orig_tuple[v] for k, v in id_to_pos_label.items()} + + # finally, split the tuple dictionary into two separate ones: + label_lookup = {k:v[0] for k, v in rev_labels.items()} + proba_lookup = {k:v[1] for k, v in rev_labels.items()} + + return label_lookup, proba_lookup diff --git a/exhibit/core/linkage/tests/test_linkage_hierarchical.py b/exhibit/core/linkage/tests/test_linkage_hierarchical.py index 0cb3f8e..23a407f 100644 --- a/exhibit/core/linkage/tests/test_linkage_hierarchical.py +++ b/exhibit/core/linkage/tests/test_linkage_hierarchical.py @@ -506,7 +506,7 @@ def test_scenario_2_random_4_cols(self): #first test that high-level column (A) is correctly split ~20-80 self.assertAlmostEqual( 0.2/0.8, - result.groupby("A").size().agg(lambda x: x[0]/x[1]), + result.groupby("A").size().agg(lambda x: x.iloc[0]/x.iloc[1]), delta=0.1 ) @@ -648,7 +648,7 @@ def test_scenario_3_random(self): #between A0 and A1 + A2 (derived from children's probabilieis). self.assertAlmostEqual( 0.2/0.8, - result.groupby("A").size().agg(lambda x: x[0] / (x[1] + x[2])), + result.groupby("A").size().agg(lambda x: x.iloc[0] / (x.iloc[1] + x.iloc[2])), delta=0.1 ) @@ -709,7 +709,7 @@ def test_scenario_3_aliased(self): #between A0 and A1 + A2 (derived from children's probabilieis). 
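
For reference, a minimal standalone sketch (made-up data) of the pandas behaviour
behind the x[0] -> x.iloc[0] changes in these hunks; since pandas 2.1, passing an
integer key to Series[...] on a non-integer index is deprecated as a positional
lookup, so positional access goes through .iloc:

    import pandas as pd

    sizes = pd.DataFrame({"A": list("aab"), "val": [1, 2, 3]}).groupby("A").size()

    # sizes is indexed by the group labels ["a", "b"], so sizes[0] would rely on
    # the deprecated positional fallback; .iloc makes the intent explicit
    ratio = sizes.iloc[0] / sizes.iloc[1]
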
self.assertAlmostEqual( 0.2/0.8, - result.groupby("A").size().agg(lambda x: x[0] / (x[1] + x[2])), + result.groupby("A").size().agg(lambda x: x.iloc[0] / (x.iloc[1] + x.iloc[2])), delta=0.1 ) diff --git a/exhibit/core/tests/test_reference.py b/exhibit/core/tests/test_reference.py index 8071d0b..ac886e5 100644 --- a/exhibit/core/tests/test_reference.py +++ b/exhibit/core/tests/test_reference.py @@ -1,796 +1,796 @@ -''' -Reference tests for the Exhibit package -''' - -# Standard library imports -import unittest -from pathlib import Path -import tempfile -from os.path import join -from collections import namedtuple - -# External imports -from pandas.testing import assert_frame_equal -import pandas as pd -import numpy as np - -# Exhibit imports -from exhibit.core.utils import package_dir -from exhibit.db import db_util -from exhibit.core.constants import MISSING_DATA_STR -from exhibit.sample.sample import inpatients_anon, uuid_anon - -# Module under test -from exhibit.core import exhibit as tm - -def replace_nested_dict_values(d1, d2): - ''' - Recursive replacement of dictionary values in matching keys - or adding new ones. - ''' - - for key2 in d2: - if key2 in d1: - if isinstance(d1[key2], dict): - replace_nested_dict_values(d1[key2], d2[key2]) - else: - d1[key2] = d2[key2] - else: - d1[key2] = d2[key2] - -def temp_exhibit( - filename="inpatients.csv", - fromdata_namespace=None, - fromspec_namespace=None, - test_spec_dict=None, - return_spec=True, - return_df=True, - ): - ''' - A helper method to generate and read custom specifications - - Parameters - ---------- - filename : str or pd.DataFrame - the .csv to use as the base for spec / df generation - fromdata_namespace : dict - dictionary with testing values for creating a spec - fromspec_namespace : dict - dictionary with testing values for running generation command - test_spec_dict : dict - dictionary with testing values for user spec: used to update the spec - generated from filename csv or DataFrame - return_spec : boolean - sometimes you only want to generate the csv and don't need the spec, - like in performance benchmark testing - return_df : boolean - sometimes you only want to generate a spec; if return_df is False - then the second element in the return tuple is None - - Returns - ------- - A named tuples with spec dict and the generated dataframe - ''' - - returnTuple = namedtuple("TestRun", ["temp_spec", "temp_df"]) - temp_spec = None - temp_df = None - - if isinstance(filename, dict) or \ - (isinstance(filename, str) and filename[-3:] == "yml"): - source = "yml" - else: - source = "csv" - - # function has five paths: - # 1) given .csv filename (or DataFrame) produce just a spec - # 2) given .csv filename (or DataFrame) produce a spec and a demo .csv - # 3) given a .yml filename (or dict) produce a demo .csv - # 4) given a .yml filename (or dict) produce a spec - # 5) given a .yml filename (or dict) produce a spec and a demo .csv - - # it's important to only generate appropriate parts because they are measured - # separetely in performance benchmarking tests. 
- - # if source is data, we always produce a spec (can't have demo data without it) - if source == "csv": - - with tempfile.TemporaryDirectory() as td: - - temp_spec_name = "_.yml" - f_name = join(td, temp_spec_name) - - # for internal use when testing with a custom dataframe, not a static file - if isinstance(filename, pd.DataFrame): - default_data_path = filename - else: - default_data_path = Path(package_dir("sample", "_data", filename)) - - fromdata_defaults = { - "command" : "fromdata", - "source" : default_data_path, - "inline_limit" : 30, - "verbose" : True, - "output" : f_name, - "skip_columns" : [], - "equal_weights" : False, - "linked_columns" : None - } - - #Update namespaces - if fromdata_namespace: - fromdata_defaults.update(fromdata_namespace) - - xA = tm.Exhibit(**fromdata_defaults) - xA.read_data() - xA.generate_spec() - - if return_df: - - xA.write_spec() - fromspec_defaults = { - "command" : "fromspec", - "source" : Path(f_name), - "verbose" : True, - } - - if fromspec_namespace: - fromspec_defaults.update(fromspec_namespace) - - xA = tm.Exhibit(**fromspec_defaults) - xA.read_spec() - - if test_spec_dict: - replace_nested_dict_values(xA.spec_dict, test_spec_dict) - - if xA.validate_spec(): - xA.execute_spec() - - temp_df = xA.anon_df - - if return_spec: - temp_spec=xA.spec_dict - - if source == "yml": - - # for internal use when testing with a custom spec_dict, not a static file - if isinstance(filename, dict): - default_spec_path = filename - else: - default_spec_path = Path(package_dir("sample", "_spec", filename)) - - fromspec_defaults = { - "command" : "fromspec", - "source" : default_spec_path, - "verbose" : True, - } - - if fromspec_namespace: - fromspec_defaults.update(fromspec_namespace) - - xA = tm.Exhibit(**fromspec_defaults) - xA.read_spec() - - if test_spec_dict: - replace_nested_dict_values(xA.spec_dict, test_spec_dict) - - if return_spec: - temp_spec = xA.spec_dict - - if return_df: - - if xA.validate_spec(): - xA.execute_spec() - - temp_df = xA.anon_df - - return returnTuple(temp_spec, temp_df) - -class referenceTests(unittest.TestCase): - ''' - Main test suite; command line arguments are mocked - via patch context manager; internal intermediate functions - are mocked inside each test. 
- ''' - - @classmethod - def setUpClass(cls): - ''' - Create a list of tables to drop after reference tests finish - ''' - - cls._temp_tables = [] - - @classmethod - def tearDownClass(cls): - ''' - Clean up local exhibit.db from temp tables - ''' - - db_util.drop_tables(cls._temp_tables) - - def test_reference_prescribing_non_linked_anon_data(self): - ''' - What this reference test is covering: - - paired 1:1 anonymisation set (birds) - - designating paired columns as complete columns - - unlinking of columns - ''' - - expected_df = pd.read_csv( - package_dir( - "core", "tests", "_reference_data", - "prescribing_anon_non_linked.csv"), - parse_dates=["PaidDateMonth"] - ) - - test_dict = { - "metadata":{"number_of_rows":1500}, - "columns":{ - "HB2014":{ - "cross_join_all_unique_values": True - }, - "HB2014Name":{ - "cross_join_all_unique_values": True - }, - "BNFItemCode":{"anonymising_set":"birds"}, - "BNFItemDescription":{"anonymising_set":"birds"}, - "GPPracticeName":{"anonymising_set":"random"} - }, - "linked_columns":[] - } - - temp_spec, temp_df = temp_exhibit( - filename="prescribing.csv", - test_spec_dict=test_dict - ) - - #save ID to tidy up temp columns created as part of testing - self._temp_tables.append(temp_spec["metadata"]["id"]) - - #sort column names to make sure they are the same - temp_df.sort_index(axis=1, inplace=True) - expected_df.sort_index(axis=1, inplace=True) - - assert_frame_equal( - left=expected_df, - right=temp_df, - check_exact=False, - check_dtype=False, - check_categorical=False - ) - - def test_reference_prescribing_linked_mnt_anon_data(self): - ''' - What this reference test is covering: - - one of the linked columns is in the spec, another is in DB - - anonymisation is done using "mountains" set - - NumberOfPaidItems is generated from a shifted normal distribution - - Note that prescribing dataset has duplicate categorical rows - ''' - - expected_df = pd.read_csv( - package_dir( - "core", "tests", "_reference_data", - "prescribing_anon_mnt_linked.csv"), - parse_dates=["PaidDateMonth"] - ) - - test_dict = { - "columns":{ - "HB2014":{"anonymising_set":"mountains"}, - "HB2014Name":{"anonymising_set":"mountains"}, - "GPPracticeName":{"anonymising_set":"mountains"}, - "NumberOfPaidItems":{"distribution":"normal"} - } - } - - temp_spec, temp_df = temp_exhibit( - filename="prescribing.csv", - test_spec_dict=test_dict - ) - - #save ID to tidy up temp columns created as part of testing - self._temp_tables.append(temp_spec["metadata"]["id"]) - - #sort column names to make sure they are the same - temp_df.sort_index(axis=1, inplace=True) - expected_df.sort_index(axis=1, inplace=True) - - assert_frame_equal( - left=expected_df, - right=temp_df, - check_exact=False, - check_dtype=False, - check_categorical=False - ) - - def test_reference_inpatient_anon_data(self): - ''' - What this reference test is covering: - - duplicates are removed - - manually change labels in Sex column (Female to A, Male to B) - - manually added derived column (avlos) - - removed linked columns from spec - - removed Scotland from HBs and deleted loc columns - - changed the totals for stays (100 000) and los (200 000) - - changed basic constraint to los >= stays - - DB is not used at all so no need for ID - - Note that when basic constraints are added, generated totals can - be different from those set in the spec as target sum is enforced - BEFORE basic constraints are adjusted. 
- ''' - - args = { - "command" : "fromspec", - "source" : Path(package_dir("sample", "_spec", "inpatients_demo.yml")), - "skip_columns" : [], - "verbose" : True, - } - - xA = tm.Exhibit(**args) - xA.read_spec() - if xA.validate_spec(): - xA.execute_spec() - - table_id = xA.spec_dict["metadata"]["id"] - - #save ID to tidy up temp columns created as part of testing - self._temp_tables.append(table_id) - - #sort column names to make sure they are the same - inpatients_anon.sort_index(axis=1, inplace=True) - xA.anon_df.sort_index(axis=1, inplace=True) - - # there is a quirk of how int is cast on Windows and Unix: int32 vs int64 - # see SO answer: - # Why do Pandas integer `dtypes` not behave the same on Unix and Windows? - assert_frame_equal( - left=inpatients_anon, - right=xA.anon_df, - check_exact=False, - check_dtype=False, - check_categorical=False - ) - - def test_reference_inpatient_il10_random_data(self): - ''' - What this reference test is covering: - - number of unique values exceeds inline limit in all linked columns - - anonymisation method is "random" - - non-linked categorical column (Sex) has missing data - - linked columns share missing categorical data - - Because by default the spec includes the basic constraints of los >= avlos, - if avlos is null (0.065 probability in source data) then los will also be null - in ~130 records. - ''' - - source_data_path = Path(package_dir("sample", "_data", "inpatients.csv")) - - # dayfirst=True would trigger warnings when encountering dates in Y-m-d format - test_dataframe = pd.read_csv( - source_data_path, - parse_dates=["quarter_date"], - ) - - # Modify test_dataframe to suit test conditions - # Gives us 500/10225 ~ 5% chance of missing data - rng = np.random.default_rng(seed=0) - rand_idx = rng.choice( - range(test_dataframe.shape[0]), - size=500, - replace=False) - - linked_cols = ["hb_code", "hb_name", "loc_code", "loc_name"] - test_dataframe.loc[rand_idx, linked_cols] = (np.NaN, np.NaN, np.NaN, np.NaN) - - # Gives us ~10% chance of missing data - rand_idx2 = rng.choice( - range(test_dataframe.shape[0]), - size=1000, - replace=False) - - na_cols = ["sex"] - test_dataframe.loc[rand_idx2, na_cols] = np.NaN - - # modify CLI namespace - fromdata_namespace = { - "source" : test_dataframe, - "inline_limit": 10, - } - - # modify spec - test_spec_dict = { - "metadata": {"number_of_rows": 2000, "random_seed": 2}, - "columns" : {"sex": {"cross_join_all_unique_values" : True}} - } - - temp_spec, temp_df = temp_exhibit( - fromdata_namespace=fromdata_namespace, - test_spec_dict=test_spec_dict - ) - - inpatients_anon_il10 = pd.read_csv( - package_dir( - "core", "tests", "_reference_data", - "inpatients_anon_rnd_il10.csv"), - parse_dates=["quarter_date"] - ) - - #save ID to tidy up temp columns created as part of testing - table_id = temp_spec["metadata"]["id"] - self._temp_tables.append(table_id) - - assert_frame_equal( - left=inpatients_anon_il10, - right=temp_df, - check_exact=False, - check_dtype=False, - check_categorical=False - ) - - def test_reference_inpatient_il50_random_data(self): - ''' - What this reference test is covering: - - number of unique values is within inline limit in all columns - - anonymisation method is "random" - - linked columns share missing categorical data - - manually change date frequency from QS to M - ''' - - rng = np.random.default_rng(seed=0) - - source_data_path = Path(package_dir("sample", "_data", "inpatients.csv")) - - test_dataframe = pd.read_csv( - source_data_path, - parse_dates=["quarter_date"], - ) - - # 
Modify test_dataframe to suit test conditions - rand_idx = rng.choice( - range(test_dataframe.shape[0]), - size=500, - replace=False) - - linked_cols = ["hb_code", "hb_name", "loc_code", "loc_name"] - test_dataframe.loc[rand_idx, linked_cols] = (np.NaN, np.NaN, np.NaN, np.NaN) - - # modify CLI namespace - fromdata_namespace = { - "source" : test_dataframe, - "inline_limit": 50, - } - - # modify spec - test_spec_dict = { - "metadata": {"number_of_rows": 2000}, - "columns" : {"quarter_date": - {"from" : "2018-01-01", "frequency": "M"} - } - } - - temp_spec, temp_df = temp_exhibit( - fromdata_namespace=fromdata_namespace, - test_spec_dict=test_spec_dict - ) - - inpatients_anon_il50 = pd.read_csv( - package_dir( - "core", "tests", "_reference_data", - "inpatients_anon_rnd_il50.csv"), - parse_dates=["quarter_date"] - ) - - #save ID to tidy up temp columns created as part of testing - table_id = temp_spec["metadata"]["id"] - self._temp_tables.append(table_id) - - assert_frame_equal( - left=inpatients_anon_il50, - right=temp_df, - check_exact=False, - check_dtype=False, - check_categorical=False - ) - - def test_reference_inpatient_il10_mountains_data(self): - ''' - What this reference test is covering: - - number of unique values exceeds inline limit in all columns - - anonymisation method is hierarchical "mountains" - - anon columns are specified using dot notation - - sex is a "complete" categorical column, but there will be gaps - where missind data is generated in other columns - categorical - values are generated first, and then "blanked" based on miss_pct - - only the most granular linked column has missing values - - avlos is not derived and is calculated "blindly" - ''' - - source_data_path = Path(package_dir("sample", "_data", "inpatients.csv")) - - test_dataframe = pd.read_csv( - source_data_path, - parse_dates=["quarter_date"], - ) - - # Modify test_dataframe to suit test conditions - rng = np.random.default_rng(seed=0) - rand_idx = rng.choice( - range(test_dataframe.shape[0]), - size=500, - replace=False) - - linked_cols = ["loc_code", "loc_name"] - test_dataframe.loc[rand_idx, linked_cols] = (np.NaN, np.NaN) - - # modify CLI namespace - fromdata_namespace = { - "source" : test_dataframe, - "inline_limit": 10, - } - - # Modify test_dataframe to suit test conditions - test_spec_dict = { - "metadata": - {"number_of_rows": 2000}, - "columns": { - "sex" : - {"cross_join_all_unique_values": True} - , - "hb_code": - {"anonymising_set":"mountains.range"} - , - "hb_name": - {"anonymising_set":"mountains.range"} - , - "loc_code": - {"anonymising_set":"mountains.peak"} - , - "loc_name": - {"anonymising_set":"mountains.peak"} - }, - "constraints": { - "basic_constraints" : {} - } - } - - temp_spec, temp_df = temp_exhibit( - fromdata_namespace=fromdata_namespace, - test_spec_dict=test_spec_dict - ) - - inpatients_anon_mnt_il10 = pd.read_csv( - package_dir( - "core", "tests", "_reference_data", - "inpatients_anon_mnt_il10.csv"), - parse_dates=["quarter_date"] - ) - - #save ID to tidy up temp columns created as part of testing - table_id = temp_spec["metadata"]["id"] - self._temp_tables.append(table_id) - - assert_frame_equal( - left=inpatients_anon_mnt_il10, - right=temp_df, - check_exact=False, - check_dtype=False, - check_categorical=False - ) - - def test_reference_inpatient_il50_mountains_data(self): - ''' - What this reference test is covering: - - number of unique values is within inline limit in all columns - - anonymisation method is hierarchical "mountains" - - linked columns share 
missing categorical data - ''' - - source_data_path = Path(package_dir("sample", "_data", "inpatients.csv")) - - test_dataframe = pd.read_csv( - source_data_path, - parse_dates=["quarter_date"], - ) - - # Modify test_dataframe to suit test conditions - rng = np.random.default_rng(seed=0) - rand_idx = rng.choice( - range(test_dataframe.shape[0]), - size=500, - replace=False) - - linked_cols = ["hb_code", "hb_name", "loc_code", "loc_name"] - test_dataframe.loc[rand_idx, linked_cols] = (np.NaN, np.NaN, np.NaN, np.NaN) - - # modify CLI namespace - fromdata_namespace = { - "source" : test_dataframe, - "inline_limit": 50, - } - - # modify spec - test_spec_dict = { - "metadata": - {"number_of_rows": 2000}, - "columns": { - "hb_code": - {"anonymising_set":"mountains"} - , - "hb_name": - {"anonymising_set":"mountains"} - , - "loc_code": - {"anonymising_set":"mountains"} - , - "loc_name": - {"anonymising_set":"mountains"} - }, - } - - temp_spec, temp_df = temp_exhibit( - fromdata_namespace=fromdata_namespace, - test_spec_dict=test_spec_dict - ) - - inpatients_anon_mnt_il50 = pd.read_csv( - package_dir( - "core", "tests", "_reference_data", - "inpatients_anon_mnt_il50.csv"), - parse_dates=["quarter_date"] - ) - - #save ID to tidy up temp columns created as part of testing - table_id = temp_spec["metadata"]["id"] - self._temp_tables.append(table_id) - - assert_frame_equal( - left=inpatients_anon_mnt_il50, - right=temp_df, - check_exact=False, - check_dtype=False, - check_categorical=False - ) - - def test_reference_inpatient_modified_linked_columns_scenario_2(self): - ''' - What this reference test is covering: - - scenario 2 - - custom value in one of the linked columns - - number of linked columns in spec is less than in original SQL - ''' - - source_data_path = Path(package_dir("sample", "_data", "inpatients.csv")) - - test_dataframe = pd.read_csv( - source_data_path, - parse_dates=["quarter_date"], - ) - - # modify CLI namespace - fromdata_namespace = { - "source" : test_dataframe, - } - - # modify spec - test_spec_dict = { - "metadata": {"number_of_rows": 2000, "random_seed": 0}, - "columns": { - "hb_name" : { - "uniques" : 2, - "original_values" : pd.DataFrame(data={ - "hb_name": ["PHS A&A", "NHS Borders", MISSING_DATA_STR], - "paired_hb_code": ["S08000015", "S08000016", MISSING_DATA_STR], - "probability_vector" : [0.5, 0.5, 0], - "avlos": [0.5, 0.5, 0], - "los": [0.5, 0.5, 0], - "stays": [0.5, 0.5, 0]}) - } - } - } - - temp_spec, temp_df = temp_exhibit( - fromdata_namespace=fromdata_namespace, - test_spec_dict=test_spec_dict, - ) - - #save ID to tidy up temp columns created as part of testing - table_id = temp_spec["metadata"]["id"] - self._temp_tables.append(table_id) - - self.assertCountEqual( - temp_df["hb_name"].unique(), - ["PHS A&A", "NHS Borders"]) - - def test_reference_inpatient_modified_linked_columns_scenario_3(self): - ''' - What this reference test is covering: - - scenario 3 - - custom value in one of the linked columns - - number of linked columns in spec is less than in original SQL - ''' - - source_data_path = Path(package_dir("sample", "_data", "inpatients.csv")) - - test_dataframe = pd.read_csv( - source_data_path, - parse_dates=["quarter_date"], - ) - - # modify CLI namespace - fromdata_namespace = { - "source" : test_dataframe, - "inline_limit": 50 - } - - # modify spec - test_spec_dict = { - "metadata": {"number_of_rows": 2000, "random_seed": 0}, - "columns": { - "loc_name" : { - "uniques" : 5, - "original_values" : pd.DataFrame(data={ - "loc_name": list("ABCDE") + 
[MISSING_DATA_STR], - "paired_loc_code": list("ABCDE") + [MISSING_DATA_STR], - "probability_vector" : [0.2] * 5 + [0], - "avlos": [0.2] * 5 + [0], - "los": [0.2] * 5 + [0], - "stays": [0.2] * 5 + [0]}) - } - } - } - - temp_spec, temp_df = temp_exhibit( - fromdata_namespace=fromdata_namespace, - test_spec_dict=test_spec_dict, - ) - - #save ID to tidy up temp columns created as part of testing - table_id = temp_spec["metadata"]["id"] - self._temp_tables.append(table_id) - - self.assertCountEqual(temp_df["loc_name"].unique(), list("ABCDE")) - - def test_reference_uuid_data(self): - ''' - What this reference test is covering: - - uuid column type - - generate_as_sequence, make_same and sorting custom actions - - no db - ''' - - args = { - "command" : "fromspec", - "source" : Path(package_dir("sample", "_spec", "uuid_demo.yml")), - "skip_columns" : [], - "verbose" : True, - } - - xA = tm.Exhibit(**args) - xA.read_spec() - if xA.validate_spec(): - xA.execute_spec() - - #sort column names to make sure they are the same - uuid_anon.sort_index(axis=1, inplace=True) - xA.anon_df.sort_index(axis=1, inplace=True) - - # there is a quirk of how int is cast on Windows and Unix: int32 vs int64 - # see SO answer: - # Why do Pandas integer `dtypes` not behave the same on Unix and Windows? - assert_frame_equal( - left=uuid_anon, - right=xA.anon_df, - check_exact=False, - check_dtype=False, - check_categorical=False - ) - -if __name__ == "__main__" and __package__ is None: - #overwrite __package__ builtin as per PEP 366 - __package__ = "exhibit" - unittest.main(warnings="ignore") +''' +Reference tests for the Exhibit package +''' + +# Standard library imports +import unittest +from pathlib import Path +import tempfile +from os.path import join +from collections import namedtuple + +# External imports +from pandas.testing import assert_frame_equal +import pandas as pd +import numpy as np + +# Exhibit imports +from exhibit.core.utils import package_dir +from exhibit.db import db_util +from exhibit.core.constants import MISSING_DATA_STR +from exhibit.sample.sample import inpatients_anon, uuid_anon + +# Module under test +from exhibit.core import exhibit as tm + +def replace_nested_dict_values(d1, d2): + ''' + Recursive replacement of dictionary values in matching keys + or adding new ones. 
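
A quick usage sketch (made-up spec fragment); the first dictionary is updated in
place, with nested keys merged rather than overwritten wholesale:

    base = {"metadata": {"number_of_rows": 100}, "columns": {"sex": {"uniques": 2}}}
    override = {"metadata": {"number_of_rows": 2000}}

    replace_nested_dict_values(base, override)
    # base["metadata"]["number_of_rows"] is now 2000; "columns" is left untouched
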
+ ''' + + for key2 in d2: + if key2 in d1: + if isinstance(d1[key2], dict): + replace_nested_dict_values(d1[key2], d2[key2]) + else: + d1[key2] = d2[key2] + else: + d1[key2] = d2[key2] + +def temp_exhibit( + filename="inpatients.csv", + fromdata_namespace=None, + fromspec_namespace=None, + test_spec_dict=None, + return_spec=True, + return_df=True, + ): + ''' + A helper method to generate and read custom specifications + + Parameters + ---------- + filename : str or pd.DataFrame + the .csv to use as the base for spec / df generation + fromdata_namespace : dict + dictionary with testing values for creating a spec + fromspec_namespace : dict + dictionary with testing values for running generation command + test_spec_dict : dict + dictionary with testing values for user spec: used to update the spec + generated from filename csv or DataFrame + return_spec : boolean + sometimes you only want to generate the csv and don't need the spec, + like in performance benchmark testing + return_df : boolean + sometimes you only want to generate a spec; if return_df is False + then the second element in the return tuple is None + + Returns + ------- + A named tuples with spec dict and the generated dataframe + ''' + + returnTuple = namedtuple("TestRun", ["temp_spec", "temp_df"]) + temp_spec = None + temp_df = None + + if isinstance(filename, dict) or \ + (isinstance(filename, str) and filename[-3:] == "yml"): + source = "yml" + else: + source = "csv" + + # function has five paths: + # 1) given .csv filename (or DataFrame) produce just a spec + # 2) given .csv filename (or DataFrame) produce a spec and a demo .csv + # 3) given a .yml filename (or dict) produce a demo .csv + # 4) given a .yml filename (or dict) produce a spec + # 5) given a .yml filename (or dict) produce a spec and a demo .csv + + # it's important to only generate appropriate parts because they are measured + # separetely in performance benchmarking tests. 
+ + # if source is data, we always produce a spec (can't have demo data without it) + if source == "csv": + + with tempfile.TemporaryDirectory() as td: + + temp_spec_name = "_.yml" + f_name = join(td, temp_spec_name) + + # for internal use when testing with a custom dataframe, not a static file + if isinstance(filename, pd.DataFrame): + default_data_path = filename + else: + default_data_path = Path(package_dir("sample", "_data", filename)) + + fromdata_defaults = { + "command" : "fromdata", + "source" : default_data_path, + "inline_limit" : 30, + "verbose" : True, + "output" : f_name, + "skip_columns" : [], + "equal_weights" : False, + "linked_columns" : None + } + + #Update namespaces + if fromdata_namespace: + fromdata_defaults.update(fromdata_namespace) + + xA = tm.Exhibit(**fromdata_defaults) + xA.read_data() + xA.generate_spec() + + if return_df: + + xA.write_spec() + fromspec_defaults = { + "command" : "fromspec", + "source" : Path(f_name), + "verbose" : True, + } + + if fromspec_namespace: + fromspec_defaults.update(fromspec_namespace) + + xA = tm.Exhibit(**fromspec_defaults) + xA.read_spec() + + if test_spec_dict: + replace_nested_dict_values(xA.spec_dict, test_spec_dict) + + if xA.validate_spec(): + xA.execute_spec() + + temp_df = xA.anon_df + + if return_spec: + temp_spec=xA.spec_dict + + if source == "yml": + + # for internal use when testing with a custom spec_dict, not a static file + if isinstance(filename, dict): + default_spec_path = filename + else: + default_spec_path = Path(package_dir("sample", "_spec", filename)) + + fromspec_defaults = { + "command" : "fromspec", + "source" : default_spec_path, + "verbose" : True, + } + + if fromspec_namespace: + fromspec_defaults.update(fromspec_namespace) + + xA = tm.Exhibit(**fromspec_defaults) + xA.read_spec() + + if test_spec_dict: + replace_nested_dict_values(xA.spec_dict, test_spec_dict) + + if return_spec: + temp_spec = xA.spec_dict + + if return_df: + + if xA.validate_spec(): + xA.execute_spec() + + temp_df = xA.anon_df + + return returnTuple(temp_spec, temp_df) + +class referenceTests(unittest.TestCase): + ''' + Main test suite; command line arguments are mocked + via patch context manager; internal intermediate functions + are mocked inside each test. 
+ ''' + + @classmethod + def setUpClass(cls): + ''' + Create a list of tables to drop after reference tests finish + ''' + + cls._temp_tables = [] + + @classmethod + def tearDownClass(cls): + ''' + Clean up local exhibit.db from temp tables + ''' + + db_util.drop_tables(cls._temp_tables) + + def test_reference_prescribing_non_linked_anon_data(self): + ''' + What this reference test is covering: + - paired 1:1 anonymisation set (birds) + - designating paired columns as complete columns + - unlinking of columns + ''' + + expected_df = pd.read_csv( + package_dir( + "core", "tests", "_reference_data", + "prescribing_anon_non_linked.csv"), + parse_dates=["PaidDateMonth"] + ) + + test_dict = { + "metadata":{"number_of_rows":1500}, + "columns":{ + "HB2014":{ + "cross_join_all_unique_values": True + }, + "HB2014Name":{ + "cross_join_all_unique_values": True + }, + "BNFItemCode":{"anonymising_set":"birds"}, + "BNFItemDescription":{"anonymising_set":"birds"}, + "GPPracticeName":{"anonymising_set":"random"} + }, + "linked_columns":[] + } + + temp_spec, temp_df = temp_exhibit( + filename="prescribing.csv", + test_spec_dict=test_dict + ) + + #save ID to tidy up temp columns created as part of testing + self._temp_tables.append(temp_spec["metadata"]["id"]) + + #sort column names to make sure they are the same + temp_df.sort_index(axis=1, inplace=True) + expected_df.sort_index(axis=1, inplace=True) + + assert_frame_equal( + left=expected_df, + right=temp_df, + check_exact=False, + check_dtype=False, + check_categorical=False + ) + + def test_reference_prescribing_linked_mnt_anon_data(self): + ''' + What this reference test is covering: + - one of the linked columns is in the spec, another is in DB + - anonymisation is done using "mountains" set + - NumberOfPaidItems is generated from a shifted normal distribution + + Note that prescribing dataset has duplicate categorical rows + ''' + + expected_df = pd.read_csv( + package_dir( + "core", "tests", "_reference_data", + "prescribing_anon_mnt_linked.csv"), + parse_dates=["PaidDateMonth"] + ) + + test_dict = { + "columns":{ + "HB2014":{"anonymising_set":"mountains"}, + "HB2014Name":{"anonymising_set":"mountains"}, + "GPPracticeName":{"anonymising_set":"mountains"}, + "NumberOfPaidItems":{"distribution":"normal"} + } + } + + temp_spec, temp_df = temp_exhibit( + filename="prescribing.csv", + test_spec_dict=test_dict + ) + + #save ID to tidy up temp columns created as part of testing + self._temp_tables.append(temp_spec["metadata"]["id"]) + + #sort column names to make sure they are the same + temp_df.sort_index(axis=1, inplace=True) + expected_df.sort_index(axis=1, inplace=True) + + assert_frame_equal( + left=expected_df, + right=temp_df, + check_exact=False, + check_dtype=False, + check_categorical=False + ) + + def test_reference_inpatient_anon_data(self): + ''' + What this reference test is covering: + - duplicates are removed + - manually change labels in Sex column (Female to A, Male to B) + - manually added derived column (avlos) + - removed linked columns from spec + - removed Scotland from HBs and deleted loc columns + - changed the totals for stays (100 000) and los (200 000) + - changed basic constraint to los >= stays + - DB is not used at all so no need for ID + + Note that when basic constraints are added, generated totals can + be different from those set in the spec as target sum is enforced + BEFORE basic constraints are adjusted. 
+ ''' + + args = { + "command" : "fromspec", + "source" : Path(package_dir("sample", "_spec", "inpatients_demo.yml")), + "skip_columns" : [], + "verbose" : True, + } + + xA = tm.Exhibit(**args) + xA.read_spec() + if xA.validate_spec(): + xA.execute_spec() + + table_id = xA.spec_dict["metadata"]["id"] + + #save ID to tidy up temp columns created as part of testing + self._temp_tables.append(table_id) + + #sort column names to make sure they are the same + inpatients_anon.sort_index(axis=1, inplace=True) + xA.anon_df.sort_index(axis=1, inplace=True) + + # there is a quirk of how int is cast on Windows and Unix: int32 vs int64 + # see SO answer: + # Why do Pandas integer `dtypes` not behave the same on Unix and Windows? + assert_frame_equal( + left=inpatients_anon, + right=xA.anon_df, + check_exact=False, + check_dtype=False, + check_categorical=False + ) + + def test_reference_inpatient_il10_random_data(self): + ''' + What this reference test is covering: + - number of unique values exceeds inline limit in all linked columns + - anonymisation method is "random" + - non-linked categorical column (Sex) has missing data + - linked columns share missing categorical data + + Because by default the spec includes the basic constraints of los >= avlos, + if avlos is null (0.065 probability in source data) then los will also be null + in ~130 records. + ''' + + source_data_path = Path(package_dir("sample", "_data", "inpatients.csv")) + + # dayfirst=True would trigger warnings when encountering dates in Y-m-d format + test_dataframe = pd.read_csv( + source_data_path, + parse_dates=["quarter_date"], + ) + + # Modify test_dataframe to suit test conditions + # Gives us 500/10225 ~ 5% chance of missing data + rng = np.random.default_rng(seed=0) + rand_idx = rng.choice( + range(test_dataframe.shape[0]), + size=500, + replace=False) + + linked_cols = ["hb_code", "hb_name", "loc_code", "loc_name"] + test_dataframe.loc[rand_idx, linked_cols] = (np.NaN, np.NaN, np.NaN, np.NaN) + + # Gives us ~10% chance of missing data + rand_idx2 = rng.choice( + range(test_dataframe.shape[0]), + size=1000, + replace=False) + + na_cols = ["sex"] + test_dataframe.loc[rand_idx2, na_cols] = np.NaN + + # modify CLI namespace + fromdata_namespace = { + "source" : test_dataframe, + "inline_limit": 10, + } + + # modify spec + test_spec_dict = { + "metadata": {"number_of_rows": 2000, "random_seed": 2}, + "columns" : {"sex": {"cross_join_all_unique_values" : True}} + } + + temp_spec, temp_df = temp_exhibit( + fromdata_namespace=fromdata_namespace, + test_spec_dict=test_spec_dict + ) + + inpatients_anon_il10 = pd.read_csv( + package_dir( + "core", "tests", "_reference_data", + "inpatients_anon_rnd_il10.csv"), + parse_dates=["quarter_date"] + ) + + #save ID to tidy up temp columns created as part of testing + table_id = temp_spec["metadata"]["id"] + self._temp_tables.append(table_id) + + assert_frame_equal( + left=inpatients_anon_il10, + right=temp_df, + check_exact=False, + check_dtype=False, + check_categorical=False + ) + + def test_reference_inpatient_il50_random_data(self): + ''' + What this reference test is covering: + - number of unique values is within inline limit in all columns + - anonymisation method is "random" + - linked columns share missing categorical data + - manually change date frequency from QS to M + ''' + + rng = np.random.default_rng(seed=0) + + source_data_path = Path(package_dir("sample", "_data", "inpatients.csv")) + + test_dataframe = pd.read_csv( + source_data_path, + parse_dates=["quarter_date"], + ) + + # 
Modify test_dataframe to suit test conditions + rand_idx = rng.choice( + range(test_dataframe.shape[0]), + size=500, + replace=False) + + linked_cols = ["hb_code", "hb_name", "loc_code", "loc_name"] + test_dataframe.loc[rand_idx, linked_cols] = (np.NaN, np.NaN, np.NaN, np.NaN) + + # modify CLI namespace + fromdata_namespace = { + "source" : test_dataframe, + "inline_limit": 50, + } + + # modify spec + test_spec_dict = { + "metadata": {"number_of_rows": 2000}, + "columns" : {"quarter_date": + {"from" : "2018-01-01", "frequency": "ME"} + } + } + + temp_spec, temp_df = temp_exhibit( + fromdata_namespace=fromdata_namespace, + test_spec_dict=test_spec_dict + ) + + inpatients_anon_il50 = pd.read_csv( + package_dir( + "core", "tests", "_reference_data", + "inpatients_anon_rnd_il50.csv"), + parse_dates=["quarter_date"] + ) + + #save ID to tidy up temp columns created as part of testing + table_id = temp_spec["metadata"]["id"] + self._temp_tables.append(table_id) + + assert_frame_equal( + left=inpatients_anon_il50, + right=temp_df, + check_exact=False, + check_dtype=False, + check_categorical=False + ) + + def test_reference_inpatient_il10_mountains_data(self): + ''' + What this reference test is covering: + - number of unique values exceeds inline limit in all columns + - anonymisation method is hierarchical "mountains" + - anon columns are specified using dot notation + - sex is a "complete" categorical column, but there will be gaps + where missind data is generated in other columns - categorical + values are generated first, and then "blanked" based on miss_pct + - only the most granular linked column has missing values + - avlos is not derived and is calculated "blindly" + ''' + + source_data_path = Path(package_dir("sample", "_data", "inpatients.csv")) + + test_dataframe = pd.read_csv( + source_data_path, + parse_dates=["quarter_date"], + ) + + # Modify test_dataframe to suit test conditions + rng = np.random.default_rng(seed=0) + rand_idx = rng.choice( + range(test_dataframe.shape[0]), + size=500, + replace=False) + + linked_cols = ["loc_code", "loc_name"] + test_dataframe.loc[rand_idx, linked_cols] = (np.NaN, np.NaN) + + # modify CLI namespace + fromdata_namespace = { + "source" : test_dataframe, + "inline_limit": 10, + } + + # Modify test_dataframe to suit test conditions + test_spec_dict = { + "metadata": + {"number_of_rows": 2000}, + "columns": { + "sex" : + {"cross_join_all_unique_values": True} + , + "hb_code": + {"anonymising_set":"mountains.range"} + , + "hb_name": + {"anonymising_set":"mountains.range"} + , + "loc_code": + {"anonymising_set":"mountains.peak"} + , + "loc_name": + {"anonymising_set":"mountains.peak"} + }, + "constraints": { + "basic_constraints" : {} + } + } + + temp_spec, temp_df = temp_exhibit( + fromdata_namespace=fromdata_namespace, + test_spec_dict=test_spec_dict + ) + + inpatients_anon_mnt_il10 = pd.read_csv( + package_dir( + "core", "tests", "_reference_data", + "inpatients_anon_mnt_il10.csv"), + parse_dates=["quarter_date"] + ) + + #save ID to tidy up temp columns created as part of testing + table_id = temp_spec["metadata"]["id"] + self._temp_tables.append(table_id) + + assert_frame_equal( + left=inpatients_anon_mnt_il10, + right=temp_df, + check_exact=False, + check_dtype=False, + check_categorical=False + ) + + def test_reference_inpatient_il50_mountains_data(self): + ''' + What this reference test is covering: + - number of unique values is within inline limit in all columns + - anonymisation method is hierarchical "mountains" + - linked columns share 
missing categorical data + ''' + + source_data_path = Path(package_dir("sample", "_data", "inpatients.csv")) + + test_dataframe = pd.read_csv( + source_data_path, + parse_dates=["quarter_date"], + ) + + # Modify test_dataframe to suit test conditions + rng = np.random.default_rng(seed=0) + rand_idx = rng.choice( + range(test_dataframe.shape[0]), + size=500, + replace=False) + + linked_cols = ["hb_code", "hb_name", "loc_code", "loc_name"] + test_dataframe.loc[rand_idx, linked_cols] = (np.NaN, np.NaN, np.NaN, np.NaN) + + # modify CLI namespace + fromdata_namespace = { + "source" : test_dataframe, + "inline_limit": 50, + } + + # modify spec + test_spec_dict = { + "metadata": + {"number_of_rows": 2000}, + "columns": { + "hb_code": + {"anonymising_set":"mountains"} + , + "hb_name": + {"anonymising_set":"mountains"} + , + "loc_code": + {"anonymising_set":"mountains"} + , + "loc_name": + {"anonymising_set":"mountains"} + }, + } + + temp_spec, temp_df = temp_exhibit( + fromdata_namespace=fromdata_namespace, + test_spec_dict=test_spec_dict + ) + + inpatients_anon_mnt_il50 = pd.read_csv( + package_dir( + "core", "tests", "_reference_data", + "inpatients_anon_mnt_il50.csv"), + parse_dates=["quarter_date"] + ) + + #save ID to tidy up temp columns created as part of testing + table_id = temp_spec["metadata"]["id"] + self._temp_tables.append(table_id) + + assert_frame_equal( + left=inpatients_anon_mnt_il50, + right=temp_df, + check_exact=False, + check_dtype=False, + check_categorical=False + ) + + def test_reference_inpatient_modified_linked_columns_scenario_2(self): + ''' + What this reference test is covering: + - scenario 2 + - custom value in one of the linked columns + - number of linked columns in spec is less than in original SQL + ''' + + source_data_path = Path(package_dir("sample", "_data", "inpatients.csv")) + + test_dataframe = pd.read_csv( + source_data_path, + parse_dates=["quarter_date"], + ) + + # modify CLI namespace + fromdata_namespace = { + "source" : test_dataframe, + } + + # modify spec + test_spec_dict = { + "metadata": {"number_of_rows": 2000, "random_seed": 0}, + "columns": { + "hb_name" : { + "uniques" : 2, + "original_values" : pd.DataFrame(data={ + "hb_name": ["PHS A&A", "NHS Borders", MISSING_DATA_STR], + "paired_hb_code": ["S08000015", "S08000016", MISSING_DATA_STR], + "probability_vector" : [0.5, 0.5, 0], + "avlos": [0.5, 0.5, 0], + "los": [0.5, 0.5, 0], + "stays": [0.5, 0.5, 0]}) + } + } + } + + temp_spec, temp_df = temp_exhibit( + fromdata_namespace=fromdata_namespace, + test_spec_dict=test_spec_dict, + ) + + #save ID to tidy up temp columns created as part of testing + table_id = temp_spec["metadata"]["id"] + self._temp_tables.append(table_id) + + self.assertCountEqual( + temp_df["hb_name"].unique(), + ["PHS A&A", "NHS Borders"]) + + def test_reference_inpatient_modified_linked_columns_scenario_3(self): + ''' + What this reference test is covering: + - scenario 3 + - custom value in one of the linked columns + - number of linked columns in spec is less than in original SQL + ''' + + source_data_path = Path(package_dir("sample", "_data", "inpatients.csv")) + + test_dataframe = pd.read_csv( + source_data_path, + parse_dates=["quarter_date"], + ) + + # modify CLI namespace + fromdata_namespace = { + "source" : test_dataframe, + "inline_limit": 50 + } + + # modify spec + test_spec_dict = { + "metadata": {"number_of_rows": 2000, "random_seed": 0}, + "columns": { + "loc_name" : { + "uniques" : 5, + "original_values" : pd.DataFrame(data={ + "loc_name": list("ABCDE") + 
[MISSING_DATA_STR], + "paired_loc_code": list("ABCDE") + [MISSING_DATA_STR], + "probability_vector" : [0.2] * 5 + [0], + "avlos": [0.2] * 5 + [0], + "los": [0.2] * 5 + [0], + "stays": [0.2] * 5 + [0]}) + } + } + } + + temp_spec, temp_df = temp_exhibit( + fromdata_namespace=fromdata_namespace, + test_spec_dict=test_spec_dict, + ) + + #save ID to tidy up temp columns created as part of testing + table_id = temp_spec["metadata"]["id"] + self._temp_tables.append(table_id) + + self.assertCountEqual(temp_df["loc_name"].unique(), list("ABCDE")) + + def test_reference_uuid_data(self): + ''' + What this reference test is covering: + - uuid column type + - generate_as_sequence, make_same and sorting custom actions + - no db + ''' + + args = { + "command" : "fromspec", + "source" : Path(package_dir("sample", "_spec", "uuid_demo.yml")), + "skip_columns" : [], + "verbose" : True, + } + + xA = tm.Exhibit(**args) + xA.read_spec() + if xA.validate_spec(): + xA.execute_spec() + + #sort column names to make sure they are the same + uuid_anon.sort_index(axis=1, inplace=True) + xA.anon_df.sort_index(axis=1, inplace=True) + + # there is a quirk of how int is cast on Windows and Unix: int32 vs int64 + # see SO answer: + # Why do Pandas integer `dtypes` not behave the same on Unix and Windows? + assert_frame_equal( + left=uuid_anon, + right=xA.anon_df, + check_exact=False, + check_dtype=False, + check_categorical=False + ) + +if __name__ == "__main__" and __package__ is None: + #overwrite __package__ builtin as per PEP 366 + __package__ = "exhibit" + unittest.main(warnings="ignore") diff --git a/exhibit/core/tests/test_spec.py b/exhibit/core/tests/test_spec.py index f5230d7..1a3a72a 100644 --- a/exhibit/core/tests/test_spec.py +++ b/exhibit/core/tests/test_spec.py @@ -69,7 +69,7 @@ def test_column_order_in_spec_is_correctly_based_on_types(self): "ints" : range(5), "floats": np.linspace(0, 1, num=5), "bools" : [True, True, True, True, False], - "dates" : pd.date_range(start="1/1/2018", periods=5, freq="M"), + "dates" : pd.date_range(start="1/1/2018", periods=5, freq="ME"), "cats" : list("ABCDE") }) diff --git a/exhibit/core/tests/test_utils.py b/exhibit/core/tests/test_utils.py index 4e7f1b3..52585a0 100644 --- a/exhibit/core/tests/test_utils.py +++ b/exhibit/core/tests/test_utils.py @@ -87,13 +87,13 @@ def test_date_frequency_guesser(self): returns correct values. 
''' - test_frequencies = ["D", "M", "MS", "Q", "QS", "BA-MAR"] + test_frequencies = ["D", "ME", "MS", "QE", "QS", "BYE-MAR"] test_cases = [pd.Series(pd.date_range(start="2015/01/01", periods=12, freq=f)) for f in test_frequencies] result = [tm.guess_date_frequency(x) for x in test_cases] - expected = ["D", "M", "MS", "Q", "QS", "YS"] + expected = ["D", "ME", "MS", "QE", "QS", "YS"] self.assertEqual(result, expected) @@ -154,7 +154,8 @@ def test_float_or_int(self): ''' test_series_1 = pd.Series([1, 2, 3, 4, 5, 0.0]) - test_series_2 = pd.Series([1, pd.NA, 2, 3]) + # default dtype for the below range is object rather than int64 + test_series_2 = pd.Series([1, pd.NA, 2, 3], dtype="Int64") test_series_3 = pd.Series([0.1, 0.2, 3, 4]) self.assertTrue(tm.float_or_int(test_series_1), "integer") diff --git a/exhibit/core/utils.py b/exhibit/core/utils.py index 95a7dae..5323cdc 100644 --- a/exhibit/core/utils.py +++ b/exhibit/core/utils.py @@ -208,10 +208,10 @@ def guess_date_frequency(timeseries): for period_range, period_alias in aliases.items(): if first_period in period_range: - # decide whether it's period start (QS) or end (Q) + # decide whether it's period start or end (M/Q/YE) if period_alias in ["MS", "QS"]: if not (timeseries.dt.day == 1).all(): - return period_alias[0] + return period_alias[0] + "E" return period_alias return None #pragma: no cover
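
For reference, a minimal standalone sketch of the frequency-alias rename that the
test and guess_date_frequency changes above track (assuming pandas >= 2.2, where
the period-end aliases gained an "E" suffix and the old single-letter forms are
deprecated):

    import pandas as pd

    # month-end and quarter-end ranges under the new aliases; the old "M" / "Q"
    # aliases still work in pandas 2.2 but emit a FutureWarning
    month_end = pd.date_range(start="2018-01-01", periods=5, freq="ME")
    quarter_end = pd.date_range(start="2018-01-01", periods=4, freq="QE")

    # period-start aliases such as "MS" and "QS" are unchanged, which is why the
    # utils change only appends "E" for the period-end case
    month_start = pd.date_range(start="2018-01-01", periods=5, freq="MS")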