diff --git a/exhibit/core/generate/categorical.py b/exhibit/core/generate/categorical.py
index bebb74d..2bdd7ab 100644
--- a/exhibit/core/generate/categorical.py
+++ b/exhibit/core/generate/categorical.py
@@ -219,6 +219,10 @@ def _generate_anon_series(self, col_name):
         # ignoring the standard date genderation parameters, like from / to.        
         anon_set = col_attrs.get("anonymising_set", None)
 
+        # Users can pass custom functions to generate categorical / date columns
+        if callable(anon_set):
+            return self._generate_using_custom_function(col_name, anon_set)
+
         # check if the anonymising set is a SQL statement starting with SELECT
         # note that for dates, all other parameters, like from / to will be ignored
         if anon_set is not None and anon_set.strip().upper()[:6] == "SELECT":
@@ -501,8 +505,14 @@ def _generate_using_external_table(self, col_name, anon_set):
         # duplicates in case user didn't specify DISTINC in his SQL query;
         # the anon_df would typically be from UUIDs that are generated before
         # categorical columns.
+
+        # self.anon_df is what is generated BEFORE categorical columns, e.g UUID columns
         if self.anon_df is None or self.anon_df.empty:
-            existing_data = pd.concat(self.generated_dfs, axis=1)
+            # self.generated_dfs has cat. columns generated BEFORE this particular column
+            if not self.generated_dfs: #pragma: no cover
+                existing_data = pd.DataFrame()
+            else:
+                existing_data = pd.concat(self.generated_dfs, axis=1)
         else:
             existing_data = pd.concat(self.generated_dfs + [self.anon_df], axis=1)
 
@@ -600,3 +610,36 @@ def _generate_using_external_table(self, col_name, anon_set):
             final_result = final_result.astype("datetime64[ns]")
 
         return final_result
+    
+    def _generate_using_custom_function(self, col_name, anon_set):
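+        '''
+        Build a column by calling a user-supplied function once per row.
+
+        Parameters
+        ----------
+        col_name : str
+            name given to the returned pd.Series
+        anon_set : callable
+            one-argument function; its return value fills the column
+        '''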
+        # self.anon_df is what is generated BEFORE categorical columns, e.g UUID columns
+        if self.anon_df is None or self.anon_df.empty:
+            # self.generated_dfs has cat. columns generated BEFORE this particular column
+            if not self.generated_dfs:
+                existing_data = pd.DataFrame()
+            else:
+                existing_data = pd.concat(self.generated_dfs, axis=1)
+        else: #pragma: no cover
+            existing_data = pd.concat(self.generated_dfs + [self.anon_df], axis=1)
+
+        if existing_data.empty:
+            result = pd.Series(
+                data=[anon_set(pd.Series) for _ in range(self.num_rows)],
+                name=col_name
+            )
+            return result
+
+        result = existing_data.apply(anon_set, axis=1)
+        result.name = col_name
+
+        return result
diff --git a/exhibit/core/generate/tests/test_categorical.py b/exhibit/core/generate/tests/test_categorical.py
index e838d1f..c3e26d5 100644
--- a/exhibit/core/generate/tests/test_categorical.py
+++ b/exhibit/core/generate/tests/test_categorical.py
@@ -3,6 +3,7 @@
 '''
 
 # Standard library imports
+import datetime
 import unittest
 import tempfile
 from unittest.mock import Mock, patch
@@ -532,6 +533,124 @@ def test_date_column_with_impossible_combination_of_from_to_and_period(self):
 
         self.assertWarns(RuntimeWarning, gen.generate)
 
+    def test_generate_column_with_custom_function_in_anonymised_set(self):
+        '''
+        This option is only valid for when Exhibit is used as a script. For
+        specification-based generation, use custom ML models. Note that 
+        while numerical weights are respected, probability vectors are not.
+        '''
+
+        def _generate_spam(_):
+            '''
+            Basic function to generate menu items in a fictitious bistro.
+
+            Parameters
+            ----------
+            _ : None
+                the anonymising_set function returns one value at a time
+                and has access to the current row in the DF generated so far.
+                This argument is mandatory to include, even if it's unused.
+
+            Returns
+            ----------
+            Scalar value
+            '''
+
+            rng = np.random.default_rng()
+            val = rng.choice([
+                "Egg and bacon", "Egg, sausage, and bacon", "Egg and Spam",
+                "Egg, bacon, and Spam", "Egg, bacon, sausage, and Spam",
+                "Spam, bacon, sausage, and Spam", "Lobster Thermidor",
+            ])
+
+            return val
+
+        test_dict = {
+            "_rng" : np.random.default_rng(seed=0),
+            "metadata": {
+                "categorical_columns" : ["menu"],
+                "inline_limit" : 5,
+                "id" : "main"
+                },
+            "columns": {
+                "menu": {
+                    "type": "categorical",
+                    "uniques" : 7,
+                    "original_values" : pd.DataFrame(),
+                    "paired_columns": None,
+                    "anonymising_set" : _generate_spam,
+                    "cross_join_all_unique_values" : False,
+                },
+            }
+        }
+
+        gen = tm.CategoricalDataGenerator(spec_dict=test_dict, core_rows=50)
+        result = gen.generate()
+
+        self.assertIsInstance(result, pd.DataFrame)
+        self.assertEqual(result.shape[0], 50)
+
+    def test_generate_column_with_custom_date_function_in_anonymised_set(self):
+        '''
+        This option is only valid for when Exhibit is used as a script. For
+        specification-based generation, use custom ML models. Note that 
+        while numerical weights are respected, probability vectors are not.
+        '''
+
+        def _increment_date(row):
+            '''
+            Basic function to increase a date by a random number of days (1 to 9).
+
+            Parameters
+            ----------
+            row : pd.Series
+                the anonymising_set function returns one value at a time
+                and has access to the current row in the DF generated so far.
+                This argument is mandatory to include, even if it's unused.
+
+            Returns
+            ----------
+            Scalar value
+            '''
+            rng = np.random.default_rng()
+            cur_date = row["date"]
+            new_date = cur_date + datetime.timedelta(days=int(rng.integers(1, 10)))
+
+            return new_date
+
+        test_dict = {
+            "_rng" : np.random.default_rng(seed=0),
+            "metadata": {
+                "date_columns" : ["date", "future_date"],
+                "inline_limit" : 5,
+                "id" : "main"
+                },
+            "columns": {
+                "date": {
+                    "type": "date",
+                    "from": "2023-01-01",
+                    "to"  : "2024-01-01",
+                    "uniques" : 50,
+                    "frequency" : "D",
+                    "cross_join_all_unique_values" : False,
+                },
+                "future_date": {
+                    "type": "date",
+                    "from": "2023-01-01",
+                    "to"  : "2024-01-01",
+                    "uniques" : 50,
+                    "frequency" : "D",
+                    "cross_join_all_unique_values" : False,
+                    "anonymising_set" : _increment_date
+                }
+            }
+        }
+
+        gen = tm.CategoricalDataGenerator(spec_dict=test_dict, core_rows=50)
+        result = gen.generate()
+
+        self.assertTrue((result["future_date"] > result["date"]).all())
+
 if __name__ == "__main__" and __package__ is None:
     #overwrite __package__ builtin as per PEP 366
     __package__ = "exhibit"
diff --git a/exhibit/core/generate/weights.py b/exhibit/core/generate/weights.py
index 416ff5e..c888a1f 100644
--- a/exhibit/core/generate/weights.py
+++ b/exhibit/core/generate/weights.py
@@ -180,9 +180,19 @@ def target_columns_for_weights_table(spec_dict):
     cat_cols = spec_dict["metadata"]["categorical_columns"] #includes linked
     cat_cols_set = set(cat_cols)
 
-    #drop paired columns and regex columns
+    #drop columns (like paired / regex columns) that we don't expect to have num. weights
     for cat_col in cat_cols:
         anon_set = spec_dict["columns"][cat_col]["anonymising_set"]
+
+        # if we're missing original_values, there can be no weights
+        orig_vals = spec_dict["columns"][cat_col]["original_values"]
+        if orig_vals is None or (isinstance(orig_vals, pd.DataFrame) and orig_vals.empty): #pragma: no cover
+            cat_cols_set.remove(cat_col)
+            continue
+
+        # skip the checks for custom functions
+        if callable(anon_set):
+            continue
         if (
             is_paired(spec_dict, cat_col) or
             # we keep the columns if they are in fixed sets or have custom SQL;
diff --git a/exhibit/core/tests/test_spec.py b/exhibit/core/tests/test_spec.py
index e716a8e..94b8ad3 100644
--- a/exhibit/core/tests/test_spec.py
+++ b/exhibit/core/tests/test_spec.py
@@ -147,6 +147,70 @@ def test_categorical_column_initialised_from_list(self):
         anon_df = exhibit_data.generate()
 
         self.assertEqual(anon_df.shape, (100, 1))
+
+    def test_mix_of_categorical_and_numerical_columns_with_incomplete_weights(self):
+        '''
+        This test covers both categorical and continuous column generation.
+
+        Remember that weights are relative to each other, meaning that if we provide
+        weights for just one value, it doesn't matter because it has no reference point.
+        If we provide weights for two values, they will be rescaled to sum to 1, while
+        other values without weights, will be treated as 1, meaning providing incomplete
+        weights will lead to smaller values relative to missing values. 
+        '''
+
+        def _generate_spam(_):
+            '''
+            Basic function to generate menu items in a fictitious bistro.
+
+            Parameters
+            ----------
+            _ : None
+                the anonymising_set function returns one value at a time
+                and has access to the current row in the DF generated so far.
+                This argument is mandatory to include, even if it's unused.
+
+            Returns
+            ----------
+            Scalar value
+            '''
+
+            rng = np.random.default_rng()
+            val = rng.choice([
+                "Egg and bacon", "Egg, sausage, and bacon", "Egg and Spam",
+                "Egg, bacon, and Spam", "Egg, bacon, sausage, and Spam",
+                "Spam, bacon, sausage, and Spam", "Lobster Thermidor",
+            ])
+
+            return val
+
+        spec = tm.Spec()
+        spec_dict = spec.generate()
+
+        spec_dict["metadata"]["number_of_rows"] = 50
+        spec_dict["metadata"]["categorical_columns"] = ["menu"]
+        spec_dict["metadata"]["numerical_columns"] = ["price"]
+        spec_dict["metadata"]["id"] = "main"
+
+        menu_df = pd.DataFrame(data={
+            "menu" : ["Egg and bacon", "Lobster Thermidor", "Missing Data"],
+            "price": [0.5, 0.5, 0.0]
+        })
+
+        spec_dict["columns"]["menu"] = tm.CategoricalColumn("menu", uniques=7, original_values=menu_df, anon_set=_generate_spam)
+        spec_dict["columns"]["price"] = tm.NumericalColumn(distribution_parameters={"target_sum" : 1000, "dispersion": 0.2})
+
+        exhibit_data = xbt.Exhibit(command="fromspec", source=spec_dict, output="dataframe")
+        anon_df = exhibit_data.generate()
+
+        test_items = ["Egg and bacon", "Lobster Thermidor"]
+
+        # check that the average price of the two test items is about half the rest
+        self.assertAlmostEqual(
+            anon_df[anon_df["menu"].isin(test_items)]["price"].mean() * 2,
+            anon_df[~anon_df["menu"].isin(test_items)]["price"].mean(),
+            delta=3
+        )
             
 if __name__ == "__main__" and __package__ is None:
     #overwrite __package__ builtin as per PEP 366
diff --git a/exhibit/core/validator.py b/exhibit/core/validator.py
index 9cbf36b..24a13ad 100644
--- a/exhibit/core/validator.py
+++ b/exhibit/core/validator.py
@@ -206,6 +206,9 @@ def validate_anonymising_set_length(self, spec_dict=None):
                 attr="anonymising_set",
                 col_names=True,
                 types=["categorical"]):
+            # ignore anonymising_sets that have custom functions 
+            if callable(v):
+                return True
             
             if v.split(".")[0] in self.fixed_sql_sets:
                 col_uniques = spec_dict["columns"][c]["uniques"]
diff --git a/recipes/Create peer groups.ipynb b/recipes/Create peer groups.ipynb
index 36ab791..faf7ab5 100644
--- a/recipes/Create peer groups.ipynb	
+++ b/recipes/Create peer groups.ipynb	
@@ -59,7 +59,7 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <td>0</td>\n",
+       "      <th>0</th>\n",
        "      <td>2018-12-31</td>\n",
        "      <td>S08000015</td>\n",
        "      <td>NHS Ayrshire &amp; Arran</td>\n",
@@ -73,7 +73,7 @@
        "      <td>20-29</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <td>1</td>\n",
+       "      <th>1</th>\n",
        "      <td>2018-03-31</td>\n",
        "      <td>S08000015</td>\n",
        "      <td>NHS Ayrshire &amp; Arran</td>\n",
@@ -87,7 +87,7 @@
        "      <td>0-9</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <td>2</td>\n",
+       "      <th>2</th>\n",
        "      <td>2018-06-30</td>\n",
        "      <td>S08000015</td>\n",
        "      <td>NHS Ayrshire &amp; Arran</td>\n",
@@ -101,7 +101,7 @@
        "      <td>0-9</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <td>3</td>\n",
+       "      <th>3</th>\n",
        "      <td>2018-09-30</td>\n",
        "      <td>S08000015</td>\n",
        "      <td>NHS Ayrshire &amp; Arran</td>\n",
@@ -115,7 +115,7 @@
        "      <td>0-9</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <td>4</td>\n",
+       "      <th>4</th>\n",
        "      <td>2018-12-31</td>\n",
        "      <td>S08000015</td>\n",
        "      <td>NHS Ayrshire &amp; Arran</td>\n",
@@ -358,27 +358,27 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <td>0</td>\n",
+       "      <th>0</th>\n",
        "      <td>Arran War Memorial Hospital</td>\n",
        "      <td>Arran War Memorial Hospital</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <td>1</td>\n",
+       "      <th>1</th>\n",
        "      <td>Arran War Memorial Hospital</td>\n",
        "      <td>West Glasgow</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <td>2</td>\n",
+       "      <th>2</th>\n",
        "      <td>Arran War Memorial Hospital</td>\n",
        "      <td>Lorn &amp; Islands Hospital</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <td>3</td>\n",
+       "      <th>3</th>\n",
        "      <td>NHS Ayrshire &amp; Arran</td>\n",
        "      <td>NHS Ayrshire &amp; Arran</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <td>4</td>\n",
+       "      <th>4</th>\n",
        "      <td>NHS Ayrshire &amp; Arran</td>\n",
        "      <td>NHS Greater Glasgow &amp; Clyde</td>\n",
        "    </tr>\n",
@@ -481,7 +481,7 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <td>0</td>\n",
+       "      <th>0</th>\n",
        "      <td>Aberdeen Royal Infirmary</td>\n",
        "      <td>2018-03-31</td>\n",
        "      <td>All Daycases</td>\n",
@@ -491,7 +491,7 @@
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <td>1</td>\n",
+       "      <th>1</th>\n",
        "      <td>Aberdeen Royal Infirmary</td>\n",
        "      <td>2018-03-31</td>\n",
        "      <td>All Daycases</td>\n",
@@ -501,7 +501,7 @@
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <td>2</td>\n",
+       "      <th>2</th>\n",
        "      <td>Aberdeen Royal Infirmary</td>\n",
        "      <td>2018-03-31</td>\n",
        "      <td>All Daycases</td>\n",
@@ -511,7 +511,7 @@
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <td>3</td>\n",
+       "      <th>3</th>\n",
        "      <td>Aberdeen Royal Infirmary</td>\n",
        "      <td>2018-03-31</td>\n",
        "      <td>All Daycases</td>\n",
@@ -521,7 +521,7 @@
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <td>4</td>\n",
+       "      <th>4</th>\n",
        "      <td>Aberdeen Royal Infirmary</td>\n",
        "      <td>2018-03-31</td>\n",
        "      <td>All Daycases</td>\n",
@@ -559,7 +559,7 @@
     "peer_aggregated_df = (pd\n",
     "    .merge(peer_reference_df, source, how=\"left\", left_on=\"peer_location\", right_on=\"loc_name\")\n",
     "    .drop(columns=\"avlos\")   \n",
-    "    .groupby([\"peer_group_name\"] + join_cols).sum()\n",
+    "    .groupby([\"peer_group_name\"] + join_cols)[[\"stays\", \"los\"]].sum()\n",
     "    .rename(columns=lambda x: x + \"_peer\")\n",
     "    .reset_index()\n",
     "    .rename(columns={\"peer_group_name\" : \"loc_name\"}))\n",
@@ -634,7 +634,7 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <td>0</td>\n",
+       "      <th>0</th>\n",
        "      <td>2018-12-31</td>\n",
        "      <td>S08000015</td>\n",
        "      <td>NHS Ayrshire &amp; Arran</td>\n",
@@ -651,7 +651,7 @@
        "      <td>3.333333</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <td>1</td>\n",
+       "      <th>1</th>\n",
        "      <td>2018-03-31</td>\n",
        "      <td>S08000015</td>\n",
        "      <td>NHS Ayrshire &amp; Arran</td>\n",
@@ -668,7 +668,7 @@
        "      <td>2.151188</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <td>2</td>\n",
+       "      <th>2</th>\n",
        "      <td>2018-06-30</td>\n",
        "      <td>S08000015</td>\n",
        "      <td>NHS Ayrshire &amp; Arran</td>\n",
@@ -685,7 +685,7 @@
        "      <td>2.351852</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <td>3</td>\n",
+       "      <th>3</th>\n",
        "      <td>2018-09-30</td>\n",
        "      <td>S08000015</td>\n",
        "      <td>NHS Ayrshire &amp; Arran</td>\n",
@@ -702,7 +702,7 @@
        "      <td>2.494975</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <td>4</td>\n",
+       "      <th>4</th>\n",
        "      <td>2018-12-31</td>\n",
        "      <td>S08000015</td>\n",
        "      <td>NHS Ayrshire &amp; Arran</td>\n",
@@ -758,9 +758,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Data Analysis [conda env:data]",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
-   "name": "conda-env-data-data"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -772,7 +772,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.4"
+   "version": "3.10.13"
   }
  },
  "nbformat": 4,
diff --git a/recipes/Using custom functions in anonymising sets.ipynb b/recipes/Using custom functions in anonymising sets.ipynb
new file mode 100644
index 0000000..bf61190
--- /dev/null
+++ b/recipes/Using custom functions in anonymising sets.ipynb	
@@ -0,0 +1,464 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "60cfa429-cd74-459f-9be6-6c306b668989",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import datetime\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from faker import Faker\n",
+    "\n",
+    "from exhibit import exhibit as xbt\n",
+    "from exhibit.core.spec import Spec, UUIDColumn, CategoricalColumn, DateColumn"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f2859c22-ec83-4c7f-8346-4c4990cce9cb",
+   "metadata": {},
+   "source": [
+    "### User-defined functions with custom logic to generate categorical and date values\n",
+    "When using Exhibit as an importable library and building the specification using Python objects, you can use custom functions in place of the `anonymising_set` attribute for categorical and date columns. This feature provides a lot of flexibility when it comes to data generation and provides a quick and easy way to augment the dataset without adding custom ML models. Custom functions have only two restrictions - they must define an argument for dataset row (even if it's unused) and return a single value. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "04e7a653-ccf0-4976-a032-eac67eeb31d1",
+   "metadata": {},
+   "source": [
+    "#### Basic example using dates"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "6bca2837-e61a-4e84-bd9a-0dee71ecd27c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def increment_date(row):\n",
+    "    '''\n",
+    "    Basic function to increase a date by a random number of days between 1 and 9\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    row : pd.Series\n",
+    "        the anonymising_set function returns one value at a time\n",
+    "        and has access to the current row in the DF generated so far.\n",
+    "        This argument is mandatory to include, even if it's unused.\n",
+    "\n",
+    "    Returns\n",
+    "    ----------\n",
+    "    Scalar value\n",
+    "    '''\n",
+    "    \n",
+    "    rng = np.random.default_rng()\n",
+    "\n",
+    "    # note the use of row argument to get the value in the date column that was already in the generated dataset\n",
+    "    # columns are generated in the order they appear (or are added) in the spec so if we reversed the order, we\n",
+    "    # wouldn't have been able to access the \"date\" column. Similarly, numerical and geo columns are generated after\n",
+    "    # the categorical ones so those values are also not yet available.\n",
+    "    cur_date = row[\"date\"]\n",
+    "    new_date = cur_date + datetime.timedelta(days=int(rng.integers(1, 10)))\n",
+    "\n",
+    "    return new_date"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "0edff8f0-47e3-4729-b87e-7c220f76016f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spec = Spec()\n",
+    "spec_dict = spec.generate()\n",
+    "\n",
+    "spec_dict[\"metadata\"][\"number_of_rows\"] = 100\n",
+    "spec_dict[\"metadata\"][\"date_columns\"] = [\"date\", \"future_date\"]\n",
+    "spec_dict[\"metadata\"][\"id\"] = \"main\"\n",
+    "\n",
+    "spec_dict[\"columns\"][\"date\"]        = DateColumn(\"date\", uniques=200, from_date=\"2023-01-01\", cross_join=False)\n",
+    "spec_dict[\"columns\"][\"future_date\"] = DateColumn(\"future_date\", uniques=200, cross_join=False, anonymising_set=increment_date)\n",
+    "\n",
+    "exhibit_data = xbt.Exhibit(command=\"fromspec\", source=spec_dict, output=\"dataframe\")\n",
+    "anon_df = exhibit_data.generate()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "1ae914dc-8ad0-4f06-b734-33d367eaeb6a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>date</th>\n",
+       "      <th>future_date</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>2023-02-20</td>\n",
+       "      <td>2023-02-28</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2023-01-17</td>\n",
+       "      <td>2023-01-20</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2023-02-25</td>\n",
+       "      <td>2023-03-04</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>2023-01-09</td>\n",
+       "      <td>2023-01-14</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>2023-02-15</td>\n",
+       "      <td>2023-02-21</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>95</th>\n",
+       "      <td>2023-01-06</td>\n",
+       "      <td>2023-01-13</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>96</th>\n",
+       "      <td>2023-03-17</td>\n",
+       "      <td>2023-03-21</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>97</th>\n",
+       "      <td>2023-06-22</td>\n",
+       "      <td>2023-06-27</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>98</th>\n",
+       "      <td>2023-03-13</td>\n",
+       "      <td>2023-03-20</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>99</th>\n",
+       "      <td>2023-05-05</td>\n",
+       "      <td>2023-05-09</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>100 rows × 2 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         date future_date\n",
+       "0  2023-02-20  2023-02-28\n",
+       "1  2023-01-17  2023-01-20\n",
+       "2  2023-02-25  2023-03-04\n",
+       "3  2023-01-09  2023-01-14\n",
+       "4  2023-02-15  2023-02-21\n",
+       "..        ...         ...\n",
+       "95 2023-01-06  2023-01-13\n",
+       "96 2023-03-17  2023-03-21\n",
+       "97 2023-06-22  2023-06-27\n",
+       "98 2023-03-13  2023-03-20\n",
+       "99 2023-05-05  2023-05-09\n",
+       "\n",
+       "[100 rows x 2 columns]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "anon_df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9c19d01f-d578-4945-a8a7-50e05b502083",
+   "metadata": {},
+   "source": [
+    "#### Using external libraries to generate realistic data\n",
+    "Faker is a well-known library with fake data used for testing purposes. It has a number of providers and fake datasets. In this example, we'll use Faker to generate the name and address details and augment them with a unique ID and a smoker attribute using Exhibit."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "7e0eb2de-53a4-4bb2-82d2-725a65b11ee3",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'Arthur Washington'"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "fake = Faker()\n",
+    "fake.name()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "e021381a-9431-4315-8d44-0f89150d0fa9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# remember that we need to include a function argument for the dataset row, even if it's unused.\n",
+    "# under the hood, Exhibit will attempt to pass the function to Pandas' apply so if you don't include\n",
+    "# a placeholder argument, you will get an error.\n",
+    "def fake_name(_):\n",
+    "    return fake.name()\n",
+    "\n",
+    "def fake_address(_):\n",
+    "    return fake.address()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "a9349232-1076-4788-99ff-0d6499cf7721",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spec = Spec()\n",
+    "spec_dict = spec.generate()\n",
+    "\n",
+    "spec_dict[\"metadata\"][\"number_of_rows\"] = 100\n",
+    "spec_dict[\"metadata\"][\"uuid_columns\"] = [\"id\"]\n",
+    "spec_dict[\"metadata\"][\"categorical_columns\"] = [\"name\", \"address\", \"smoker\"]\n",
+    "spec_dict[\"metadata\"][\"id\"] = \"main\"\n",
+    "\n",
+    "smoker_data = pd.DataFrame(data={\n",
+    "    \"smoker\":             [\"Y\", \"N\", \"No Answer\", \"Missing Data\"],\n",
+    "    \"probability_vector\": [0.2, 0.7, 0.1, 0]\n",
+    "})\n",
+    "\n",
+    "spec_dict[\"columns\"][\"id\"]      = UUIDColumn(anon_set=\"range\")\n",
+    "spec_dict[\"columns\"][\"name\"]    = CategoricalColumn(\"name\", uniques=100, original_values=None, anon_set=fake_name)\n",
+    "spec_dict[\"columns\"][\"address\"] = CategoricalColumn(\"address\", uniques=100, original_values=None, anon_set=fake_address, miss_proba=0.1)\n",
+    "spec_dict[\"columns\"][\"smoker\"]  = CategoricalColumn(\"smoker\", uniques=3, original_values=smoker_data)\n",
+    "\n",
+    "exhibit_data = xbt.Exhibit(command=\"fromspec\", source=spec_dict, output=\"dataframe\")\n",
+    "anon_df = exhibit_data.generate()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "587f7fa3-b648-44f3-8b8a-14b5075eded2",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>name</th>\n",
+       "      <th>address</th>\n",
+       "      <th>smoker</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>22</td>\n",
+       "      <td>Mckenzie Cruz</td>\n",
+       "      <td>762 Baker Point\\nPort Kevin, MN 42282</td>\n",
+       "      <td>N</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>64</td>\n",
+       "      <td>Michael Williams</td>\n",
+       "      <td>481 Madison Fords\\nNew Donnaview, CO 27959</td>\n",
+       "      <td>N</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>11</td>\n",
+       "      <td>Jeanne Smith</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Y</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>46</td>\n",
+       "      <td>Joanna Franklin</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>No Answer</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>82</td>\n",
+       "      <td>Daniel Martinez</td>\n",
+       "      <td>479 Jean Falls Suite 185\\nDeanbury, WV 72875</td>\n",
+       "      <td>N</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>95</th>\n",
+       "      <td>5</td>\n",
+       "      <td>Jack Harrison</td>\n",
+       "      <td>Unit 0802 Box 5382\\nDPO AP 08329</td>\n",
+       "      <td>N</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>96</th>\n",
+       "      <td>40</td>\n",
+       "      <td>Courtney Sanchez</td>\n",
+       "      <td>04326 Wallace Circles\\nNorth Anthonybury, IN 8...</td>\n",
+       "      <td>N</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>97</th>\n",
+       "      <td>56</td>\n",
+       "      <td>Thomas Anderson</td>\n",
+       "      <td>28821 Clark Drive Apt. 170\\nPort John, CO 44092</td>\n",
+       "      <td>N</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>98</th>\n",
+       "      <td>76</td>\n",
+       "      <td>Julie Flowers</td>\n",
+       "      <td>74248 Ball Land Apt. 027\\nPowersfurt, RI 70556</td>\n",
+       "      <td>N</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>99</th>\n",
+       "      <td>15</td>\n",
+       "      <td>Jennifer King</td>\n",
+       "      <td>805 Richard Port\\nEast Mario, VT 18338</td>\n",
+       "      <td>No Answer</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>100 rows × 4 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    id              name                                            address  \\\n",
+       "0   22     Mckenzie Cruz              762 Baker Point\\nPort Kevin, MN 42282   \n",
+       "1   64  Michael Williams         481 Madison Fords\\nNew Donnaview, CO 27959   \n",
+       "2   11      Jeanne Smith                                                NaN   \n",
+       "3   46   Joanna Franklin                                                NaN   \n",
+       "4   82   Daniel Martinez       479 Jean Falls Suite 185\\nDeanbury, WV 72875   \n",
+       "..  ..               ...                                                ...   \n",
+       "95   5     Jack Harrison                   Unit 0802 Box 5382\\nDPO AP 08329   \n",
+       "96  40  Courtney Sanchez  04326 Wallace Circles\\nNorth Anthonybury, IN 8...   \n",
+       "97  56   Thomas Anderson    28821 Clark Drive Apt. 170\\nPort John, CO 44092   \n",
+       "98  76     Julie Flowers     74248 Ball Land Apt. 027\\nPowersfurt, RI 70556   \n",
+       "99  15     Jennifer King             805 Richard Port\\nEast Mario, VT 18338   \n",
+       "\n",
+       "       smoker  \n",
+       "0           N  \n",
+       "1           N  \n",
+       "2           Y  \n",
+       "3   No Answer  \n",
+       "4           N  \n",
+       "..        ...  \n",
+       "95          N  \n",
+       "96          N  \n",
+       "97          N  \n",
+       "98          N  \n",
+       "99  No Answer  \n",
+       "\n",
+       "[100 rows x 4 columns]"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "anon_df"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}