diff --git a/exhibit/core/generate/categorical.py b/exhibit/core/generate/categorical.py index bebb74d..2bdd7ab 100644 --- a/exhibit/core/generate/categorical.py +++ b/exhibit/core/generate/categorical.py @@ -219,6 +219,10 @@ def _generate_anon_series(self, col_name): # ignoring the standard date genderation parameters, like from / to. anon_set = col_attrs.get("anonymising_set", None) + # Users can pass custom functions to generate categorical / date columns + if callable(anon_set): + return self._generate_using_custom_function(col_name, anon_set) + # check if the anonymising set is a SQL statement starting with SELECT # note that for dates, all other parameters, like from / to will be ignored if anon_set is not None and anon_set.strip().upper()[:6] == "SELECT": @@ -501,8 +505,14 @@ def _generate_using_external_table(self, col_name, anon_set): # duplicates in case user didn't specify DISTINC in his SQL query; # the anon_df would typically be from UUIDs that are generated before # categorical columns. + + # self.anon_df is what is generated BEFORE categorical columns, e.g UUID columns if self.anon_df is None or self.anon_df.empty: - existing_data = pd.concat(self.generated_dfs, axis=1) + # self.generated_dfs has cat. 
columns generated BEFORE this particular column + if not self.generated_dfs: #pragma: no cover + existing_data = pd.DataFrame() + else: + existing_data = pd.concat(self.generated_dfs, axis=1) else: existing_data = pd.concat(self.generated_dfs + [self.anon_df], axis=1) @@ -600,3 +610,36 @@ def _generate_using_external_table(self, col_name, anon_set): final_result = final_result.astype("datetime64[ns]") return final_result + + def _generate_using_custom_function(self, col_name, anon_set): + ''' + _summary_ + + Parameters + ---------- + col_name : _type_ + _description_ + anon_set : _type_ + _description_ + ''' + # self.anon_df is what is generated BEFORE categorical columns, e.g UUID columns + if self.anon_df is None or self.anon_df.empty: + # self.generated_dfs has cat. columns generated BEFORE this particular column + if not self.generated_dfs: + existing_data = pd.DataFrame() + else: + existing_data = pd.concat(self.generated_dfs, axis=1) + else: #pragma: no cover + existing_data = pd.concat(self.generated_dfs + [self.anon_df], axis=1) + + if existing_data.empty: + result = pd.Series( + data=[anon_set(pd.Series) for _ in range(self.num_rows)], + name=col_name + ) + return result + + result = existing_data.apply(anon_set, axis=1) + result.name = col_name + + return result diff --git a/exhibit/core/generate/tests/test_categorical.py b/exhibit/core/generate/tests/test_categorical.py index e838d1f..c3e26d5 100644 --- a/exhibit/core/generate/tests/test_categorical.py +++ b/exhibit/core/generate/tests/test_categorical.py @@ -3,6 +3,7 @@ ''' # Standard library imports +import datetime import unittest import tempfile from unittest.mock import Mock, patch @@ -532,6 +533,124 @@ def test_date_column_with_impossible_combination_of_from_to_and_period(self): self.assertWarns(RuntimeWarning, gen.generate) + def test_generate_column_with_custom_function_in_anonymised_set(self): + ''' + This option is only valid for when Exhibit is used as a script. 
For + specification-based generation, use custom ML models. Note that + while numerical weights are respected, probability vectors are not. + ''' + + def _generate_spam(_): + ''' + Basic function to generate menu items in a fictitious bistro. + + Parameters + ---------- + _ : None + the anonymising_set function return one value at a time + and has access to the current row in the DF generated so far. + This argument is mandatory to include, even if it's unused. + + Returns + ---------- + Scalar value + ''' + + rng = np.random.default_rng() + val = rng.choice([ + "Egg and bacon", "Egg, sausage, and bacon", "Egg and Spam", + "Egg, bacon, and Spam", "Egg, bacon, sausage, and Spam", + "Spam, bacon, sausage, and Spam", "Lobster Thermidor", + ]) + + return val + + test_dict = { + "_rng" : np.random.default_rng(seed=0), + "metadata": { + "categorical_columns" : ["menu"], + "inline_limit" : 5, + "id" : "main" + }, + "columns": { + "menu": { + "type": "categorical", + "uniques" : 7, + "original_values" : pd.DataFrame(), + "paired_columns": None, + "anonymising_set" : _generate_spam, + "cross_join_all_unique_values" : False, + }, + } + } + + gen = tm.CategoricalDataGenerator(spec_dict=test_dict, core_rows=50) + result = gen.generate() + + self.assertIsInstance(result, pd.DataFrame) + self.assertEqual(result.shape[0], 50) + + def test_generate_column_with_custom_date_function_in_anonymised_set(self): + ''' + This option is only valid for when Exhibit is used as a script. For + specification-based generation, use custom ML models. Note that + while numerical weights are respected, probability vectors are not. + ''' + + def _increment_date(row): + ''' + Basic function to generate menu items in a fictitious bistro. + + Parameters + ---------- + row : pd.Series + the anonymising_set function return one value at a time + and has access to the current row in the DF generated so far. + This argument is mandatory to include, even if it's unused. 
+ + Returns + ---------- + Scalar value + ''' + rng = np.random.default_rng() + cur_date = row["date"] + new_date = cur_date + datetime.timedelta(days=int(rng.integers(1, 10))) + + return new_date + + test_dict = { + "_rng" : np.random.default_rng(seed=0), + "metadata": { + "date_columns" : ["date", "future_date"], + "inline_limit" : 5, + "id" : "main" + }, + "columns": { + "date": { + "type": "date", + "from": "2023-01-01", + "to" : "2024-01-01", + "uniques" : 50, + "frequency" : "D", + "cross_join_all_unique_values" : False, + }, + "future_date": { + "type": "date", + "from": "2023-01-01", + "to" : "2024-01-01", + "uniques" : 50, + "frequency" : "D", + "cross_join_all_unique_values" : False, + "anonymising_set" : _increment_date + } + } + } + + gen = tm.CategoricalDataGenerator(spec_dict=test_dict, core_rows=50) + result = gen.generate() + + self.assertTrue((result["future_date"] > result["date"]).all()) + if __name__ == "__main__" and __package__ is None: #overwrite __package__ builtin as per PEP 366 __package__ = "exhibit" diff --git a/exhibit/core/generate/weights.py b/exhibit/core/generate/weights.py index 416ff5e..c888a1f 100644 --- a/exhibit/core/generate/weights.py +++ b/exhibit/core/generate/weights.py @@ -180,9 +180,19 @@ def target_columns_for_weights_table(spec_dict): cat_cols = spec_dict["metadata"]["categorical_columns"] #includes linked cat_cols_set = set(cat_cols) - #drop paired columns and regex columns + #drop columns, like(paired / regex columns) that we don't expect to have num. 
weights for cat_col in cat_cols: anon_set = spec_dict["columns"][cat_col]["anonymising_set"] + + # if we're missing original_values, there can be no weights + orig_vals = spec_dict["columns"][cat_col]["original_values"] + if orig_vals is None or (isinstance(orig_vals, pd.DataFrame) and orig_vals.empty): #pragma: no cover + cat_cols_set.remove(cat_col) + continue + + # skip the checks for custom functions + if callable(anon_set): + continue if ( is_paired(spec_dict, cat_col) or # we keep the columns if they are in fixed sets or have custom SQL; diff --git a/exhibit/core/tests/test_spec.py b/exhibit/core/tests/test_spec.py index e716a8e..94b8ad3 100644 --- a/exhibit/core/tests/test_spec.py +++ b/exhibit/core/tests/test_spec.py @@ -147,6 +147,70 @@ def test_categorical_column_initialised_from_list(self): anon_df = exhibit_data.generate() self.assertEqual(anon_df.shape, (100, 1)) + + def test_mix_of_categorical_and_numerical_columns_with_incomplete_weights(self): + ''' + This test covers both categorical and continuous column generation. + + Remember that weights are relative to each other, meaning that if we provide + weights for just one value, it doesn't matter because it has no reference point. + If we provide weights for two values, they will be rescaled to sum to 1, while + other values without weights, will be treated as 1, meaning providing incomplete + weights will lead to smaller values relative to missing values. + ''' + + def _generate_spam(_): + ''' + Basic function to generate menu items in a fictitious bistro. + + Parameters + ---------- + _ : None + the anonymising_set function return one value at a time + and has access to the current row in the DF generated so far. + This argument is mandatory to include, even if it's unused. 
+ + Returns + ---------- + Scalar value + ''' + + rng = np.random.default_rng() + val = rng.choice([ + "Egg and bacon", "Egg, sausage, and bacon", "Egg and Spam", + "Egg, bacon, and Spam", "Egg, bacon, sausage, and Spam", + "Spam, bacon, sausage, and Spam", "Lobster Thermidor", + ]) + + return val + + spec = tm.Spec() + spec_dict = spec.generate() + + spec_dict["metadata"]["number_of_rows"] = 50 + spec_dict["metadata"]["categorical_columns"] = ["menu"] + spec_dict["metadata"]["numerical_columns"] = ["price"] + spec_dict["metadata"]["id"] = "main" + + menu_df = pd.DataFrame(data={ + "menu" : ["Egg and bacon", "Lobster Thermidor", "Missing Data"], + "price": [0.5, 0.5, 0.0] + }) + + spec_dict["columns"]["menu"] = tm.CategoricalColumn("menu", uniques=7, original_values=menu_df, anon_set=_generate_spam) + spec_dict["columns"]["price"] = tm.NumericalColumn(distribution_parameters={"target_sum" : 1000, "dispersion": 0.2}) + + exhibit_data = xbt.Exhibit(command="fromspec", source=spec_dict, output="dataframe") + anon_df = exhibit_data.generate() + + test_items = ["Egg and bacon", "Lobster Thermidor"] + + # check that the average price of the two test items is about half the rest + self.assertAlmostEqual( + anon_df[anon_df["menu"].isin(test_items)]["price"].mean() * 2, + anon_df[~anon_df["menu"].isin(test_items)]["price"].mean(), + delta=3 + ) if __name__ == "__main__" and __package__ is None: #overwrite __package__ builtin as per PEP 366 diff --git a/exhibit/core/validator.py b/exhibit/core/validator.py index 9cbf36b..24a13ad 100644 --- a/exhibit/core/validator.py +++ b/exhibit/core/validator.py @@ -206,6 +206,9 @@ def validate_anonymising_set_length(self, spec_dict=None): attr="anonymising_set", col_names=True, types=["categorical"]): + # ignore anonymising_sets that have custom functions + if callable(v): + return True if v.split(".")[0] in self.fixed_sql_sets: col_uniques = spec_dict["columns"][c]["uniques"] diff --git a/recipes/Create peer groups.ipynb 
b/recipes/Create peer groups.ipynb index 36ab791..faf7ab5 100644 --- a/recipes/Create peer groups.ipynb +++ b/recipes/Create peer groups.ipynb @@ -59,7 +59,7 @@ " \n", " \n", " \n", - " 0\n", + " 0\n", " 2018-12-31\n", " S08000015\n", " NHS Ayrshire & Arran\n", @@ -73,7 +73,7 @@ " 20-29\n", " \n", " \n", - " 1\n", + " 1\n", " 2018-03-31\n", " S08000015\n", " NHS Ayrshire & Arran\n", @@ -87,7 +87,7 @@ " 0-9\n", " \n", " \n", - " 2\n", + " 2\n", " 2018-06-30\n", " S08000015\n", " NHS Ayrshire & Arran\n", @@ -101,7 +101,7 @@ " 0-9\n", " \n", " \n", - " 3\n", + " 3\n", " 2018-09-30\n", " S08000015\n", " NHS Ayrshire & Arran\n", @@ -115,7 +115,7 @@ " 0-9\n", " \n", " \n", - " 4\n", + " 4\n", " 2018-12-31\n", " S08000015\n", " NHS Ayrshire & Arran\n", @@ -358,27 +358,27 @@ " \n", " \n", " \n", - " 0\n", + " 0\n", " Arran War Memorial Hospital\n", " Arran War Memorial Hospital\n", " \n", " \n", - " 1\n", + " 1\n", " Arran War Memorial Hospital\n", " West Glasgow\n", " \n", " \n", - " 2\n", + " 2\n", " Arran War Memorial Hospital\n", " Lorn & Islands Hospital\n", " \n", " \n", - " 3\n", + " 3\n", " NHS Ayrshire & Arran\n", " NHS Ayrshire & Arran\n", " \n", " \n", - " 4\n", + " 4\n", " NHS Ayrshire & Arran\n", " NHS Greater Glasgow & Clyde\n", " \n", @@ -481,7 +481,7 @@ " \n", " \n", " \n", - " 0\n", + " 0\n", " Aberdeen Royal Infirmary\n", " 2018-03-31\n", " All Daycases\n", @@ -491,7 +491,7 @@ " 0\n", " \n", " \n", - " 1\n", + " 1\n", " Aberdeen Royal Infirmary\n", " 2018-03-31\n", " All Daycases\n", @@ -501,7 +501,7 @@ " 0\n", " \n", " \n", - " 2\n", + " 2\n", " Aberdeen Royal Infirmary\n", " 2018-03-31\n", " All Daycases\n", @@ -511,7 +511,7 @@ " 0\n", " \n", " \n", - " 3\n", + " 3\n", " Aberdeen Royal Infirmary\n", " 2018-03-31\n", " All Daycases\n", @@ -521,7 +521,7 @@ " 0\n", " \n", " \n", - " 4\n", + " 4\n", " Aberdeen Royal Infirmary\n", " 2018-03-31\n", " All Daycases\n", @@ -559,7 +559,7 @@ "peer_aggregated_df = (pd\n", " .merge(peer_reference_df, source, 
how=\"left\", left_on=\"peer_location\", right_on=\"loc_name\")\n", " .drop(columns=\"avlos\") \n", - " .groupby([\"peer_group_name\"] + join_cols).sum()\n", + " .groupby([\"peer_group_name\"] + join_cols)[[\"stays\", \"los\"]].sum()\n", " .rename(columns=lambda x: x + \"_peer\")\n", " .reset_index()\n", " .rename(columns={\"peer_group_name\" : \"loc_name\"}))\n", @@ -634,7 +634,7 @@ " \n", " \n", " \n", - " 0\n", + " 0\n", " 2018-12-31\n", " S08000015\n", " NHS Ayrshire & Arran\n", @@ -651,7 +651,7 @@ " 3.333333\n", " \n", " \n", - " 1\n", + " 1\n", " 2018-03-31\n", " S08000015\n", " NHS Ayrshire & Arran\n", @@ -668,7 +668,7 @@ " 2.151188\n", " \n", " \n", - " 2\n", + " 2\n", " 2018-06-30\n", " S08000015\n", " NHS Ayrshire & Arran\n", @@ -685,7 +685,7 @@ " 2.351852\n", " \n", " \n", - " 3\n", + " 3\n", " 2018-09-30\n", " S08000015\n", " NHS Ayrshire & Arran\n", @@ -702,7 +702,7 @@ " 2.494975\n", " \n", " \n", - " 4\n", + " 4\n", " 2018-12-31\n", " S08000015\n", " NHS Ayrshire & Arran\n", @@ -758,9 +758,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Data Analysis [conda env:data]", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "conda-env-data-data" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -772,7 +772,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.10.13" } }, "nbformat": 4, diff --git a/recipes/Using custom functions in anonymising sets.ipynb b/recipes/Using custom functions in anonymising sets.ipynb new file mode 100644 index 0000000..bf61190 --- /dev/null +++ b/recipes/Using custom functions in anonymising sets.ipynb @@ -0,0 +1,464 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "60cfa429-cd74-459f-9be6-6c306b668989", + "metadata": {}, + "outputs": [], + "source": [ + "import datetime\n", + "import pandas as pd\n", + "import numpy as np\n", + "from faker import Faker\n", + "\n", + "from exhibit 
import exhibit as xbt\n", + "from exhibit.core.spec import Spec, UUIDColumn, CategoricalColumn, DateColumn" + ] + }, + { + "cell_type": "markdown", + "id": "f2859c22-ec83-4c7f-8346-4c4990cce9cb", + "metadata": {}, + "source": [ + "### User-defined functions with custom logic to generate categorical and date values\n", + "When using Exhibit as an importable library and building the specification using Python objects, you can pass a custom function as the `anonymising_set` attribute of categorical and date columns. This feature provides a lot of flexibility when it comes to data generation and offers a quick and easy way to augment the dataset without adding custom ML models. Custom functions have only two restrictions: they must accept an argument for the dataset row (even if it's unused) and they must return a single value." + ] + }, + { + "cell_type": "markdown", + "id": "04e7a653-ccf0-4976-a032-eac67eeb31d1", + "metadata": {}, + "source": [ + "#### Basic example using dates" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6bca2837-e61a-4e84-bd9a-0dee71ecd27c", + "metadata": {}, + "outputs": [], + "source": [ + "def increment_date(row):\n", + " '''\n", + " Basic function to increase a date by a random number between 1 and 10\n", + "\n", + " Parameters\n", + " ----------\n", + " row : pd.Series\n", + " the anonymising_set function return one value at a time\n", + " and has access to the current row in the DF generated so far.\n", + " This argument is mandatory to include, even if it's unused.\n", + "\n", + " Returns\n", + " ----------\n", + " Scalar value\n", + " '''\n", + " \n", + " rng = np.random.default_rng()\n", + "\n", + " # note the use of row argument to get the value in the date column that was already in the generated dataset\n", + " # columns are generated in the order they appear (or are added) in the spec so if we reversed the order, we\n", + " # wouldn't have been able to access the \"date\" column.
Similarly, numerical and geo columns are generated after\n", + " # the categorical ones so those values are also not yet available.\n", + " cur_date = row[\"date\"]\n", + " new_date = cur_date + datetime.timedelta(days=int(rng.integers(1, 10)))\n", + "\n", + " return new_date" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "0edff8f0-47e3-4729-b87e-7c220f76016f", + "metadata": {}, + "outputs": [], + "source": [ + "spec = Spec()\n", + "spec_dict = spec.generate()\n", + "\n", + "spec_dict[\"metadata\"][\"number_of_rows\"] = 100\n", + "spec_dict[\"metadata\"][\"date_columns\"] = [\"date\", \"future_date\"]\n", + "spec_dict[\"metadata\"][\"id\"] = \"main\"\n", + "\n", + "spec_dict[\"columns\"][\"date\"] = DateColumn(\"date\", uniques=200, from_date=\"2023-01-01\", cross_join=False)\n", + "spec_dict[\"columns\"][\"future_date\"] = DateColumn(\"future_date\", uniques=200, cross_join=False, anonymising_set=increment_date)\n", + "\n", + "exhibit_data = xbt.Exhibit(command=\"fromspec\", source=spec_dict, output=\"dataframe\")\n", + "anon_df = exhibit_data.generate()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "1ae914dc-8ad0-4f06-b734-33d367eaeb6a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datefuture_date
02023-02-202023-02-28
12023-01-172023-01-20
22023-02-252023-03-04
32023-01-092023-01-14
42023-02-152023-02-21
.........
952023-01-062023-01-13
962023-03-172023-03-21
972023-06-222023-06-27
982023-03-132023-03-20
992023-05-052023-05-09
\n", + "

100 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " date future_date\n", + "0 2023-02-20 2023-02-28\n", + "1 2023-01-17 2023-01-20\n", + "2 2023-02-25 2023-03-04\n", + "3 2023-01-09 2023-01-14\n", + "4 2023-02-15 2023-02-21\n", + ".. ... ...\n", + "95 2023-01-06 2023-01-13\n", + "96 2023-03-17 2023-03-21\n", + "97 2023-06-22 2023-06-27\n", + "98 2023-03-13 2023-03-20\n", + "99 2023-05-05 2023-05-09\n", + "\n", + "[100 rows x 2 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "anon_df" + ] + }, + { + "cell_type": "markdown", + "id": "9c19d01f-d578-4945-a8a7-50e05b502083", + "metadata": {}, + "source": [ + "#### Using external libraries to generate realistic data\n", + "Faker is a well-known library with fake data used for testing purposes. It has a number of providers and fake datasets. In this example, we'll use Faker to generate the name and address details and augment them with a unique ID and a smoker attribute using Exhibit." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7e0eb2de-53a4-4bb2-82d2-725a65b11ee3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Arthur Washington'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fake = Faker()\n", + "fake.name()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e021381a-9431-4315-8d44-0f89150d0fa9", + "metadata": {}, + "outputs": [], + "source": [ + "# rememeber that we need to include a function argument for the dataset row, even if it's unused.\n", + "# under the hood, Exhibit will attempt to pass the function to Pandas' apply so if you don't include\n", + "# a placeholder argument, you will get an error.\n", + "def fake_name(_):\n", + " return fake.name()\n", + "\n", + "def fake_address(_):\n", + " return fake.address()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a9349232-1076-4788-99ff-0d6499cf7721", + "metadata": 
{}, + "outputs": [], + "source": [ + "spec = Spec()\n", + "spec_dict = spec.generate()\n", + "\n", + "spec_dict[\"metadata\"][\"number_of_rows\"] = 100\n", + "spec_dict[\"metadata\"][\"uuid_columns\"] = [\"id\"]\n", + "spec_dict[\"metadata\"][\"categorical_columns\"] = [\"name\", \"address\", \"smoker\"]\n", + "spec_dict[\"metadata\"][\"id\"] = \"main\"\n", + "\n", + "smoker_data = pd.DataFrame(data={\n", + " \"smoker\": [\"Y\", \"N\", \"No Answer\", \"Missing Data\"],\n", + " \"probability_vector\": [0.2, 0.7, 0.1, 0]\n", + "})\n", + "\n", + "spec_dict[\"columns\"][\"id\"] = UUIDColumn(anon_set=\"range\")\n", + "spec_dict[\"columns\"][\"name\"] = CategoricalColumn(\"name\", uniques=100, original_values=None, anon_set=fake_name)\n", + "spec_dict[\"columns\"][\"address\"] = CategoricalColumn(\"address\", uniques=100, original_values=None, anon_set=fake_address, miss_proba=0.1)\n", + "spec_dict[\"columns\"][\"smoker\"] = CategoricalColumn(\"smoker\", uniques=3, original_values=smoker_data)\n", + "\n", + "exhibit_data = xbt.Exhibit(command=\"fromspec\", source=spec_dict, output=\"dataframe\")\n", + "anon_df = exhibit_data.generate()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "587f7fa3-b648-44f3-8b8a-14b5075eded2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnameaddresssmoker
022Mckenzie Cruz762 Baker Point\\nPort Kevin, MN 42282N
164Michael Williams481 Madison Fords\\nNew Donnaview, CO 27959N
211Jeanne SmithNaNY
346Joanna FranklinNaNNo Answer
482Daniel Martinez479 Jean Falls Suite 185\\nDeanbury, WV 72875N
...............
955Jack HarrisonUnit 0802 Box 5382\\nDPO AP 08329N
9640Courtney Sanchez04326 Wallace Circles\\nNorth Anthonybury, IN 8...N
9756Thomas Anderson28821 Clark Drive Apt. 170\\nPort John, CO 44092N
9876Julie Flowers74248 Ball Land Apt. 027\\nPowersfurt, RI 70556N
9915Jennifer King805 Richard Port\\nEast Mario, VT 18338No Answer
\n", + "

100 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " id name address \\\n", + "0 22 Mckenzie Cruz 762 Baker Point\\nPort Kevin, MN 42282 \n", + "1 64 Michael Williams 481 Madison Fords\\nNew Donnaview, CO 27959 \n", + "2 11 Jeanne Smith NaN \n", + "3 46 Joanna Franklin NaN \n", + "4 82 Daniel Martinez 479 Jean Falls Suite 185\\nDeanbury, WV 72875 \n", + ".. .. ... ... \n", + "95 5 Jack Harrison Unit 0802 Box 5382\\nDPO AP 08329 \n", + "96 40 Courtney Sanchez 04326 Wallace Circles\\nNorth Anthonybury, IN 8... \n", + "97 56 Thomas Anderson 28821 Clark Drive Apt. 170\\nPort John, CO 44092 \n", + "98 76 Julie Flowers 74248 Ball Land Apt. 027\\nPowersfurt, RI 70556 \n", + "99 15 Jennifer King 805 Richard Port\\nEast Mario, VT 18338 \n", + "\n", + " smoker \n", + "0 N \n", + "1 N \n", + "2 Y \n", + "3 No Answer \n", + "4 N \n", + ".. ... \n", + "95 N \n", + "96 N \n", + "97 N \n", + "98 N \n", + "99 No Answer \n", + "\n", + "[100 rows x 4 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "anon_df" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}