diff --git a/notebooks/2.1_sandbox-match_households.ipynb b/notebooks/2.1_sandbox-match_households.ipynb
new file mode 100644
index 0000000..7333a8f
--- /dev/null
+++ b/notebooks/2.1_sandbox-match_households.ipynb
@@ -0,0 +1,3965 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Adding activity chains to synthetic populations \n",
+ "\n",
+ "The purpose of this script is to test different approaches to matching households in the synthetic population to a household from the [National Travel Survey (NTS)](https://beta.ukdataservice.ac.uk/datacatalogue/studies/study?id=5340). \n",
+ "\n",
+ "### Methods\n",
+ "\n",
+ "1. categorical matching: joining on relevant socio-demographic variables\n",
+ "2. statistical matching, as described in [An unconstrained statistical matching algorithm for combining individual and household level geo-specific census and survey data](https://doi.org/10.1016/j.compenvurbsys.2016.11.003). "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "\n",
+ "from acbm.preprocessing import (\n",
+ " count_per_group,\n",
+ " match_coverage_col,\n",
+ " nts_filter_by_region,\n",
+ " nts_filter_by_year,\n",
+ " num_adult_child_hh,\n",
+ " transform_by_group,\n",
+ " truncate_values,\n",
+ ")\n",
+ "\n",
+ "pd.set_option('display.max_columns', None)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Step 1: Load in the datasets "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### SPC "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# useful variables\n",
+ "region = \"west-yorkshire\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " household | \n",
+ " workplace | \n",
+ " location | \n",
+ " events | \n",
+ " weekday_diaries | \n",
+ " weekend_diaries | \n",
+ " orig_pid | \n",
+ " id_tus_hh | \n",
+ " id_tus_p | \n",
+ " pid_hs | \n",
+ " msoa | \n",
+ " oa | \n",
+ " members | \n",
+ " bmi | \n",
+ " has_cardiovascular_disease | \n",
+ " has_diabetes | \n",
+ " has_high_blood_pressure | \n",
+ " number_medications | \n",
+ " self_assessed_health | \n",
+ " life_satisfaction | \n",
+ " sic1d2007 | \n",
+ " sic2d2007 | \n",
+ " soc2010 | \n",
+ " pwkstat | \n",
+ " salary_yearly | \n",
+ " salary_hourly | \n",
+ " hid | \n",
+ " accommodation_type | \n",
+ " communal_type | \n",
+ " num_rooms | \n",
+ " central_heat | \n",
+ " tenure | \n",
+ " num_cars | \n",
+ " sex | \n",
+ " age_years | \n",
+ " ethnicity | \n",
+ " nssec8 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " {'x': -1.7892179489135742, 'y': 53.91915130615... | \n",
+ " {'concert_f': 1.2791347489984115e-31, 'concert... | \n",
+ " [1583, 13161] | \n",
+ " [1582, 13160] | \n",
+ " E02002183_0001_001 | \n",
+ " 11291218 | \n",
+ " 1 | \n",
+ " 2905399 | \n",
+ " E02002183 | \n",
+ " E00053954 | \n",
+ " [0] | \n",
+ " 24.879356 | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " NaN | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ " J | \n",
+ " 58.0 | \n",
+ " 1115.0 | \n",
+ " 6 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " E02002183_0001 | \n",
+ " 1.0 | \n",
+ " NaN | \n",
+ " 2.0 | \n",
+ " True | \n",
+ " 2.0 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 86 | \n",
+ " 1 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " {'x': -1.8262380361557007, 'y': 53.92028045654... | \n",
+ " {'concert_f': 9.743248151956307e-21, 'concert_... | \n",
+ " [2900, 4948, 4972, 7424, 10284, 10586, 12199, ... | \n",
+ " [2901, 4949, 4973, 7425, 10285, 10585, 12198, ... | \n",
+ " E02002183_0002_001 | \n",
+ " 17291219 | \n",
+ " 1 | \n",
+ " 2905308 | \n",
+ " E02002183 | \n",
+ " E00053953 | \n",
+ " [1, 2] | \n",
+ " 27.491207 | \n",
+ " False | \n",
+ " False | \n",
+ " True | \n",
+ " NaN | \n",
+ " 3.0 | \n",
+ " NaN | \n",
+ " C | \n",
+ " 25.0 | \n",
+ " 1121.0 | \n",
+ " 6 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " E02002183_0002 | \n",
+ " 3.0 | \n",
+ " NaN | \n",
+ " 6.0 | \n",
+ " True | \n",
+ " 2.0 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 74 | \n",
+ " 3 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " {'x': -1.8262380361557007, 'y': 53.92028045654... | \n",
+ " {'concert_f': 8.46716103992468e-16, 'concert_f... | \n",
+ " [3010, 6389, 9448, 10184, 11598] | \n",
+ " [3011, 6388, 9447, 10183, 11599] | \n",
+ " E02002183_0002_002 | \n",
+ " 17070713 | \n",
+ " 2 | \n",
+ " 2907681 | \n",
+ " E02002183 | \n",
+ " E00053953 | \n",
+ " [1, 2] | \n",
+ " 17.310829 | \n",
+ " False | \n",
+ " True | \n",
+ " True | \n",
+ " NaN | \n",
+ " 2.0 | \n",
+ " 4.0 | \n",
+ " P | \n",
+ " 85.0 | \n",
+ " 2311.0 | \n",
+ " 6 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " E02002183_0002 | \n",
+ " 3.0 | \n",
+ " NaN | \n",
+ " 6.0 | \n",
+ " True | \n",
+ " 2.0 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 68 | \n",
+ " 1 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 56126.0 | \n",
+ " {'x': -1.8749940395355225, 'y': 53.94298934936... | \n",
+ " {'concert_f': 1.8844366073608398, 'concert_fs'... | \n",
+ " [366, 867, 2096, 3678, 5212, 5450, 8145, 9254,... | \n",
+ " [365, 868, 2097, 3677, 5213, 5451, 8146, 9253,... | \n",
+ " E02002183_0003_001 | \n",
+ " 20310313 | \n",
+ " 1 | \n",
+ " 2902817 | \n",
+ " E02002183 | \n",
+ " E00053689 | \n",
+ " [3, 4] | \n",
+ " 20.852091 | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " NaN | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " C | \n",
+ " 31.0 | \n",
+ " 3422.0 | \n",
+ " 1 | \n",
+ " 32857.859375 | \n",
+ " 14.360952 | \n",
+ " E02002183_0003 | \n",
+ " 3.0 | \n",
+ " NaN | \n",
+ " 6.0 | \n",
+ " True | \n",
+ " 2.0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 27 | \n",
+ " 1 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 2 | \n",
+ " NaN | \n",
+ " {'x': -1.8749940395355225, 'y': 53.94298934936... | \n",
+ " {'concert_f': 4.877435207366943, 'concert_fs':... | \n",
+ " [1289, 12528, 12870] | \n",
+ " [1288, 12529, 12871] | \n",
+ " E02002183_0003_002 | \n",
+ " 13010909 | \n",
+ " 3 | \n",
+ " 2900884 | \n",
+ " E02002183 | \n",
+ " E00053689 | \n",
+ " [3, 4] | \n",
+ " 20.032526 | \n",
+ " False | \n",
+ " False | \n",
+ " False | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 3.0 | \n",
+ " J | \n",
+ " 62.0 | \n",
+ " 7214.0 | \n",
+ " 1 | \n",
+ " 18162.451172 | \n",
+ " 9.439944 | \n",
+ " E02002183_0003 | \n",
+ " 3.0 | \n",
+ " NaN | \n",
+ " 6.0 | \n",
+ " True | \n",
+ " 2.0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 26 | \n",
+ " 1 | \n",
+ " 6.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id household workplace \\\n",
+ "0 0 0 NaN \n",
+ "1 1 1 NaN \n",
+ "2 2 1 NaN \n",
+ "3 3 2 56126.0 \n",
+ "4 4 2 NaN \n",
+ "\n",
+ " location \\\n",
+ "0 {'x': -1.7892179489135742, 'y': 53.91915130615... \n",
+ "1 {'x': -1.8262380361557007, 'y': 53.92028045654... \n",
+ "2 {'x': -1.8262380361557007, 'y': 53.92028045654... \n",
+ "3 {'x': -1.8749940395355225, 'y': 53.94298934936... \n",
+ "4 {'x': -1.8749940395355225, 'y': 53.94298934936... \n",
+ "\n",
+ " events \\\n",
+ "0 {'concert_f': 1.2791347489984115e-31, 'concert... \n",
+ "1 {'concert_f': 9.743248151956307e-21, 'concert_... \n",
+ "2 {'concert_f': 8.46716103992468e-16, 'concert_f... \n",
+ "3 {'concert_f': 1.8844366073608398, 'concert_fs'... \n",
+ "4 {'concert_f': 4.877435207366943, 'concert_fs':... \n",
+ "\n",
+ " weekday_diaries \\\n",
+ "0 [1583, 13161] \n",
+ "1 [2900, 4948, 4972, 7424, 10284, 10586, 12199, ... \n",
+ "2 [3010, 6389, 9448, 10184, 11598] \n",
+ "3 [366, 867, 2096, 3678, 5212, 5450, 8145, 9254,... \n",
+ "4 [1289, 12528, 12870] \n",
+ "\n",
+ " weekend_diaries orig_pid \\\n",
+ "0 [1582, 13160] E02002183_0001_001 \n",
+ "1 [2901, 4949, 4973, 7425, 10285, 10585, 12198, ... E02002183_0002_001 \n",
+ "2 [3011, 6388, 9447, 10183, 11599] E02002183_0002_002 \n",
+ "3 [365, 868, 2097, 3677, 5213, 5451, 8146, 9253,... E02002183_0003_001 \n",
+ "4 [1288, 12529, 12871] E02002183_0003_002 \n",
+ "\n",
+ " id_tus_hh id_tus_p pid_hs msoa oa members bmi \\\n",
+ "0 11291218 1 2905399 E02002183 E00053954 [0] 24.879356 \n",
+ "1 17291219 1 2905308 E02002183 E00053953 [1, 2] 27.491207 \n",
+ "2 17070713 2 2907681 E02002183 E00053953 [1, 2] 17.310829 \n",
+ "3 20310313 1 2902817 E02002183 E00053689 [3, 4] 20.852091 \n",
+ "4 13010909 3 2900884 E02002183 E00053689 [3, 4] 20.032526 \n",
+ "\n",
+ " has_cardiovascular_disease has_diabetes has_high_blood_pressure \\\n",
+ "0 False False False \n",
+ "1 False False True \n",
+ "2 False True True \n",
+ "3 False False False \n",
+ "4 False False False \n",
+ "\n",
+ " number_medications self_assessed_health life_satisfaction sic1d2007 \\\n",
+ "0 NaN 3.0 2.0 J \n",
+ "1 NaN 3.0 NaN C \n",
+ "2 NaN 2.0 4.0 P \n",
+ "3 NaN 2.0 1.0 C \n",
+ "4 1.0 2.0 3.0 J \n",
+ "\n",
+ " sic2d2007 soc2010 pwkstat salary_yearly salary_hourly hid \\\n",
+ "0 58.0 1115.0 6 NaN NaN E02002183_0001 \n",
+ "1 25.0 1121.0 6 NaN NaN E02002183_0002 \n",
+ "2 85.0 2311.0 6 NaN NaN E02002183_0002 \n",
+ "3 31.0 3422.0 1 32857.859375 14.360952 E02002183_0003 \n",
+ "4 62.0 7214.0 1 18162.451172 9.439944 E02002183_0003 \n",
+ "\n",
+ " accommodation_type communal_type num_rooms central_heat tenure \\\n",
+ "0 1.0 NaN 2.0 True 2.0 \n",
+ "1 3.0 NaN 6.0 True 2.0 \n",
+ "2 3.0 NaN 6.0 True 2.0 \n",
+ "3 3.0 NaN 6.0 True 2.0 \n",
+ "4 3.0 NaN 6.0 True 2.0 \n",
+ "\n",
+ " num_cars sex age_years ethnicity nssec8 \n",
+ "0 2 1 86 1 1.0 \n",
+ "1 2 1 74 3 1.0 \n",
+ "2 2 2 68 1 2.0 \n",
+ "3 1 1 27 1 4.0 \n",
+ "4 1 2 26 1 6.0 "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Read in the spc data (parquet format)\n",
+ "spc = pd.read_parquet('../data/external/spc_output/' + region + '_people_hh.parquet')\n",
+ "spc.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# select columns\n",
+ "spc = spc[['id', 'household', 'location', 'pid_hs',\n",
+ " 'msoa', 'oa', 'members', 'sic1d2007', 'sic2d2007',\n",
+ " 'pwkstat', 'salary_yearly', 'salary_hourly', 'hid',\n",
+ " 'accommodation_type', 'communal_type', 'num_rooms', 'central_heat',\n",
+ " 'tenure', 'num_cars', 'sex', 'age_years', 'ethnicity', 'nssec8']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# temporary reduction of the dataset for quick analysis\n",
+ "spc = spc.head(50000)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### NTS\n",
+ "\n",
+ "The NTS is split up into multiple tables. We will load in the following tables:\n",
+ "- individuals\n",
+ "- households\n",
+ "- trips"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "path_psu = \"../data/external/nts/UKDA-5340-tab/tab/psu_eul_2002-2022.tab\"\n",
+ "psu = pd.read_csv(path_psu, sep=\"\\t\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Individuals"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "path_individuals = \"../data/external/nts/UKDA-5340-tab/tab/individual_eul_2002-2022.tab\"\n",
+ "nts_individuals = pd.read_csv(path_individuals,\n",
+ " sep=\"\\t\",\n",
+ " usecols = ['IndividualID',\n",
+ " 'HouseholdID',\n",
+ " 'PSUID',\n",
+ " 'Age_B01ID',\n",
+ " 'Age_B04ID',\n",
+ " 'Sex_B01ID',\n",
+ " 'OfPenAge_B01ID',\n",
+ " 'HRPRelation_B01ID',\n",
+ " 'EdAttn1_B01ID',\n",
+ " 'EdAttn2_B01ID',\n",
+ " 'EdAttn3_B01ID',\n",
+ " 'OwnCycle_B01ID', # Owns a cycle\n",
+ " 'DrivLic_B02ID', # type of driving license\n",
+ " 'CarAccess_B01ID',\n",
+ " 'IndIncome2002_B02ID',\n",
+ " 'IndWkGOR_B02ID', # Region of usual place of work\n",
+ " 'EcoStat_B02ID', # Working status of individual\n",
+ " 'EcoStat_B03ID',\n",
+ " 'NSSec_B03ID', # NSSEC high level breakdown\n",
+ " 'SC_B01ID', # Social class of individual\n",
+ " 'Stat_B01ID', # employee or self-employed\n",
+ " 'WkMode_B01ID', # Usual means of travel to work\n",
+ " 'WkHome_B01ID', # Work from home\n",
+ " 'PossHom_B01ID', # Is it possible to work from home?\n",
+ " 'OftHome_B01ID', # How often work from home\n",
+ " 'TravSh_B01ID', # Usual mode from main food shopping trip\n",
+ " 'SchDly_B01ID', # Daily school journey?\n",
+ " 'SchTrav_B01ID', # Usual mode of travel to school\n",
+ " 'SchAcc_B01ID', # IS school trip accompanied by an adult?\n",
+ " 'FdShp_B01ID', # How do you usually carry out main food shop (go to shop, online etc)\n",
+ " ]\n",
+ ")\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Households"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "path_households = \"../data/external/nts/UKDA-5340-tab/tab/household_eul_2002-2022.tab\"\n",
+ "nts_households = pd.read_csv(path_households,\n",
+ " sep=\"\\t\",\n",
+ " usecols = ['HouseholdID',\n",
+ " 'PSUID',\n",
+ " 'HHIncome2002_B02ID',\n",
+ " 'AddressType_B01ID', # type of house\n",
+ " 'Ten1_B02ID', # type of tenure\n",
+ " 'HHoldNumAdults', # total no. of adults in household\n",
+ " 'HHoldNumChildren', # total no. of children in household\n",
+ " 'HHoldNumPeople', # total no. of people in household\n",
+ " 'NumLicHolders', # total no. of driving license holders in household\n",
+ " 'HHoldEmploy_B01ID', # number of employed in household\n",
+ " 'NumBike', # no. of bikes\n",
+ " 'NumCar', # no. of cars\n",
+ " 'NumVanLorry', # no. of vans or lorries\n",
+ " 'NumMCycle', # no. of motorcycles\n",
+ " 'WalkBus_B01ID', # walk time from house to nearest bus stop\n",
+ " 'Getbus_B01ID', # frequency of bus service\n",
+ " 'WalkRail_B01ID', # walk time from house to nearest rail station\n",
+ " 'JTimeHosp_B01ID', # journey time to nearest hospital\n",
+ " 'DVShop_B01ID', # person no. for main food shopper in hh\n",
+ " 'Settlement2011EW_B03ID', # ONS Urban/Rural: 2 categories\n",
+ " 'Settlement2011EW_B04ID', # ONS Urban/Rural: 3 categories\n",
+ " 'HHoldOAClass2011_B03ID', # Census 2011 OA Classification\n",
+ " 'HRPWorkStat_B02ID', # HH ref person working status\n",
+ " 'HRPSEGWorkStat_B01ID', # HH ref person socio economic group for active workers\n",
+ " 'W0', # Unweighted interview sample\n",
+ " 'W1', # Unweighted diary sample\n",
+ " 'W2', # Weighted diary sample\n",
+ " 'W3', # Weighted interview sample\n",
+ " ]\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Trips"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "path_trips = \"../data/external/nts/UKDA-5340-tab/tab/trip_eul_2002-2022.tab\"\n",
+ "nts_trips = pd.read_csv(path_trips,\n",
+ " sep=\"\\t\",\n",
+ " usecols = ['TripID',\n",
+ " 'DayID',\n",
+ " 'IndividualID',\n",
+ " 'HouseholdID',\n",
+ " 'PSUID',\n",
+ " 'PersNo',\n",
+ " 'TravDay',\n",
+ " 'JourSeq',\n",
+ " 'ShortWalkTrip_B01ID',\n",
+ " 'NumStages',\n",
+ " 'MainMode_B03ID',\n",
+ " 'MainMode_B04ID',\n",
+ " 'TripPurpFrom_B01ID',\n",
+ " 'TripPurpTo_B01ID',\n",
+ " 'TripPurpose_B04ID',\n",
+ " 'TripStart',\n",
+ " 'TripEnd',\n",
+ " 'TripTotalTime',\n",
+ " 'TripTravTime',\n",
+ " 'TripDisIncSW',\n",
+ " 'TripDisExSW',\n",
+ " 'TripOrigGOR_B02ID',\n",
+ " 'TripDestGOR_B02ID',\n",
+ " 'W5',\n",
+ " 'W5xHH'\n",
+ " ]\n",
+ " )\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Filter by year\n",
+ "\n",
+ "We will filter the NTS data to only include data from specific years. We can choose only 1 year, or multiple years to increase our sample size and the likelihood of a match with the SPC."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "years = [2019, 2021, 2022]\n",
+ "\n",
+ "nts_individuals = nts_filter_by_year(nts_individuals, psu, years)\n",
+ "nts_households = nts_filter_by_year(nts_households, psu, years)\n",
+ "nts_trips = nts_filter_by_year(nts_trips, psu, years)\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Filter by geography \n",
+ "\n",
+ "I will not do this for categorical matching, as it reduces the sample significantly, and leads to more spc households not being matched"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# regions = ['Yorkshire and the Humber', 'North West']\n",
+ "\n",
+ "# nts_individuals = nts_filter_by_region(nts_individuals, psu, regions)\n",
+ "# nts_households = nts_filter_by_region(nts_households, psu, regions)\n",
+ "# nts_trips = nts_filter_by_region(nts_trips, psu, regions)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Create dictionaries of key value pairs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "'''\n",
+ "guide to the dictionaries:\n",
+ "\n",
+ "_nts_hh: from NTS households table\n",
+ "_nts_ind: from NTS individuals table\n",
+ "_spc: from SPC\n",
+ "\n",
+ "'''\n",
+ "\n",
+ "\n",
+ "# ---------- NTS\n",
+ "\n",
+ "# Create a dictionary for the HHIncome2002_B02ID column\n",
+ "income_dict_nts_hh = {\n",
+ " '1': '0-25k',\n",
+ " '2': '25k-50k',\n",
+ " '3': '50k+',\n",
+ " '-8': 'NA',\n",
+ " # key is -1 rather than the expected -10 (the code used for DEAD in the other columns);\n",
+ " # this could be a typo in household_eul_2002-2022_ukda_data_dictionary\n",
+ " '-1': 'DEAD'\n",
+ "}\n",
+ "\n",
+ "# Create a dictionary for the HHoldEmploy_B01ID column\n",
+ "# (PT: Part time, FT: Full time)\n",
+ "employment_dict_nts_hh = {\n",
+ " '1': 'None',\n",
+ " '2': '0 FT, 1 PT',\n",
+ " '3': '1 FT, 0 PT',\n",
+ " '4': '0 FT, 2 PT',\n",
+ " '5': '1 FT, 1 PT',\n",
+ " '6': '2 FT, 0 PT',\n",
+ " '7': '1 FT, 2+ PT',\n",
+ " '8': '2 FT, 1+ PT',\n",
+ " '9': '0 FT, 3+ PT',\n",
+ " '10': '3+ FT, 0 PT',\n",
+ " '11': '3+ FT, 1+ PT',\n",
+ " '-8': 'NA',\n",
+ " '-10': 'DEAD'\n",
+ "}\n",
+ "\n",
+ "# Create a dictionary for the Ten1_B02ID column\n",
+ "tenure_dict_nts_hh = {\n",
+ " '1': 'Owns / buying',\n",
+ " '2': 'Rents',\n",
+ " '3': 'Other (including rent free)',\n",
+ " '-8': 'NA',\n",
+ " '-9': 'DNA',\n",
+ " '-10': 'DEAD'\n",
+ "}\n",
+ "\n",
+ "\n",
+ "# ---------- SPC\n",
+ "\n",
+ "\n",
+ "# create a dictionary for the pwkstat column\n",
+ "employment_dict_spc = {\n",
+ " '0': 'Not applicable (age < 16)',\n",
+ " '1': 'Employee FT',\n",
+ " '2': 'Employee PT',\n",
+ " '3': 'Employee unspecified',\n",
+ " '4': 'Self-employed',\n",
+ " '5': 'Unemployed',\n",
+ " '6': 'Retired',\n",
+ " '7': 'Homemaker/Maternal leave',\n",
+ " '8': 'Student',\n",
+ " '9': 'Long term sickness/disability',\n",
+ " '10': 'Other'\n",
+ "}\n",
+ "\n",
+ "\n",
+ "# Create a dictionary for the tenure column\n",
+ "tenure_dict_spc = {\n",
+ " '1': 'Owned: Owned outright',\n",
+ " '2': 'Owned: Owned with a mortgage or loan or shared ownership',\n",
+ " '3': 'Rented or living rent free: Total',\n",
+ " '4': 'Rented: Social rented',\n",
+ " '5': 'Rented: Private rented or living rent free',\n",
+ " '-8': 'NA',\n",
+ " '-9': 'DNA',\n",
+ " '-10': 'DEAD'\n",
+ "}\n",
+ "\n",
+ "\n",
+ "# Combine the dictionaries into a dictionary of dictionaries\n",
+ "\n",
+ "dict_nts = {\n",
+ " 'HHIncome2002_B02ID': income_dict_nts_hh,\n",
+ " 'HHoldEmploy_B01ID': employment_dict_nts_hh,\n",
+ " 'Ten1_B02ID': tenure_dict_nts_hh\n",
+ "}\n",
+ "\n",
+ "dict_spc = {\n",
+ " 'pwkstat': employment_dict_spc,\n",
+ " 'tenure': tenure_dict_spc\n",
+ "}\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Step 2: Decide on matching variables \n",
+ "\n",
+ "We need to identify the socio-demographic characteristics that we will match on. The schema for the synthetic population can be found [here](https://github.com/alan-turing-institute/uatk-spc/blob/main/synthpop.proto). \n",
+ "\n",
+ "Matching between the SPC and the NTS will happen in two steps: \n",
+ "\n",
+ "1. Match at the household level\n",
+ "2. Match individuals within the household\n",
+ "\n",
+ "### Household level matching \n",
+ "\n",
+ "| Variable | Name (NTS) | Name (SPC) | Transformation (NTS) | Transformation (SPC) |\n",
+ "| ------------------ | -------------------- | --------------- | -------------------- | -------------------- |\n",
+ "| Household income | `HHIncome2002_B02ID` | `salary_yearly` | NA | Group by household ID and sum |\n",
+ "| Number of adults | `HHoldNumAdults` | `age_years` | NA | Group by household ID and count |\n",
+ "| Number of children | `HHoldNumChildren` | `age_years` | NA | Group by household ID and count |\n",
+ "| Employment status | `HHoldEmploy_B01ID` | `pwkstat` | NA | a) match to NTS categories. b) group by household ID |\n",
+ "| Car ownership | `NumCar` | `num_cars` | SPC is capped at 2. We change all entries > 2 to 2 | NA |\n",
+ "\n",
+ "Other columns to match in the future\n",
+ "| Variable | Name (NTS) | Name (SPC) | Transformation (NTS) | Transformation (SPC) |\n",
+ "| ------------------ | -------------------- | --------------- | -------------------- | -------------------- |\n",
+ "| Type of tenancy | `Ten1_B02ID` | `tenure` | ?? | ?? |\n",
+ "| Urban-Rural classification of residence | `Settlement2011EW_B04ID` | NA | NA | Spatial join between [layer](https://www.gov.uk/government/collections/rural-urban-classification) and SPC |\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2.1 Edit SPC columns "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Household Income\n",
+ "\n",
+ "Edit the spc so that we have household income as well as individual income."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# add household income column for SPC\n",
+ "spc_edited = transform_by_group(data = spc,\n",
+ " group_col = 'household',\n",
+ " transform_col = 'salary_yearly',\n",
+ " new_col = 'salary_yearly_hh',\n",
+ " transformation_type = 'sum')\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Check number of individuals and households with reported salaries"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Individuals in SPC = 50000\n",
+ "Individuals without reported income = 30056\n",
+ "% of individuals with reported income = 39.9\n",
+ "Individuals with reported income: 0 = 0\n",
+ "Households in SPC = 21569\n",
+ "Households without reported income = 14653\n",
+ "% of households with reported income = 67.9\n",
+ "Households with reported income: 0 = 14653\n"
+ ]
+ }
+ ],
+ "source": [
+ "# One value per household. salary_yearly_hh is assumed to be constant within a\n",
+ "# household (it is produced by transform_by_group with 'sum') - TODO confirm\n",
+ "salary_hh = spc_edited.drop_duplicates(subset='household')['salary_yearly_hh']\n",
+ "\n",
+ "# histogram for individuals and households (include NAs as 0)\n",
+ "fig, ax = plt.subplots(1, 2, figsize=(12, 6), sharey=True)\n",
+ "ax[0].hist(spc_edited['salary_yearly'].fillna(0), bins=30)\n",
+ "ax[0].set_title('Salary yearly (Individuals)')\n",
+ "ax[0].set_xlabel('Salary yearly')\n",
+ "ax[0].set_ylabel('Frequency')\n",
+ "# use the per-household series so that each household is counted once\n",
+ "ax[1].hist(salary_hh.fillna(0), bins=30)\n",
+ "ax[1].set_title('Salary yearly (Households)')\n",
+ "ax[1].set_xlabel('Salary yearly')\n",
+ "plt.show()\n",
+ "\n",
+ "\n",
+ "# statistics\n",
+ "\n",
+ "# total number of individuals (rows) in the SPC\n",
+ "print(\"Individuals in SPC =\", spc_edited.shape[0])\n",
+ "# number of individuals without reported income\n",
+ "print(\"Individuals without reported income =\", spc_edited['salary_yearly'].isna().sum())\n",
+ "# % of individuals with reported income (salary_yearly not equal NA)\n",
+ "print(\"% of individuals with reported income =\", round((spc_edited['salary_yearly'].count() / spc_edited.shape[0]) * 100, 1))\n",
+ "print(\"Individuals with reported income: 0 =\", spc_edited[spc_edited['salary_yearly'] == 0].shape[0])\n",
+ "\n",
+ "\n",
+ "# household-level figures are computed on salary_hh (one value per household),\n",
+ "# not on the person-level rows of spc_edited\n",
+ "n_households = spc_edited['household'].nunique()\n",
+ "n_households_zero_income = int((salary_hh == 0).sum())\n",
+ "# print the total number of households\n",
+ "print(\"Households in SPC =\", n_households)\n",
+ "# number of households without reported income (salary_yearly_hh = 0)\n",
+ "print(\"Households without reported income =\", n_households_zero_income)\n",
+ "# % of households with reported income (salary_yearly_hh not equal 0)\n",
+ "print(\"% of households with reported income =\", round(((n_households - n_households_zero_income) / n_households) * 100, 1))\n",
+ "print(\"Households with reported income: 0 =\", n_households_zero_income)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# --- Recode column so that it matches the reported NTS values (Use income_dict_nts_hh dictionary for reference)\n",
+ "\n",
+ "# Bin edges for the NTS bands: 1 = [0, 25k), 2 = [25k, 50k), 3 = [50k, inf).\n",
+ "# right=False closes every bin on the left, so fractional totals such as 24999.50\n",
+ "# stay in band 1 (with edges 24999 / 49999 they fell into the next band up)\n",
+ "bins = [0, 25000, 50000, np.inf]\n",
+ "# Define the labels for the bins\n",
+ "labels = [1, 2, 3]\n",
+ "\n",
+ "spc_edited = spc_edited.copy()\n",
+ "\n",
+ "# NOTE(review): a household total of exactly 0 is placed in band 1, not -8 - confirm this is intended\n",
+ "spc_edited['salary_yearly_hh_cat'] = (pd.cut(spc_edited['salary_yearly_hh'], bins=bins, labels=labels, right=False)\n",
+ " .astype('str')\n",
+ " .astype('float'))\n",
+ "\n",
+ "\n",
+ "# replace NA values with -8 (to be consistent with NTS)\n",
+ "spc_edited['salary_yearly_hh_cat'] = spc_edited['salary_yearly_hh_cat'].fillna(-8)\n",
+ "\n",
+ "# Convert the column to int\n",
+ "spc_edited['salary_yearly_hh_cat'] = spc_edited['salary_yearly_hh_cat'].astype('int')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If we compare household income from the SPC and the NTS, we find that the SPC has many more households with no reported income (-8). This will create an issue when matching using household income"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# bar plots showing spc_edited.salary_yearly_hh_cat and nts_households.HHIncome2002_B02ID side by side\n",
+ "\n",
+ "# spc_edited has one row per person: keep one row per household so that the SPC\n",
+ "# bars count households (as the axis label says) rather than individuals\n",
+ "spc_income_cat = spc_edited.drop_duplicates(subset='household')['salary_yearly_hh_cat']\n",
+ "nts_income_cat = nts_households['HHIncome2002_B02ID']\n",
+ "\n",
+ "# first figure: absolute counts, second figure: same as above but as fractions\n",
+ "for normalize, ylabel in [(False, 'No of Households'), (True, 'Fraction of Households')]:\n",
+ "    spc_counts = spc_income_cat.value_counts(normalize=normalize)\n",
+ "    nts_counts = nts_income_cat.value_counts(normalize=normalize)\n",
+ "\n",
+ "    fig, ax = plt.subplots(1, 2, figsize=(12, 6), sharey=True)\n",
+ "    ax[0].bar(spc_counts.index, spc_counts.values)\n",
+ "    ax[0].set_title('SPC')\n",
+ "    ax[0].set_xlabel('Income Bracket - Household level')\n",
+ "    ax[0].set_ylabel(ylabel)\n",
+ "    ax[1].bar(nts_counts.index, nts_counts.values)\n",
+ "    ax[1].set_title('NTS')\n",
+ "    ax[1].set_xlabel('Income Bracket - Household level')\n",
+ "    plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "HHIncome2002_B02ID\n",
+ " 1.0 35.969773\n",
+ " 3.0 34.382872\n",
+ " 2.0 29.559194\n",
+ "-8.0 0.088161\n",
+ "Name: proportion, dtype: float64"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# get the % of households in each income bracket for the nts\n",
+ "nts_households['HHIncome2002_B02ID'].value_counts(normalize=True) * 100"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Household Composition (No. of Adults / Children)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Number of adults and children in the household\n",
+ "\n",
+ "spc_edited = num_adult_child_hh(data = spc_edited,\n",
+ " group_col = 'household',\n",
+ " age_col = 'age_years')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Employment Status"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "({'0': 'Not applicable (age < 16)',\n",
+ " '1': 'Employee FT',\n",
+ " '2': 'Employee PT',\n",
+ " '3': 'Employee unspecified',\n",
+ " '4': 'Self-employed',\n",
+ " '5': 'Unemployed',\n",
+ " '6': 'Retired',\n",
+ " '7': 'Homemaker/Maternal leave',\n",
+ " '8': 'Student',\n",
+ " '9': 'Long term sickness/disability',\n",
+ " '10': 'Other'},\n",
+ " {'1': 'None',\n",
+ " '2': '0 FT, 1 PT',\n",
+ " '3': '1 FT, 0 PT',\n",
+ " '4': '0 FT, 2 PT',\n",
+ " '5': '1 FT, 1 PT',\n",
+ " '6': '2 FT, 0 PT',\n",
+ " '7': '1 FT, 2+ PT',\n",
+ " '8': '2 FT, 1+ PT',\n",
+ " '9': '0 FT, 3+ PT',\n",
+ " '10': '3+ FT, 0 PT',\n",
+ " '11': '3+ FT, 1+ PT',\n",
+ " '-8': 'NA',\n",
+ " '-10': 'DEAD'})"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Employment status\n",
+ "\n",
+ "# check the column values from our dictionaries\n",
+ "dict_spc['pwkstat'], dict_nts['HHoldEmploy_B01ID']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The NTS only reports the number of Full time and Part time employees for each household. For the SPC we also need to get the number of full time and part time workers for each household.\n",
+ "\n",
+ "Step 1: Create a column for Full time and a column for Part time"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " pwkstat_FT_hh | \n",
+ " pwkstat_PT_hh | \n",
+ "
\n",
+ " \n",
+ " household | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " pwkstat_FT_hh pwkstat_PT_hh\n",
+ "household \n",
+ "0 0 0\n",
+ "1 0 0\n",
+ "2 2 0\n",
+ "3 1 0\n",
+ "4 0 0\n",
+ "5 1 0\n",
+ "6 0 0\n",
+ "7 1 0\n",
+ "8 1 0\n",
+ "9 0 1"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Count only pwkstat 1 (Employee FT) and 2 (Employee PT) for each household\n",
+ "\n",
+ "counts_df = count_per_group(df = spc_edited,\n",
+ " group_col = 'household',\n",
+ " count_col = 'pwkstat',\n",
+ " values=[1, 2],\n",
+ " value_names=['pwkstat_FT_hh','pwkstat_PT_hh'])\n",
+ "\n",
+ "counts_df.head(10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Step 2: Create a column that matches the NTS categories (m FT, n PT)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " household | \n",
+ " pwkstat | \n",
+ " pwkstat_FT_hh | \n",
+ " pwkstat_PT_hh | \n",
+ " pwkstat_NTS_match | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 6 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 6 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 6 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 4 | \n",
+ " 10 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 4 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 5 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " household pwkstat pwkstat_FT_hh pwkstat_PT_hh pwkstat_NTS_match\n",
+ "0 0 6 0 0 1\n",
+ "1 1 6 0 0 1\n",
+ "2 1 6 0 0 1\n",
+ "3 2 1 2 0 6\n",
+ "4 2 1 2 0 6\n",
+ "5 3 1 1 0 3\n",
+ "6 4 10 0 0 1\n",
+ "7 4 4 0 0 1\n",
+ "8 4 0 0 0 1\n",
+ "9 5 1 1 0 3"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Recode the household FT / PT worker counts into the NTS employment\n",
+ "# categories, i.e. the keys of dict_nts['HHoldEmploy_B01ID']:\n",
+ "#\n",
+ "#     '1':   'None'\n",
+ "#     '2':   '0 FT, 1 PT'\n",
+ "#     '3':   '1 FT, 0 PT'\n",
+ "#     '4':   '0 FT, 2 PT'\n",
+ "#     '5':   '1 FT, 1 PT'\n",
+ "#     '6':   '2 FT, 0 PT'\n",
+ "#     '7':   '1 FT, 2+ PT'\n",
+ "#     '8':   '2 FT, 1+ PT'\n",
+ "#     '9':   '0 FT, 3+ PT'\n",
+ "#     '10':  '3+ FT, 0 PT'\n",
+ "#     '11':  '3+ FT, 1+ PT'\n",
+ "#     '-8':  'NA'\n",
+ "#     '-10': 'DEAD'\n",
+ "\n",
+ "# 1) Match each household to an NTS category\n",
+ "\n",
+ "# One boolean mask per NTS category, in the same order as `outputs` below.\n",
+ "# The masks are mutually exclusive and cover every combination of\n",
+ "# non-negative FT / PT counts.\n",
+ "conditions = [\n",
+ "    (counts_df['pwkstat_FT_hh'] == 0) & (counts_df['pwkstat_PT_hh'] == 0),\n",
+ "    (counts_df['pwkstat_FT_hh'] == 0) & (counts_df['pwkstat_PT_hh'] == 1),\n",
+ "    (counts_df['pwkstat_FT_hh'] == 1) & (counts_df['pwkstat_PT_hh'] == 0),\n",
+ "    (counts_df['pwkstat_FT_hh'] == 0) & (counts_df['pwkstat_PT_hh'] == 2),\n",
+ "    (counts_df['pwkstat_FT_hh'] == 1) & (counts_df['pwkstat_PT_hh'] == 1),\n",
+ "    (counts_df['pwkstat_FT_hh'] == 2) & (counts_df['pwkstat_PT_hh'] == 0),\n",
+ "    (counts_df['pwkstat_FT_hh'] == 1) & (counts_df['pwkstat_PT_hh'] >= 2),\n",
+ "    (counts_df['pwkstat_FT_hh'] == 2) & (counts_df['pwkstat_PT_hh'] >= 1),\n",
+ "    (counts_df['pwkstat_FT_hh'] == 0) & (counts_df['pwkstat_PT_hh'] >= 3),\n",
+ "    (counts_df['pwkstat_FT_hh'] >= 3) & (counts_df['pwkstat_PT_hh'] == 0),\n",
+ "    (counts_df['pwkstat_FT_hh'] >= 3) & (counts_df['pwkstat_PT_hh'] >= 1),\n",
+ "]\n",
+ "\n",
+ "# The NTS category code for each mask, based on dict_nts['HHoldEmploy_B01ID']\n",
+ "outputs = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]\n",
+ "\n",
+ "# Pick the code of the first matching mask; -8 ('NA') is only a safeguard\n",
+ "# for rows that match none of them\n",
+ "counts_df['pwkstat_NTS_match'] = np.select(conditions, outputs, default=-8)\n",
+ "\n",
+ "# 2) Merge the household-level columns back onto the spc.\n",
+ "# Columns left over from a previous run of this cell are dropped first, so\n",
+ "# re-running it does not create suffixed duplicates (pwkstat_FT_hh_x, ...)\n",
+ "# that would break the column selection below.\n",
+ "spc_edited = (\n",
+ "    spc_edited\n",
+ "    .drop(columns=counts_df.columns, errors='ignore')\n",
+ "    .merge(counts_df, left_on='household', right_index=True)\n",
+ ")\n",
+ "\n",
+ "# check the output\n",
+ "spc_edited[['household', 'pwkstat', 'pwkstat_FT_hh', 'pwkstat_PT_hh', 'pwkstat_NTS_match']].head(10)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Compare the household-level employment categories of the SPC and the NTS:\n",
+ "# first as absolute counts, then as shares of all households\n",
+ "for normalise_counts, y_axis_label in [(False, 'Frequency'), (True, 'Frequency (normalized)')]:\n",
+ "    spc_counts = counts_df['pwkstat_NTS_match'].value_counts(normalize=normalise_counts)\n",
+ "    nts_counts = nts_households['HHoldEmploy_B01ID'].value_counts(normalize=normalise_counts)\n",
+ "\n",
+ "    fig, ax = plt.subplots(1, 2, figsize=(12, 6))\n",
+ "    for panel, title, counts in zip(ax, ['SPC', 'NTS'], [spc_counts, nts_counts]):\n",
+ "        panel.bar(counts.index, counts.values)\n",
+ "        panel.set_title(title)\n",
+ "        panel.set_xlabel('Employment status - Household level')\n",
+ "    # only the left-hand panel carries the y-axis label\n",
+ "    ax[0].set_ylabel(y_axis_label)\n",
+ "    plt.show()\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Urban Rural Classification\n",
+ "\n",
+ "We use the 2011 rural urban classification to match the SPC to the NTS. The NTS has 2 columns that we can use to match to the SPC: `Settlement2011EW_B03ID` and `Settlement2011EW_B04ID`. The `Settlement2011EW_B03ID` column is more general (urban / rural only), while the `Settlement2011EW_B04ID` column is more specific. We stick to the more general column for now."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " household | \n",
+ " location | \n",
+ " pid_hs | \n",
+ " msoa | \n",
+ " oa | \n",
+ " members | \n",
+ " sic1d2007 | \n",
+ " sic2d2007 | \n",
+ " pwkstat | \n",
+ " salary_yearly | \n",
+ " salary_hourly | \n",
+ " hid | \n",
+ " accommodation_type | \n",
+ " communal_type | \n",
+ " num_rooms | \n",
+ " central_heat | \n",
+ " tenure | \n",
+ " num_cars | \n",
+ " sex | \n",
+ " age_years | \n",
+ " ethnicity | \n",
+ " nssec8 | \n",
+ " salary_yearly_hh | \n",
+ " salary_yearly_hh_cat | \n",
+ " is_adult | \n",
+ " num_adults | \n",
+ " is_child | \n",
+ " num_children | \n",
+ " is_pension_age | \n",
+ " num_pension_age | \n",
+ " pwkstat_FT_hh | \n",
+ " pwkstat_PT_hh | \n",
+ " pwkstat_NTS_match | \n",
+ " OA11CD | \n",
+ " RUC11 | \n",
+ " RUC11CD | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " {'x': -1.7892179489135742, 'y': 53.91915130615... | \n",
+ " 2905399 | \n",
+ " E02002183 | \n",
+ " E00053954 | \n",
+ " [0] | \n",
+ " J | \n",
+ " 58.0 | \n",
+ " 6 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " E02002183_0001 | \n",
+ " 1.0 | \n",
+ " NaN | \n",
+ " 2.0 | \n",
+ " True | \n",
+ " 2.0 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 86 | \n",
+ " 1 | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " E00053954 | \n",
+ " Urban city and town | \n",
+ " C1 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " {'x': -1.8262380361557007, 'y': 53.92028045654... | \n",
+ " 2905308 | \n",
+ " E02002183 | \n",
+ " E00053953 | \n",
+ " [1, 2] | \n",
+ " C | \n",
+ " 25.0 | \n",
+ " 6 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " E02002183_0002 | \n",
+ " 3.0 | \n",
+ " NaN | \n",
+ " 6.0 | \n",
+ " True | \n",
+ " 2.0 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 74 | \n",
+ " 3 | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " E00053953 | \n",
+ " Urban city and town | \n",
+ " C1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " {'x': -1.8262380361557007, 'y': 53.92028045654... | \n",
+ " 2907681 | \n",
+ " E02002183 | \n",
+ " E00053953 | \n",
+ " [1, 2] | \n",
+ " P | \n",
+ " 85.0 | \n",
+ " 6 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " E02002183_0002 | \n",
+ " 3.0 | \n",
+ " NaN | \n",
+ " 6.0 | \n",
+ " True | \n",
+ " 2.0 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 68 | \n",
+ " 1 | \n",
+ " 2.0 | \n",
+ " 0.000000 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " E00053953 | \n",
+ " Urban city and town | \n",
+ " C1 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " {'x': -1.8749940395355225, 'y': 53.94298934936... | \n",
+ " 2902817 | \n",
+ " E02002183 | \n",
+ " E00053689 | \n",
+ " [3, 4] | \n",
+ " C | \n",
+ " 31.0 | \n",
+ " 1 | \n",
+ " 32857.859375 | \n",
+ " 14.360952 | \n",
+ " E02002183_0003 | \n",
+ " 3.0 | \n",
+ " NaN | \n",
+ " 6.0 | \n",
+ " True | \n",
+ " 2.0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 27 | \n",
+ " 1 | \n",
+ " 4.0 | \n",
+ " 51020.310547 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 6 | \n",
+ " E00053689 | \n",
+ " Rural town and fringe | \n",
+ " D1 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 2 | \n",
+ " {'x': -1.8749940395355225, 'y': 53.94298934936... | \n",
+ " 2900884 | \n",
+ " E02002183 | \n",
+ " E00053689 | \n",
+ " [3, 4] | \n",
+ " J | \n",
+ " 62.0 | \n",
+ " 1 | \n",
+ " 18162.451172 | \n",
+ " 9.439944 | \n",
+ " E02002183_0003 | \n",
+ " 3.0 | \n",
+ " NaN | \n",
+ " 6.0 | \n",
+ " True | \n",
+ " 2.0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 26 | \n",
+ " 1 | \n",
+ " 6.0 | \n",
+ " 51020.310547 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 6 | \n",
+ " E00053689 | \n",
+ " Rural town and fringe | \n",
+ " D1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id household location pid_hs \\\n",
+ "0 0 0 {'x': -1.7892179489135742, 'y': 53.91915130615... 2905399 \n",
+ "1 1 1 {'x': -1.8262380361557007, 'y': 53.92028045654... 2905308 \n",
+ "2 2 1 {'x': -1.8262380361557007, 'y': 53.92028045654... 2907681 \n",
+ "3 3 2 {'x': -1.8749940395355225, 'y': 53.94298934936... 2902817 \n",
+ "4 4 2 {'x': -1.8749940395355225, 'y': 53.94298934936... 2900884 \n",
+ "\n",
+ " msoa oa members sic1d2007 sic2d2007 pwkstat salary_yearly \\\n",
+ "0 E02002183 E00053954 [0] J 58.0 6 NaN \n",
+ "1 E02002183 E00053953 [1, 2] C 25.0 6 NaN \n",
+ "2 E02002183 E00053953 [1, 2] P 85.0 6 NaN \n",
+ "3 E02002183 E00053689 [3, 4] C 31.0 1 32857.859375 \n",
+ "4 E02002183 E00053689 [3, 4] J 62.0 1 18162.451172 \n",
+ "\n",
+ " salary_hourly hid accommodation_type communal_type \\\n",
+ "0 NaN E02002183_0001 1.0 NaN \n",
+ "1 NaN E02002183_0002 3.0 NaN \n",
+ "2 NaN E02002183_0002 3.0 NaN \n",
+ "3 14.360952 E02002183_0003 3.0 NaN \n",
+ "4 9.439944 E02002183_0003 3.0 NaN \n",
+ "\n",
+ " num_rooms central_heat tenure num_cars sex age_years ethnicity \\\n",
+ "0 2.0 True 2.0 2 1 86 1 \n",
+ "1 6.0 True 2.0 2 1 74 3 \n",
+ "2 6.0 True 2.0 2 2 68 1 \n",
+ "3 6.0 True 2.0 1 1 27 1 \n",
+ "4 6.0 True 2.0 1 2 26 1 \n",
+ "\n",
+ " nssec8 salary_yearly_hh salary_yearly_hh_cat is_adult num_adults \\\n",
+ "0 1.0 0.000000 1 1 1 \n",
+ "1 1.0 0.000000 1 1 2 \n",
+ "2 2.0 0.000000 1 1 2 \n",
+ "3 4.0 51020.310547 3 1 2 \n",
+ "4 6.0 51020.310547 3 1 2 \n",
+ "\n",
+ " is_child num_children is_pension_age num_pension_age pwkstat_FT_hh \\\n",
+ "0 0 0 1 1 0 \n",
+ "1 0 0 1 2 0 \n",
+ "2 0 0 1 2 0 \n",
+ "3 0 0 0 0 2 \n",
+ "4 0 0 0 0 2 \n",
+ "\n",
+ " pwkstat_PT_hh pwkstat_NTS_match OA11CD RUC11 RUC11CD \n",
+ "0 0 1 E00053954 Urban city and town C1 \n",
+ "1 0 1 E00053953 Urban city and town C1 \n",
+ "2 0 1 E00053953 Urban city and town C1 \n",
+ "3 0 6 E00053689 Rural town and fringe D1 \n",
+ "4 0 6 E00053689 Rural town and fringe D1 "
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# read the 2011 rural urban classification data\n",
+ "rural_urban = pd.read_csv('../data/external/census_2011_rural_urban.csv', sep=',')\n",
+ "\n",
+ "# Attach the classification to the spc via the output area code.\n",
+ "# Lookup columns left over from a previous run of this cell are dropped first,\n",
+ "# so re-running it does not create suffixed duplicates (RUC11_x, RUC11_y, ...).\n",
+ "# NOTE: this is an inner join (the pandas default), so spc rows whose `oa` is\n",
+ "# missing from the lookup are dropped.\n",
+ "ruc_cols = ['OA11CD', 'RUC11', 'RUC11CD']\n",
+ "spc_edited = (\n",
+ "    spc_edited\n",
+ "    .drop(columns=ruc_cols, errors='ignore')\n",
+ "    .merge(rural_urban[ruc_cols], left_on='oa', right_on='OA11CD')\n",
+ ")\n",
+ "spc_edited.head(5)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create dictionary from the NTS `Settlement2011EW_B03ID` column\n",
+ "Settlement2011EW_B03ID_nts_hh = {\n",
+ " '1': 'Urban',\n",
+ " '2': 'Rural',\n",
+ " '3': 'Scotland',\n",
+ " '-8': 'NA',\n",
+ " '-10': 'DEAD'\n",
+ "}\n",
+ "\n",
+ "Settlement2011EW_B04ID_nts_hh = {\n",
+ " '1': 'Urban Conurbation',\n",
+ " '2': 'Urban City and Town',\n",
+ " '3': 'Rural Town and Fringe',\n",
+ " '4': 'Rural Village, Hamlet and Isolated Dwellings',\n",
+ " '5': 'Scotland',\n",
+ " '-8': 'NA',\n",
+ " '-10': 'DEAD'\n",
+ "}\n",
+ "\n",
+ "\n",
+ "census_2011_to_nts_B03ID = {\n",
+ " 'Urban major conurbation': 'Urban',\n",
+ " 'Urban minor conurbation': 'Urban',\n",
+ " 'Urban city and town': 'Urban',\n",
+ " 'Urban city and town in a sparse setting': 'Urban',\n",
+ " 'Rural town and fringe': 'Rural',\n",
+ " 'Rural town and fringe in a sparse setting': 'Rural',\n",
+ " 'Rural village': 'Rural',\n",
+ " 'Rural village in a sparse setting': 'Rural',\n",
+ " 'Rural hamlets and isolated dwellings': 'Rural',\n",
+ " 'Rural hamlets and isolated dwellings in a sparse setting': 'Rural'\n",
+ "}\n",
+ "\n",
+ "census_2011_to_nts_B04ID = {\n",
+ " 'Urban major conurbation': 'Urban Conurbation',\n",
+ " 'Urban minor conurbation': 'Urban Conurbation',\n",
+ " 'Urban city and town': 'Urban City and Town',\n",
+ " 'Urban city and town in a sparse setting': 'Urban City and Town',\n",
+ " 'Rural town and fringe': 'Rural Town and Fringe',\n",
+ " 'Rural town and fringe in a sparse setting': 'Rural Town and Fringe',\n",
+ " 'Rural village': 'Rural Village, Hamlet and Isolated Dwellings',\n",
+ " 'Rural village in a sparse setting': 'Rural Village, Hamlet and Isolated Dwellings',\n",
+ " 'Rural hamlets and isolated dwellings': 'Rural Village, Hamlet and Isolated Dwellings',\n",
+ " 'Rural hamlets and isolated dwellings in a sparse setting': 'Rural Village, Hamlet and Isolated Dwellings'\n",
+ "}\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " household | \n",
+ " location | \n",
+ " pid_hs | \n",
+ " msoa | \n",
+ " oa | \n",
+ " members | \n",
+ " sic1d2007 | \n",
+ " sic2d2007 | \n",
+ " pwkstat | \n",
+ " salary_yearly | \n",
+ " salary_hourly | \n",
+ " hid | \n",
+ " accommodation_type | \n",
+ " communal_type | \n",
+ " num_rooms | \n",
+ " central_heat | \n",
+ " tenure | \n",
+ " num_cars | \n",
+ " sex | \n",
+ " age_years | \n",
+ " ethnicity | \n",
+ " nssec8 | \n",
+ " salary_yearly_hh | \n",
+ " salary_yearly_hh_cat | \n",
+ " is_adult | \n",
+ " num_adults | \n",
+ " is_child | \n",
+ " num_children | \n",
+ " is_pension_age | \n",
+ " num_pension_age | \n",
+ " pwkstat_FT_hh | \n",
+ " pwkstat_PT_hh | \n",
+ " pwkstat_NTS_match | \n",
+ " OA11CD | \n",
+ " RUC11 | \n",
+ " RUC11CD | \n",
+ " Settlement2011EW_B03ID_spc | \n",
+ " Settlement2011EW_B04ID_spc | \n",
+ " Settlement2011EW_B03ID_spc_CD | \n",
+ " Settlement2011EW_B04ID_spc_CD | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " {'x': -1.7892179489135742, 'y': 53.91915130615... | \n",
+ " 2905399 | \n",
+ " E02002183 | \n",
+ " E00053954 | \n",
+ " [0] | \n",
+ " J | \n",
+ " 58.0 | \n",
+ " 6 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " E02002183_0001 | \n",
+ " 1.0 | \n",
+ " NaN | \n",
+ " 2.0 | \n",
+ " True | \n",
+ " 2.0 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 86 | \n",
+ " 1 | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " E00053954 | \n",
+ " Urban city and town | \n",
+ " C1 | \n",
+ " Urban | \n",
+ " Urban City and Town | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " {'x': -1.8262380361557007, 'y': 53.92028045654... | \n",
+ " 2905308 | \n",
+ " E02002183 | \n",
+ " E00053953 | \n",
+ " [1, 2] | \n",
+ " C | \n",
+ " 25.0 | \n",
+ " 6 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " E02002183_0002 | \n",
+ " 3.0 | \n",
+ " NaN | \n",
+ " 6.0 | \n",
+ " True | \n",
+ " 2.0 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 74 | \n",
+ " 3 | \n",
+ " 1.0 | \n",
+ " 0.000000 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " E00053953 | \n",
+ " Urban city and town | \n",
+ " C1 | \n",
+ " Urban | \n",
+ " Urban City and Town | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " {'x': -1.8262380361557007, 'y': 53.92028045654... | \n",
+ " 2907681 | \n",
+ " E02002183 | \n",
+ " E00053953 | \n",
+ " [1, 2] | \n",
+ " P | \n",
+ " 85.0 | \n",
+ " 6 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " E02002183_0002 | \n",
+ " 3.0 | \n",
+ " NaN | \n",
+ " 6.0 | \n",
+ " True | \n",
+ " 2.0 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 68 | \n",
+ " 1 | \n",
+ " 2.0 | \n",
+ " 0.000000 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " E00053953 | \n",
+ " Urban city and town | \n",
+ " C1 | \n",
+ " Urban | \n",
+ " Urban City and Town | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " {'x': -1.8749940395355225, 'y': 53.94298934936... | \n",
+ " 2902817 | \n",
+ " E02002183 | \n",
+ " E00053689 | \n",
+ " [3, 4] | \n",
+ " C | \n",
+ " 31.0 | \n",
+ " 1 | \n",
+ " 32857.859375 | \n",
+ " 14.360952 | \n",
+ " E02002183_0003 | \n",
+ " 3.0 | \n",
+ " NaN | \n",
+ " 6.0 | \n",
+ " True | \n",
+ " 2.0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 27 | \n",
+ " 1 | \n",
+ " 4.0 | \n",
+ " 51020.310547 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 6 | \n",
+ " E00053689 | \n",
+ " Rural town and fringe | \n",
+ " D1 | \n",
+ " Rural | \n",
+ " Rural Town and Fringe | \n",
+ " 2 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 2 | \n",
+ " {'x': -1.8749940395355225, 'y': 53.94298934936... | \n",
+ " 2900884 | \n",
+ " E02002183 | \n",
+ " E00053689 | \n",
+ " [3, 4] | \n",
+ " J | \n",
+ " 62.0 | \n",
+ " 1 | \n",
+ " 18162.451172 | \n",
+ " 9.439944 | \n",
+ " E02002183_0003 | \n",
+ " 3.0 | \n",
+ " NaN | \n",
+ " 6.0 | \n",
+ " True | \n",
+ " 2.0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 26 | \n",
+ " 1 | \n",
+ " 6.0 | \n",
+ " 51020.310547 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 6 | \n",
+ " E00053689 | \n",
+ " Rural town and fringe | \n",
+ " D1 | \n",
+ " Rural | \n",
+ " Rural Town and Fringe | \n",
+ " 2 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id household location pid_hs \\\n",
+ "0 0 0 {'x': -1.7892179489135742, 'y': 53.91915130615... 2905399 \n",
+ "1 1 1 {'x': -1.8262380361557007, 'y': 53.92028045654... 2905308 \n",
+ "2 2 1 {'x': -1.8262380361557007, 'y': 53.92028045654... 2907681 \n",
+ "3 3 2 {'x': -1.8749940395355225, 'y': 53.94298934936... 2902817 \n",
+ "4 4 2 {'x': -1.8749940395355225, 'y': 53.94298934936... 2900884 \n",
+ "\n",
+ " msoa oa members sic1d2007 sic2d2007 pwkstat salary_yearly \\\n",
+ "0 E02002183 E00053954 [0] J 58.0 6 NaN \n",
+ "1 E02002183 E00053953 [1, 2] C 25.0 6 NaN \n",
+ "2 E02002183 E00053953 [1, 2] P 85.0 6 NaN \n",
+ "3 E02002183 E00053689 [3, 4] C 31.0 1 32857.859375 \n",
+ "4 E02002183 E00053689 [3, 4] J 62.0 1 18162.451172 \n",
+ "\n",
+ " salary_hourly hid accommodation_type communal_type \\\n",
+ "0 NaN E02002183_0001 1.0 NaN \n",
+ "1 NaN E02002183_0002 3.0 NaN \n",
+ "2 NaN E02002183_0002 3.0 NaN \n",
+ "3 14.360952 E02002183_0003 3.0 NaN \n",
+ "4 9.439944 E02002183_0003 3.0 NaN \n",
+ "\n",
+ " num_rooms central_heat tenure num_cars sex age_years ethnicity \\\n",
+ "0 2.0 True 2.0 2 1 86 1 \n",
+ "1 6.0 True 2.0 2 1 74 3 \n",
+ "2 6.0 True 2.0 2 2 68 1 \n",
+ "3 6.0 True 2.0 1 1 27 1 \n",
+ "4 6.0 True 2.0 1 2 26 1 \n",
+ "\n",
+ " nssec8 salary_yearly_hh salary_yearly_hh_cat is_adult num_adults \\\n",
+ "0 1.0 0.000000 1 1 1 \n",
+ "1 1.0 0.000000 1 1 2 \n",
+ "2 2.0 0.000000 1 1 2 \n",
+ "3 4.0 51020.310547 3 1 2 \n",
+ "4 6.0 51020.310547 3 1 2 \n",
+ "\n",
+ " is_child num_children is_pension_age num_pension_age pwkstat_FT_hh \\\n",
+ "0 0 0 1 1 0 \n",
+ "1 0 0 1 2 0 \n",
+ "2 0 0 1 2 0 \n",
+ "3 0 0 0 0 2 \n",
+ "4 0 0 0 0 2 \n",
+ "\n",
+ " pwkstat_PT_hh pwkstat_NTS_match OA11CD RUC11 RUC11CD \\\n",
+ "0 0 1 E00053954 Urban city and town C1 \n",
+ "1 0 1 E00053953 Urban city and town C1 \n",
+ "2 0 1 E00053953 Urban city and town C1 \n",
+ "3 0 6 E00053689 Rural town and fringe D1 \n",
+ "4 0 6 E00053689 Rural town and fringe D1 \n",
+ "\n",
+ " Settlement2011EW_B03ID_spc Settlement2011EW_B04ID_spc \\\n",
+ "0 Urban Urban City and Town \n",
+ "1 Urban Urban City and Town \n",
+ "2 Urban Urban City and Town \n",
+ "3 Rural Rural Town and Fringe \n",
+ "4 Rural Rural Town and Fringe \n",
+ "\n",
+ " Settlement2011EW_B03ID_spc_CD Settlement2011EW_B04ID_spc_CD \n",
+ "0 1 2 \n",
+ "1 1 2 \n",
+ "2 1 2 \n",
+ "3 2 3 \n",
+ "4 2 3 "
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Translate each person's census RUC11 label into the two NTS settlement labels\n",
+ "spc_edited['Settlement2011EW_B03ID_spc'] = spc_edited['RUC11'].map(census_2011_to_nts_B03ID)\n",
+ "spc_edited['Settlement2011EW_B04ID_spc'] = spc_edited['RUC11'].map(census_2011_to_nts_B04ID)\n",
+ "\n",
+ "# A RUC11 value missing from the mappings becomes NaN and would make the\n",
+ "# integer casts below fail with an opaque error. Fail here instead, naming\n",
+ "# the offending values.\n",
+ "unmapped_ruc11 = spc_edited.loc[\n",
+ "    spc_edited['Settlement2011EW_B03ID_spc'].isna() | spc_edited['Settlement2011EW_B04ID_spc'].isna(),\n",
+ "    'RUC11',\n",
+ "].unique()\n",
+ "if len(unmapped_ruc11) > 0:\n",
+ "    raise ValueError(f'RUC11 values without an NTS settlement mapping: {list(unmapped_ruc11)}')\n",
+ "\n",
+ "# Add the NTS codes (the keys of Settlement2011EW_B03ID_nts_hh and\n",
+ "# Settlement2011EW_B04ID_nts_hh) that correspond to the labels above.\n",
+ "\n",
+ "# reverse the dictionaries (label -> code) ...\n",
+ "Settlement2011EW_B03ID_nts_rev = {v: k for k, v in Settlement2011EW_B03ID_nts_hh.items()}\n",
+ "Settlement2011EW_B04ID_nts_rev = {v: k for k, v in Settlement2011EW_B04ID_nts_hh.items()}\n",
+ "\n",
+ "# ... and map the labels to integer codes\n",
+ "spc_edited['Settlement2011EW_B03ID_spc_CD'] = (\n",
+ "    spc_edited['Settlement2011EW_B03ID_spc'].map(Settlement2011EW_B03ID_nts_rev).astype('int')\n",
+ ")\n",
+ "spc_edited['Settlement2011EW_B04ID_spc_CD'] = (\n",
+ "    spc_edited['Settlement2011EW_B04ID_spc'].map(Settlement2011EW_B04ID_nts_rev).astype('int')\n",
+ ")\n",
+ "spc_edited.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2.2 Edit NTS columns"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Number of people of pension age"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Count, per NTS household, the individuals whose OfPenAge_B01ID is 1\n",
+ "nts_pensioners = count_per_group(\n",
+ "    df=nts_individuals,\n",
+ "    group_col='HouseholdID',\n",
+ "    count_col='OfPenAge_B01ID',\n",
+ "    values=[1],\n",
+ "    value_names=['num_pension_age_nts'],\n",
+ ")\n",
+ "\n",
+ "# Join onto the nts household df, keeping every household (left join).\n",
+ "# The count column from a previous run of this cell is dropped first, so\n",
+ "# re-running it does not create suffixed duplicates (num_pension_age_nts_x, ...).\n",
+ "nts_households = (\n",
+ "    nts_households\n",
+ "    .drop(columns=nts_pensioners.columns, errors='ignore')\n",
+ "    .merge(nts_pensioners, left_on='HouseholdID', right_index=True, how='left')\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Number of cars\n",
+ "\n",
+ "- `SPC.num_cars` only has values [0, 1, 2]. 2 is for all households with 2 or more cars\n",
+ "- `NTS.NumCar` is more detailed. It has the actual value of the number of cars. We will cap this at 2."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " NumCar | \n",
+ " NumCar_SPC_match | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 142954 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 142955 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 142956 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 142957 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " 142958 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 142959 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 142960 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " 142961 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 142962 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " 142963 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 142964 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 142965 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 142966 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 142967 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 142968 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 142969 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 142970 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ " 142971 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 142972 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 142973 | \n",
+ " 3.0 | \n",
+ " 2.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " NumCar NumCar_SPC_match\n",
+ "142954 1.0 1.0\n",
+ "142955 1.0 1.0\n",
+ "142956 0.0 0.0\n",
+ "142957 2.0 2.0\n",
+ "142958 1.0 1.0\n",
+ "142959 1.0 1.0\n",
+ "142960 2.0 2.0\n",
+ "142961 1.0 1.0\n",
+ "142962 2.0 2.0\n",
+ "142963 1.0 1.0\n",
+ "142964 1.0 1.0\n",
+ "142965 1.0 1.0\n",
+ "142966 0.0 0.0\n",
+ "142967 1.0 1.0\n",
+ "142968 1.0 1.0\n",
+ "142969 0.0 0.0\n",
+ "142970 2.0 2.0\n",
+ "142971 0.0 0.0\n",
+ "142972 1.0 1.0\n",
+ "142973 3.0 2.0"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Cap the NTS car count at 2 so that it is comparable with `SPC.num_cars`\n",
+ "nts_households.loc[:, 'NumCar_SPC_match'] = (\n",
+ "    nts_households['NumCar']\n",
+ "    .apply(lambda n_cars: truncate_values(n_cars, upper=2))\n",
+ ")\n",
+ "\n",
+ "nts_households[['NumCar', 'NumCar_SPC_match']].head(20)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Type of tenancy\n",
+ "\n",
+ "The NTS and the SPC break tenure down into different categories, so we map both onto a common set of categories."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "({'1': 'Owns / buying',\n",
+ " '2': 'Rents',\n",
+ " '3': 'Other (including rent free)',\n",
+ " '-8': 'NA',\n",
+ " '-9': 'DNA',\n",
+ " '-10': 'DEAD'},\n",
+ " {'1': 'Owned: Owned outright',\n",
+ " '2': 'Owned: Owned with a mortgage or loan or shared ownership',\n",
+ " '3': 'Rented or living rent free: Total',\n",
+ " '4': 'Rented: Social rented',\n",
+ " '5': 'Rented: Private rented or living rent free',\n",
+ " '-8': 'NA',\n",
+ " '-9': 'DNA',\n",
+ " '-10': 'DEAD'})"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dict_nts['Ten1_B02ID'], dict_spc['tenure']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Create dictionaries to map tenure onto the spc and nts dfs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Dictionary showing how we want the final columns to look like\n",
+ "tenure_dict_nts_spc = {\n",
+ " 1: 'Owned',\n",
+ " 2: 'Rented or rent free',\n",
+ " -8: 'NA',\n",
+ " -9: 'DNA',\n",
+ " -10: 'DEAD'\n",
+ "}\n",
+ "\n",
+ "# Matching NTS to tenure_dict_nts_spc\n",
+ "\n",
+ "# Create a new dictionary for matching\n",
+ "matching_dict_nts_tenure = {\n",
+ " 1: 1,\n",
+ " 2: 2,\n",
+ " 3: 2\n",
+ "}\n",
+ "\n",
+ "matching_dict_spc_tenure = {\n",
+ " 1: 1, #'Owned: Owned outright' : 'Owned'\n",
+ " 2: 1, #'Owned: Owned with a mortgage or loan or shared ownership', : 'Owned'\n",
+ " 3: 2, #'Rented or living rent free: Total', : 'Rented or rent free'\n",
+ " 4: 2, #'Rented: Social rented', : 'Rented or rent free'\n",
+ " 5: 2, #'Rented: Private rented or living rent free', : 'Rented or rent free'\n",
+ "}\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Map the dictionaries onto both dataframes to create comparable tenure columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Recode tenure in both dataframes to the common categories.\n",
+ "# Codes missing from a matching dictionary keep their original value.\n",
+ "for frame, source_col, target_col, recode in [\n",
+ "    (nts_households, 'Ten1_B02ID', 'tenure_nts_for_matching', matching_dict_nts_tenure),\n",
+ "    (spc_edited, 'tenure', 'tenure_spc_for_matching', matching_dict_spc_tenure),\n",
+ "]:\n",
+ "    original_codes = frame[source_col]\n",
+ "    # unmapped codes come back from .map() as NaN; fill them with the original value\n",
+ "    frame[target_col] = original_codes.map(recode).fillna(original_codes)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Step 3: Matching at Household Level\n",
+ "\n",
+ "Now that we've prepared all the columns, we can start matching."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3.1 Categorical matching\n",
+ "\n",
+ "We will match on (a subset of) the following columns:\n",
+ "\n",
+ "| Matching variable | NTS column | SPC column |\n",
+ "| ------------------| ---------- | ---------- |\n",
+ "| Household income | `HHIncome2002_B02ID` | `salary_yearly_hh_cat` |\n",
+ "| Number of adults | `HHoldNumAdults` | `num_adults` |\n",
+ "| Number of children | `HHoldNumChildren` | `num_children` |\n",
+ "| Employment status | `HHoldEmploy_B01ID` | `pwkstat_NTS_match` |\n",
+ "| Car ownership | `NumCar_SPC_match` | `num_cars` |\n",
+ "| Type of tenancy | `tenure_nts_for_matching` | `tenure_spc_for_matching` |\n",
+ "| Rural/Urban Classification | `Settlement2011EW_B03ID` | `Settlement2011EW_B03ID_spc_CD` |"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Prepare SPC df for matching"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " hid | \n",
+ " salary_yearly_hh_cat | \n",
+ " num_adults | \n",
+ " num_children | \n",
+ " num_pension_age | \n",
+ " pwkstat_NTS_match | \n",
+ " num_cars | \n",
+ " tenure_spc_for_matching | \n",
+ " Settlement2011EW_B03ID_spc_CD | \n",
+ " Settlement2011EW_B04ID_spc_CD | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " E02002183_0001 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1.0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " E02002183_0002 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1.0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " E02002183_0003 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 6 | \n",
+ " 1 | \n",
+ " 1.0 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " E02002183_0004 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1.0 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " E02002183_0005 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1.0 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " E02002183_0006 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 2.0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " E02002183_0007 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1.0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " E02002183_0008 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 1.0 | \n",
+ " 2 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " E02002183_0009 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 1.0 | \n",
+ " 2 | \n",
+ " 4 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " E02002183_0010 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1.0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " hid salary_yearly_hh_cat num_adults num_children \\\n",
+ "0 E02002183_0001 1 1 0 \n",
+ "1 E02002183_0002 1 2 0 \n",
+ "3 E02002183_0003 3 2 0 \n",
+ "5 E02002183_0004 2 1 0 \n",
+ "6 E02002183_0005 1 2 1 \n",
+ "9 E02002183_0006 3 1 0 \n",
+ "10 E02002183_0007 1 2 1 \n",
+ "13 E02002183_0008 2 1 0 \n",
+ "14 E02002183_0009 1 2 0 \n",
+ "16 E02002183_0010 1 2 1 \n",
+ "\n",
+ " num_pension_age pwkstat_NTS_match num_cars tenure_spc_for_matching \\\n",
+ "0 1 1 2 1.0 \n",
+ "1 2 1 2 1.0 \n",
+ "3 0 6 1 1.0 \n",
+ "5 0 3 1 1.0 \n",
+ "6 1 1 2 1.0 \n",
+ "9 0 3 1 2.0 \n",
+ "10 2 1 1 1.0 \n",
+ "13 0 3 2 1.0 \n",
+ "14 0 3 2 1.0 \n",
+ "16 0 2 2 1.0 \n",
+ "\n",
+ " Settlement2011EW_B03ID_spc_CD Settlement2011EW_B04ID_spc_CD \n",
+ "0 1 2 \n",
+ "1 1 2 \n",
+ "3 2 3 \n",
+ "5 2 3 \n",
+ "6 2 3 \n",
+ "9 1 2 \n",
+ "10 1 2 \n",
+ "13 2 4 \n",
+ "14 2 4 \n",
+ "16 1 2 "
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Select multiple columns\n",
+ "spc_matching = spc_edited[[\n",
+ " 'hid',\n",
+ " 'salary_yearly_hh_cat', 'num_adults',\n",
+ " 'num_children', 'num_pension_age', 'pwkstat_NTS_match',\n",
+ " 'num_cars', 'tenure_spc_for_matching',\n",
+ " 'Settlement2011EW_B03ID_spc_CD', 'Settlement2011EW_B04ID_spc_CD']]\n",
+ "\n",
+ "# edit the df so that we have one row per hid\n",
+ "spc_matching = spc_matching.drop_duplicates(subset='hid')\n",
+ "\n",
+ "spc_matching.head(10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Prepare NTS df for matching"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " HouseholdID | \n",
+ " HHIncome2002_B02ID | \n",
+ " HHoldNumAdults | \n",
+ " HHoldNumChildren | \n",
+ " num_pension_age_nts | \n",
+ " HHoldEmploy_B01ID | \n",
+ " NumCar_SPC_match | \n",
+ " tenure_nts_for_matching | \n",
+ " Settlement2011EW_B03ID | \n",
+ " Settlement2011EW_B04ID | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 142954 | \n",
+ " 2019001895 | \n",
+ " 1.0 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 142955 | \n",
+ " 2019002676 | \n",
+ " 1.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 142956 | \n",
+ " 2019001891 | \n",
+ " 1.0 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0.0 | \n",
+ " 2.0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 142957 | \n",
+ " 2019002687 | \n",
+ " 2.0 | \n",
+ " 4 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 4 | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 142958 | \n",
+ " 2019001913 | \n",
+ " 1.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 142959 | \n",
+ " 2019002273 | \n",
+ " 2.0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 142960 | \n",
+ " 2019001906 | \n",
+ " 3.0 | \n",
+ " 3 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 5 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 142961 | \n",
+ " 2019001910 | \n",
+ " 1.0 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 142962 | \n",
+ " 2019002688 | \n",
+ " 3.0 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 6 | \n",
+ " 2.0 | \n",
+ " 1.0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " 142963 | \n",
+ " 2019002686 | \n",
+ " 1.0 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " HouseholdID HHIncome2002_B02ID HHoldNumAdults HHoldNumChildren \\\n",
+ "142954 2019001895 1.0 3 0 \n",
+ "142955 2019002676 1.0 1 0 \n",
+ "142956 2019001891 1.0 2 0 \n",
+ "142957 2019002687 2.0 4 0 \n",
+ "142958 2019001913 1.0 1 0 \n",
+ "142959 2019002273 2.0 1 0 \n",
+ "142960 2019001906 3.0 3 0 \n",
+ "142961 2019001910 1.0 2 0 \n",
+ "142962 2019002688 3.0 2 2 \n",
+ "142963 2019002686 1.0 2 0 \n",
+ "\n",
+ " num_pension_age_nts HHoldEmploy_B01ID NumCar_SPC_match \\\n",
+ "142954 2 1 1.0 \n",
+ "142955 0 3 1.0 \n",
+ "142956 0 1 0.0 \n",
+ "142957 0 4 2.0 \n",
+ "142958 0 3 1.0 \n",
+ "142959 0 3 1.0 \n",
+ "142960 0 5 2.0 \n",
+ "142961 2 1 1.0 \n",
+ "142962 0 6 2.0 \n",
+ "142963 2 2 1.0 \n",
+ "\n",
+ " tenure_nts_for_matching Settlement2011EW_B03ID \\\n",
+ "142954 1.0 1 \n",
+ "142955 1.0 1 \n",
+ "142956 2.0 1 \n",
+ "142957 2.0 1 \n",
+ "142958 1.0 1 \n",
+ "142959 2.0 1 \n",
+ "142960 1.0 1 \n",
+ "142961 1.0 1 \n",
+ "142962 1.0 1 \n",
+ "142963 1.0 1 \n",
+ "\n",
+ " Settlement2011EW_B04ID \n",
+ "142954 1 \n",
+ "142955 1 \n",
+ "142956 1 \n",
+ "142957 2 \n",
+ "142958 1 \n",
+ "142959 1 \n",
+ "142960 1 \n",
+ "142961 1 \n",
+ "142962 2 \n",
+ "142963 2 "
+ ]
+ },
+ "execution_count": 32,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "nts_matching = nts_households[[\n",
+ " 'HouseholdID','HHIncome2002_B02ID',\n",
+ " 'HHoldNumAdults', 'HHoldNumChildren', 'num_pension_age_nts',\n",
+ " 'HHoldEmploy_B01ID', 'NumCar_SPC_match',\n",
+ " 'tenure_nts_for_matching',\n",
+ " 'Settlement2011EW_B03ID', 'Settlement2011EW_B04ID']]\n",
+ "\n",
+ "nts_matching.head(10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Dictionary of matching columns. We extract column names from this dictionary when matching on a subset of the columns."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'household_id': ['hid', 'HouseholdID'],\n",
+ " 'yearly_income': ['salary_yearly_hh_cat', 'HHIncome2002_B02ID'],\n",
+ " 'number_adults': ['num_adults', 'HHoldNumAdults'],\n",
+ " 'number_children': ['num_children', 'HHoldNumChildren'],\n",
+ " 'num_pension_age': ['num_pension_age', 'num_pension_age_nts'],\n",
+ " 'employment_status': ['pwkstat_NTS_match', 'HHoldEmploy_B01ID'],\n",
+ " 'number_cars': ['num_cars', 'NumCar_SPC_match'],\n",
+ " 'tenure_status': ['tenure_spc_for_matching', 'tenure_nts_for_matching'],\n",
+ " 'rural_urban_2_categories': ['Settlement2011EW_B03ID_spc_CD',\n",
+ " 'Settlement2011EW_B03ID'],\n",
+ " 'rural_urban_4_categories': ['Settlement2011EW_B04ID_spc_CD',\n",
+ " 'Settlement2011EW_B04ID']}"
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# column_names (keys) for the dictionary\n",
+ "matching_ids = ['household_id', 'yearly_income', 'number_adults', 'number_children', 'num_pension_age',\n",
+ " 'employment_status', 'number_cars', 'tenure_status', 'rural_urban_2_categories', 'rural_urban_4_categories']\n",
+ "\n",
+ "# i want the value to be a list with spc_matching and nts_matching\n",
+ "matching_dfs_dict = {column_name: [spc_value, nts_value] for column_name, spc_value, nts_value in zip(matching_ids, spc_matching, nts_matching)}\n",
+ "matching_dfs_dict"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Attempt 1: Match on all possible columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "3203 households in the SPC had no match\n",
+ "14.9 % of households in the SPC had no match\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# columns for matching\n",
+ "keys = ['yearly_income', 'number_adults', 'number_children', 'num_pension_age',\n",
+ " 'employment_status', 'number_cars', 'tenure_status', 'rural_urban_2_categories']\n",
+ "\n",
+ "\n",
+ "spc_cols = [matching_dfs_dict[key][0] for key in keys]\n",
+ "nts_cols = [matching_dfs_dict[key][1] for key in keys]\n",
+ "\n",
+ "# match\n",
+ "spc_nts_1 = spc_matching.merge(nts_matching,\n",
+ " left_on= spc_cols,\n",
+ " right_on= nts_cols,\n",
+ " how = 'left')\n",
+ "\n",
+ "# Calculate how many rows from nts_matching are matched onto each hid in spc_matching,\n",
+ "spc_nts_1['count'] = spc_nts_1.groupby('hid')['HouseholdID'].transform('count')\n",
+ "\n",
+ "spc_nts_1_hist = spc_nts_1.drop_duplicates(subset='hid')\n",
+ "\n",
+ "\n",
+ "# plot a histogram of the counts and label the axis and title\n",
+ "plt.hist(spc_nts_1_hist['count'], bins=50)\n",
+ "plt.xlabel('Number of matches per household')\n",
+ "plt.ylabel('Number of households')\n",
+ "plt.title('Categorical Matching')\n",
+ "\n",
+ "print(spc_nts_1_hist[spc_nts_1_hist['count'] == 0].shape[0], \"households in the SPC had no match\")\n",
+ "print(round((spc_nts_1_hist[spc_nts_1_hist['count'] == 0].shape[0] / spc_matching['hid'].unique().shape[0]) * 100, 1), \"% of households in the SPC had no match\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Total | \n",
+ " Matched | \n",
+ " Percentage Matched | \n",
+ "
\n",
+ " \n",
+ " num_children | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 14302 | \n",
+ " 13760.0 | \n",
+ " 96.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 6345 | \n",
+ " 4017.0 | \n",
+ " 63.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 764 | \n",
+ " 505.0 | \n",
+ " 66.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 136 | \n",
+ " 74.0 | \n",
+ " 54.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 19 | \n",
+ " 10.0 | \n",
+ " 53.0 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 2 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Total Matched Percentage Matched\n",
+ "num_children \n",
+ "0 14302 13760.0 96.0\n",
+ "1 6345 4017.0 63.0\n",
+ "2 764 505.0 66.0\n",
+ "3 136 74.0 54.0\n",
+ "4 19 10.0 53.0\n",
+ "5 2 NaN NaN\n",
+ "6 1 NaN NaN"
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# calculate matching coverage for all columns\n",
+ "\n",
+ "match_coverage_1 = {key: match_coverage_col(data=spc_nts_1,\n",
+ " id_x='hid',\n",
+ " id_y='HouseholdID',\n",
+ " column=matching_dfs_dict[key][0])\n",
+ " for key in matching_dfs_dict\n",
+ " }\n",
+ "\n",
+ "# extract any df from the list\n",
+ "match_coverage_1['number_children']\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Attempt 2: Match on a subset of columns (exclude salary)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2138 households in the SPC had no match\n",
+ "9.9 % of households in the SPC had no match\n"
+ ]
+ },
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ "