Fix negative values in `final_df_census_hh`, ensuring the inputs are all valid (no negative values)

bobkatla · bobkatla · commit 15ebce3a91ad · 2024-08-23T14:30:49.000+10:00
diff --git a/PopSynthesis/Generator_data/generate_combine_census/meta_data_pp_manual.csv b/PopSynthesis/Generator_data/generate_combine_census/meta_data_pp_manual.csv
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:caae732b8c43b6d38d38440c3f400388cde84ef5643965b7861bc272c766f34e
-size 2808
+oid sha256:d83d34d37df375743e2a655a7c23d3eecf3e14d33a7b5b3292eb7d296698bc90
+size 2807
diff --git a/PopSynthesis/Generator_data/generate_combine_census/process_match_ipu.ipynb b/PopSynthesis/Generator_data/generate_combine_census/process_match_ipu.ipynb
@@ -11,20 +11,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
     "import pandas as pd\n",
     "import numpy as np\n",
     "from collections import defaultdict\n",
     "\n",
-    "from PopSynthesis.Generator_data.generate_combine_census.utils import *"
+    "from PopSynthesis.Generator_data.generate_combine_census.utils import * "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -35,7 +35,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -45,7 +45,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -58,7 +58,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -69,7 +69,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -79,7 +79,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -146,7 +146,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -156,7 +156,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -165,7 +165,7 @@
        "6464884"
       ]
      },
-     "execution_count": 10,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -181,7 +181,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -190,7 +190,7 @@
        "6450747.0"
       ]
      },
-     "execution_count": 11,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -201,21 +201,127 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 32,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "-1\n",
+      "('dwelltype', 'Flat or Apartment')\n",
+      "-2\n",
+      "('dwelltype', 'Separate House')\n",
+      "-3\n",
+      "('dwelltype', 'Separate House')\n",
+      "-3\n",
+      "('dwelltype', 'Separate House')\n"
+     ]
+    }
+   ],
    "source": [
-    "a = final_df_census_hh.astype(int) < 0\n",
+    "check_df = final_df_census_hh.astype(int) < 0\n",
     "dict_to_process = {}\n",
-    "for i, r in a.iterrows():\n",
+    "for i, r in check_df.iterrows():\n",
     "    if r.any():\n",
-    "        loc_cols = r[r].index\n",
+    "        print(f\"Issue in {i}\")\n",
+    "        loc_cols = r[r].index # Filter to only true\n",
     "        dict_to_process[i] = list(loc_cols)\n",
+    "        for att, state in list(loc_cols):\n",
+    "            # print(r.index.get_level_values(0) == att)\n",
+    "            all_related_state_idx = r[r.index.get_level_values(0) == att].index\n",
+    "            sub_check = final_df_census_hh.loc[i, all_related_state_idx]\n",
+    "            to_add_del_val = final_df_census_hh.loc[i, (att, state)] * -1\n",
+    "            assert to_add_del_val > 0 # confirm again\n",
+    "            assert sub_check.max() > to_add_del_val\n",
+    "            print(f\"Old value to fix {(att, state)}: {final_df_census_hh.loc[i, (att, state)]}\")\n",
+    "            print(f\"And delete in {sub_check.idxmax()}: {final_df_census_hh.loc[i, sub_check.idxmax()]}\")\n",
+    "            final_df_census_hh.loc[i, (att, state)] += to_add_del_val\n",
+    "            final_df_census_hh.loc[i, sub_check.idxmax()] -= to_add_del_val\n",
+    "            print(f\"New value in {(att, state)}: {final_df_census_hh.loc[i, (att, state)]}\")\n",
+    "            print(f\"New value in {sub_check.idxmax()}: {final_df_census_hh.loc[i, sub_check.idxmax()]}\")\n",
     "\n",
-    "for idx, ls_cols in dict_to_process.items():\n",
-    "    print(idx, ls_cols)"
+    "            \n",
+    "        # print(final_df_census_hh.loc[i, loc_cols])\n",
+    "\n",
+    "# for idx, ls_cols in dict_to_process.items():\n",
+    "#     print(idx, ls_cols)\n",
+    "\n"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "zone_id      None                 3942\n",
+       "sample_geog  None                    2\n",
+       "hhsize       1                     341\n",
+       "             2                     542\n",
+       "             3                     102\n",
+       "             4                     108\n",
+       "             5                      45\n",
+       "             6                       5\n",
+       "             7                       0\n",
+       "             8+                     55\n",
+       "dwelltype    Flat or Apartment      -3\n",
+       "             Missing                 0\n",
+       "             Other                  -3\n",
+       "             Separate House       1191\n",
+       "             Terrace/Townhouse      13\n",
+       "hhinc        1-149                  19\n",
+       "             1000-1249              99\n",
+       "             1250-1499              84\n",
+       "             150-299                25\n",
+       "             1500-1749              56\n",
+       "             1750-1999              64\n",
+       "             2000-2499             121\n",
+       "             2500-2999              69\n",
+       "             300-399                30\n",
+       "             3000-3499              78\n",
+       "             3500-3999              31\n",
+       "             400-499                65\n",
+       "             4000-4499              23\n",
+       "             4500-4999              71\n",
+       "             500-649                55\n",
+       "             5000-5999              54\n",
+       "             6000-7999              50\n",
+       "             650-799                83\n",
+       "             800-999                66\n",
+       "             8000+                  22\n",
+       "             Negative income         9\n",
+       "             Nil income             24\n",
+       "totalvehs    0                      34\n",
+       "             1                     416\n",
+       "             2                     550\n",
+       "             3                     141\n",
+       "             4+                     57\n",
+       "owndwell     Being Purchased       317\n",
+       "             Being Rented          152\n",
+       "             Fully Owned           694\n",
+       "             Something Else         35\n",
+       "Name: 3942, dtype: object"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "final_df_census_hh.loc[\"3942\", :]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": 13,
@@ -235,9 +341,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'final_df_census_hh' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "Input \u001b[1;32mIn [1]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m col \u001b[38;5;129;01min\u001b[39;00m \u001b[43mfinal_df_census_hh\u001b[49m\u001b[38;5;241m.\u001b[39mcolumns:\n\u001b[0;32m      2\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m col[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mzone_id\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msample_geog\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mVehicle\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[0;32m      3\u001b[0m         final_df_census_hh[col] \u001b[38;5;241m=\u001b[39m final_df_census_hh[col] \u001b[38;5;241m/\u001b[39m tot_hh_seri\n",
+      "\u001b[1;31mNameError\u001b[0m: name 'final_df_census_hh' is not defined"
+     ]
+    }
+   ],
    "source": [
     "for col in final_df_census_hh.columns:\n",
     "    if col[0] not in [\"zone_id\", \"sample_geog\", \"Vehicle\"]:\n",