Skip to content

Commit 15ebce3

Browse files
committed
fixing this, ensuring the inputs are all good (no neg)
1 parent 9f4bbd1 commit 15ebce3

File tree

2 files changed

+142
-24
lines changed

2 files changed

+142
-24
lines changed
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:caae732b8c43b6d38d38440c3f400388cde84ef5643965b7861bc272c766f34e
3-
size 2808
2+
oid sha256:d83d34d37df375743e2a655a7c23d3eecf3e14d33a7b5b3292eb7d296698bc90
3+
size 2807

PopSynthesis/Generator_data/generate_combine_census/process_match_ipu.ipynb

Lines changed: 140 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -11,20 +11,20 @@
1111
},
1212
{
1313
"cell_type": "code",
14-
"execution_count": 2,
14+
"execution_count": 1,
1515
"metadata": {},
1616
"outputs": [],
1717
"source": [
1818
"import pandas as pd\n",
1919
"import numpy as np\n",
2020
"from collections import defaultdict\n",
2121
"\n",
22-
"from PopSynthesis.Generator_data.generate_combine_census.utils import *"
22+
"from PopSynthesis.Generator_data.generate_combine_census.utils import * "
2323
]
2424
},
2525
{
2626
"cell_type": "code",
27-
"execution_count": 3,
27+
"execution_count": 2,
2828
"metadata": {},
2929
"outputs": [],
3030
"source": [
@@ -35,7 +35,7 @@
3535
},
3636
{
3737
"cell_type": "code",
38-
"execution_count": 4,
38+
"execution_count": 3,
3939
"metadata": {},
4040
"outputs": [],
4141
"source": [
@@ -45,7 +45,7 @@
4545
},
4646
{
4747
"cell_type": "code",
48-
"execution_count": 5,
48+
"execution_count": 4,
4949
"metadata": {},
5050
"outputs": [],
5151
"source": [
@@ -58,7 +58,7 @@
5858
},
5959
{
6060
"cell_type": "code",
61-
"execution_count": 6,
61+
"execution_count": 5,
6262
"metadata": {},
6363
"outputs": [],
6464
"source": [
@@ -69,7 +69,7 @@
6969
},
7070
{
7171
"cell_type": "code",
72-
"execution_count": 7,
72+
"execution_count": 6,
7373
"metadata": {},
7474
"outputs": [],
7575
"source": [
@@ -79,7 +79,7 @@
7979
},
8080
{
8181
"cell_type": "code",
82-
"execution_count": 8,
82+
"execution_count": 7,
8383
"metadata": {},
8484
"outputs": [],
8585
"source": [
@@ -146,7 +146,7 @@
146146
},
147147
{
148148
"cell_type": "code",
149-
"execution_count": 9,
149+
"execution_count": 8,
150150
"metadata": {},
151151
"outputs": [],
152152
"source": [
@@ -156,7 +156,7 @@
156156
},
157157
{
158158
"cell_type": "code",
159-
"execution_count": 10,
159+
"execution_count": 9,
160160
"metadata": {},
161161
"outputs": [
162162
{
@@ -165,7 +165,7 @@
165165
"6464884"
166166
]
167167
},
168-
"execution_count": 10,
168+
"execution_count": 9,
169169
"metadata": {},
170170
"output_type": "execute_result"
171171
}
@@ -181,7 +181,7 @@
181181
},
182182
{
183183
"cell_type": "code",
184-
"execution_count": 11,
184+
"execution_count": 10,
185185
"metadata": {},
186186
"outputs": [
187187
{
@@ -190,7 +190,7 @@
190190
"6450747.0"
191191
]
192192
},
193-
"execution_count": 11,
193+
"execution_count": 10,
194194
"metadata": {},
195195
"output_type": "execute_result"
196196
}
@@ -201,21 +201,127 @@
201201
},
202202
{
203203
"cell_type": "code",
204-
"execution_count": 12,
204+
"execution_count": 32,
205205
"metadata": {},
206-
"outputs": [],
206+
"outputs": [
207+
{
208+
"name": "stdout",
209+
"output_type": "stream",
210+
"text": [
211+
"-1\n",
212+
"('dwelltype', 'Flat or Apartment')\n",
213+
"-2\n",
214+
"('dwelltype', 'Separate House')\n",
215+
"-3\n",
216+
"('dwelltype', 'Separate House')\n",
217+
"-3\n",
218+
"('dwelltype', 'Separate House')\n"
219+
]
220+
}
221+
],
207222
"source": [
208-
"a = final_df_census_hh.astype(int) < 0\n",
223+
"check_df = final_df_census_hh.astype(int) < 0\n",
209224
"dict_to_process = {}\n",
210-
"for i, r in a.iterrows():\n",
225+
"for i, r in check_df.iterrows():\n",
211226
" if r.any():\n",
212-
" loc_cols = r[r].index\n",
227+
" print(f\"Issue in {i}\")\n",
228+
" loc_cols = r[r].index # Filter to only true\n",
213229
" dict_to_process[i] = list(loc_cols)\n",
230+
" for att, state in list(loc_cols):\n",
231+
" # print(r.index.get_level_values(0) == att)\n",
232+
" all_related_state_idx = r[r.index.get_level_values(0) == att].index\n",
233+
" sub_check = final_df_census_hh.loc[i, all_related_state_idx]\n",
234+
" to_add_del_val = final_df_census_hh.loc[i, (att, state)] * -1\n",
235+
" assert to_add_del_val > 0 # confirm again\n",
236+
" assert sub_check.max() > to_add_del_val\n",
237+
" print(f\"Old value to fix {(att, state)}: {final_df_census_hh.loc[i, (att, state)]}\")\n",
238+
" print(f\"And delete in {sub_check.idxmax()}: {final_df_census_hh.loc[i, sub_check.idxmax()]}\")\n",
239+
" final_df_census_hh.loc[i, (att, state)] += to_add_del_val\n",
240+
" final_df_census_hh.loc[i, sub_check.idxmax()] -= to_add_del_val\n",
241+
" print(f\"New value in {(att, state)}: {final_df_census_hh.loc[i, (att, state)]}\")\n",
242+
" print(f\"New value in {sub_check.idxmax()}: {final_df_census_hh.loc[i, sub_check.idxmax()]}\")\n",
214243
"\n",
215-
"for idx, ls_cols in dict_to_process.items():\n",
216-
" print(idx, ls_cols)"
244+
" \n",
245+
" # print(final_df_census_hh.loc[i, loc_cols])\n",
246+
"\n",
247+
"# for idx, ls_cols in dict_to_process.items():\n",
248+
"# print(idx, ls_cols)\n",
249+
"\n"
217250
]
218251
},
252+
{
253+
"cell_type": "code",
254+
"execution_count": 18,
255+
"metadata": {},
256+
"outputs": [
257+
{
258+
"data": {
259+
"text/plain": [
260+
"zone_id None 3942\n",
261+
"sample_geog None 2\n",
262+
"hhsize 1 341\n",
263+
" 2 542\n",
264+
" 3 102\n",
265+
" 4 108\n",
266+
" 5 45\n",
267+
" 6 5\n",
268+
" 7 0\n",
269+
" 8+ 55\n",
270+
"dwelltype Flat or Apartment -3\n",
271+
" Missing 0\n",
272+
" Other -3\n",
273+
" Separate House 1191\n",
274+
" Terrace/Townhouse 13\n",
275+
"hhinc 1-149 19\n",
276+
" 1000-1249 99\n",
277+
" 1250-1499 84\n",
278+
" 150-299 25\n",
279+
" 1500-1749 56\n",
280+
" 1750-1999 64\n",
281+
" 2000-2499 121\n",
282+
" 2500-2999 69\n",
283+
" 300-399 30\n",
284+
" 3000-3499 78\n",
285+
" 3500-3999 31\n",
286+
" 400-499 65\n",
287+
" 4000-4499 23\n",
288+
" 4500-4999 71\n",
289+
" 500-649 55\n",
290+
" 5000-5999 54\n",
291+
" 6000-7999 50\n",
292+
" 650-799 83\n",
293+
" 800-999 66\n",
294+
" 8000+ 22\n",
295+
" Negative income 9\n",
296+
" Nil income 24\n",
297+
"totalvehs 0 34\n",
298+
" 1 416\n",
299+
" 2 550\n",
300+
" 3 141\n",
301+
" 4+ 57\n",
302+
"owndwell Being Purchased 317\n",
303+
" Being Rented 152\n",
304+
" Fully Owned 694\n",
305+
" Something Else 35\n",
306+
"Name: 3942, dtype: object"
307+
]
308+
},
309+
"execution_count": 18,
310+
"metadata": {},
311+
"output_type": "execute_result"
312+
}
313+
],
314+
"source": [
315+
"final_df_census_hh.loc[\"3942\", :]"
316+
]
317+
},
318+
{
319+
"cell_type": "code",
320+
"execution_count": null,
321+
"metadata": {},
322+
"outputs": [],
323+
"source": []
324+
},
219325
{
220326
"cell_type": "code",
221327
"execution_count": 13,
@@ -235,9 +341,21 @@
235341
},
236342
{
237343
"cell_type": "code",
238-
"execution_count": 40,
344+
"execution_count": 1,
239345
"metadata": {},
240-
"outputs": [],
346+
"outputs": [
347+
{
348+
"ename": "NameError",
349+
"evalue": "name 'final_df_census_hh' is not defined",
350+
"output_type": "error",
351+
"traceback": [
352+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
353+
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
354+
"Input \u001b[1;32mIn [1]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m col \u001b[38;5;129;01min\u001b[39;00m \u001b[43mfinal_df_census_hh\u001b[49m\u001b[38;5;241m.\u001b[39mcolumns:\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m col[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mzone_id\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msample_geog\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mVehicle\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[0;32m 3\u001b[0m final_df_census_hh[col] \u001b[38;5;241m=\u001b[39m final_df_census_hh[col] \u001b[38;5;241m/\u001b[39m tot_hh_seri\n",
355+
"\u001b[1;31mNameError\u001b[0m: name 'final_df_census_hh' is not defined"
356+
]
357+
}
358+
],
241359
"source": [
242360
"for col in final_df_census_hh.columns:\n",
243361
" if col[0] not in [\"zone_id\", \"sample_geog\", \"Vehicle\"]:\n",

0 commit comments

Comments
 (0)