From ed937595294b02a5d5e93e639719a67bf5e41b9d Mon Sep 17 00:00:00 2001
From: Duc Minh La
Date: Tue, 15 Oct 2024 18:05:41 +1100
Subject: [PATCH] Apply black formatting and clean up for better logging

---
 PopSynthesis/Methods/IPSF/CSP/CSP.py          |  3 +-
 .../IPSF/CSP/operations/convert_seeds.py      | 58 ++++++++++++++-----
 .../Methods/IPSF/CSP/operations/rela_syn.py   |  2 +-
 PopSynthesis/Methods/IPSF/CSP/run.py          |  9 ++-
 PopSynthesis/Methods/IPSF/SAA/SAA.py          | 10 +++-
 .../Methods/IPSF/SAA/operations/general.py    |  1 +
 .../IPSF/SAA/operations/zone_adjustment.py    |  8 ++-
 PopSynthesis/Methods/IPSF/SAA/run_hh.py       | 41 +++++++------
 .../IPSF/utils/synthetic_checked_census.py    | 21 +++++--
 9 files changed, 106 insertions(+), 47 deletions(-)

diff --git a/PopSynthesis/Methods/IPSF/CSP/CSP.py b/PopSynthesis/Methods/IPSF/CSP/CSP.py
index a6a4ae8..634b669 100644
--- a/PopSynthesis/Methods/IPSF/CSP/CSP.py
+++ b/PopSynthesis/Methods/IPSF/CSP/CSP.py
@@ -10,9 +10,10 @@
 We will add the loop here as well (the longest is the first SAA, which runs separately)
 """
 
+
 class CSP:
     def __init__(self) -> None:
         NotImplemented
 
     def run():
-        NotImplemented
\ No newline at end of file
+        NotImplemented
diff --git a/PopSynthesis/Methods/IPSF/CSP/operations/convert_seeds.py b/PopSynthesis/Methods/IPSF/CSP/operations/convert_seeds.py
index a4140fc..0b917bb 100644
--- a/PopSynthesis/Methods/IPSF/CSP/operations/convert_seeds.py
+++ b/PopSynthesis/Methods/IPSF/CSP/operations/convert_seeds.py
@@ -8,33 +8,54 @@
 import pandas as pd
 from typing import Dict, List
 
-def convert_seeds_to_pairs(hh_seed: pd.DataFrame, pp_seed: pd.DataFrame, id_col:str, pp_segment_col: str, main_state: str) -> Dict[str, pd.DataFrame]:
+
+def convert_seeds_to_pairs(
+    hh_seed: pd.DataFrame,
+    pp_seed: pd.DataFrame,
+    id_col: str,
+    pp_segment_col: str,
+    main_state: str,
+) -> Dict[str, pd.DataFrame]:
     pp_states = pp_seed[pp_segment_col].unique()
     assert main_state in pp_states
     assert id_col in hh_seed.columns
     assert id_col in pp_seed.columns
-    
+
     hh_seed[id_col] = hh_seed[id_col].astype(str)
     pp_seed[id_col] = pp_seed[id_col].astype(str)
-    hh_name = "HH" # simply for naming convention
+    hh_name = "HH"  # simply for naming convention
     hh_seed = add_pp_seg_count(hh_seed, pp_seed, pp_segment_col, id_col)
     segmented_pp = segment_pp_seed(pp_seed, pp_segment_col)
     assert len(segmented_pp[main_state]) == len(hh_seed)
-    
+
     # pair up HH - Main first
-    result_pairs = {f"{hh_name}-{main_state}": pair_by_id(hh_seed, segmented_pp[main_state], id_col, hh_name, main_state)}
+    result_pairs = {
+        f"{hh_name}-{main_state}": pair_by_id(
+            hh_seed, segmented_pp[main_state], id_col, hh_name, main_state
+        )
+    }
     assert len(result_pairs[f"{hh_name}-{main_state}"]) == len(hh_seed)
     for pp_state in pp_states:
         if pp_state != main_state:
-            result_pairs[f"{main_state}-{pp_state}"] = pair_by_id(segmented_pp[main_state], segmented_pp[pp_state], id_col, main_state, pp_state)
+            result_pairs[f"{main_state}-{pp_state}"] = pair_by_id(
+                segmented_pp[main_state],
+                segmented_pp[pp_state],
+                id_col,
+                main_state,
+                pp_state,
+            )
     return result_pairs
 
 
-def pair_by_id(df1: pd.DataFrame, df2: pd.DataFrame, id: str, name1:str="x", name2:str="y") -> pd.DataFrame:
+def pair_by_id(
+    df1: pd.DataFrame, df2: pd.DataFrame, id: str, name1: str = "x", name2: str = "y"
+) -> pd.DataFrame:
     # inner join on the id col (so only matched records are accepted)
     # the id will likely not be unique in df2, as a normal rela can appear multiple times in 1 hh
-    join_result = df1.merge(df2, on=id, how="inner", suffixes=[f"_{name1}", f"_{name2}"])
+    join_result = df1.merge(
+        df2, on=id, how="inner", suffixes=[f"_{name1}", f"_{name2}"]
+    )
     assert len(join_result) == min(len(df1), len(df2))
     return join_result
 
@@ -42,18 +63,23 @@ def pair_by_id(df1: pd.DataFrame, df2: pd.DataFrame, id: str, name1:str="x", nam
 def segment_pp_seed(pp_seed: pd.DataFrame, segment_col: str) -> Dict[str, pd.DataFrame]:
     result_seg_pp = {}
     for state in pp_seed[segment_col].unique():
-        result_seg_pp[state] = pp_seed[pp_seed[segment_col]==state]
+        result_seg_pp[state] = pp_seed[pp_seed[segment_col] == state]
     return result_seg_pp
 
 
-def pair_states_dict(states1: Dict[str, List[str]], states2: Dict[str, List[str]], name1:str="x", name2:str="y") -> Dict[str, List[str]]:
+def pair_states_dict(
+    states1: Dict[str, List[str]],
+    states2: Dict[str, List[str]],
+    name1: str = "x",
+    name2: str = "y",
+) -> Dict[str, List[str]]:
     # This is to create the states list for pool creation using BN (also as ref if needed)
     states_in_1 = set(states1.keys())
     states_in_2 = set(states2.keys())
     states_unique_1 = states_in_1 - states_in_2
     states_unique_2 = states_in_2 - states_in_1
     states_common = states_in_1 & states_in_2
-    
+
     results = {s: states1[s] for s in states_unique_1}
     for s in states_unique_2:
         results[s] = states2[s]
@@ -63,12 +89,18 @@ def pair_states_dict(states1: Dict[str, List[str]], states2: Dict[str, List[str]
     return results
 
 
-def add_pp_seg_count(hh_seed: pd.DataFrame, pp_seed: pd.DataFrame, segment_col: str, id_col: str) -> pd.DataFrame:
+def add_pp_seg_count(
+    hh_seed: pd.DataFrame, pp_seed: pd.DataFrame, segment_col: str, id_col: str
+) -> pd.DataFrame:
     """Add the count for each segment (e.g. relationship) into hh_seed for CSP"""
     possible_seg_states = list(pp_seed[segment_col].unique())
     filtered_pp_seed = pp_seed.groupby(id_col)[segment_col].apply(lambda x: list(x))
+
     def process_seg_count(r):
         seg_count = filtered_pp_seed[r[id_col]]
         return [seg_count.count(x) for x in possible_seg_states]
-    hh_seed[possible_seg_states] = hh_seed.apply(process_seg_count, axis=1, result_type="expand")
+
+    hh_seed[possible_seg_states] = hh_seed.apply(
+        process_seg_count, axis=1, result_type="expand"
+    )
     return hh_seed
diff --git a/PopSynthesis/Methods/IPSF/CSP/operations/rela_syn.py b/PopSynthesis/Methods/IPSF/CSP/operations/rela_syn.py
index 10e946f..d2e1ef9 100644
--- a/PopSynthesis/Methods/IPSF/CSP/operations/rela_syn.py
+++ b/PopSynthesis/Methods/IPSF/CSP/operations/rela_syn.py
@@ -6,4 +6,4 @@
 Also a wrapper to take in all pools and the original (only HH)
 Then we will combine all again and output the kept and removed
 This can go through SAA again (with the help of the loop check census)
-"""
\ No newline at end of file
+"""
diff --git a/PopSynthesis/Methods/IPSF/CSP/run.py b/PopSynthesis/Methods/IPSF/CSP/run.py
index e6c0955..39ec3d1 100644
--- a/PopSynthesis/Methods/IPSF/CSP/run.py
+++ b/PopSynthesis/Methods/IPSF/CSP/run.py
@@ -4,7 +4,10 @@
 import pandas as pd
 import pickle
 from PopSynthesis.Methods.IPSF.const import data_dir, POOL_SIZE, processed_dir
-from PopSynthesis.Methods.IPSF.CSP.operations.convert_seeds import convert_seeds_to_pairs, pair_states_dict
+from PopSynthesis.Methods.IPSF.CSP.operations.convert_seeds import (
+    convert_seeds_to_pairs,
+    pair_states_dict,
+)
 from PopSynthesis.Methods.IPSF.utils.pool_utils import create_pool
 
 
@@ -14,7 +17,7 @@ def main():
     with open(processed_dir / "dict_pool_pairs.pickle", "rb") as handle:
         pools_ref = pickle.load(handle)
     print(pools_ref)
-    
+
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/PopSynthesis/Methods/IPSF/SAA/SAA.py b/PopSynthesis/Methods/IPSF/SAA/SAA.py
index 8b1040e..c0efea7 100644
--- a/PopSynthesis/Methods/IPSF/SAA/SAA.py
+++ b/PopSynthesis/Methods/IPSF/SAA/SAA.py
@@ -29,10 +29,12 @@ def __init__(
         self.init_required_inputs(marginal_raw)
 
     def init_required_inputs(self, marginal_raw: pd.DataFrame):
-        converted_segment_marg = process_raw_ipu_marg(marginal_raw, atts=self.considered_atts)
+        converted_segment_marg = process_raw_ipu_marg(
+            marginal_raw, atts=self.considered_atts
+        )
         self.segmented_marg = converted_segment_marg
 
-    def run(self, output_each_step:bool =True, extra_name:str="") -> pd.DataFrame:
+    def run(self, output_each_step: bool = True, extra_name: str = "") -> pd.DataFrame:
         # Produce the synthetic population; this is the main entry point
         curr_syn_pop = None
         adjusted_atts = []
@@ -43,5 +45,7 @@
             )
             adjusted_atts.append(att)
             if output_each_step:
-                curr_syn_pop.to_csv(output_dir / f"syn_pop_adjusted_{att}{extra_name}.csv")
+                curr_syn_pop.to_csv(
+                    output_dir / f"syn_pop_adjusted_{att}{extra_name}.csv"
+                )
         return curr_syn_pop
diff --git a/PopSynthesis/Methods/IPSF/SAA/operations/general.py b/PopSynthesis/Methods/IPSF/SAA/operations/general.py
index bd26d97..c80557d 100644
--- a/PopSynthesis/Methods/IPSF/SAA/operations/general.py
+++ b/PopSynthesis/Methods/IPSF/SAA/operations/general.py
@@ -11,6 +11,7 @@
 )
 from PopSynthesis.Methods.IPSF.SAA.operations.zone_adjustment import zone_adjustment
 
+
 def process_raw_ipu_marg(marg: pd.DataFrame, atts: List[str]) -> pd.DataFrame:
     segmented_marg = {}
     zones = marg[marg.columns[marg.columns.get_level_values(0) == zone_field]].values
diff --git a/PopSynthesis/Methods/IPSF/SAA/operations/zone_adjustment.py b/PopSynthesis/Methods/IPSF/SAA/operations/zone_adjustment.py
index 5665ead..8705756 100644
--- a/PopSynthesis/Methods/IPSF/SAA/operations/zone_adjustment.py
+++ b/PopSynthesis/Methods/IPSF/SAA/operations/zone_adjustment.py
@@ -124,7 +124,9 @@ def zone_adjustment(
 
     neg_states = diff_census[diff_census < 0].index.tolist()
     pos_states = diff_census[diff_census > 0].index.tolist()
-    zeros_states = diff_census[diff_census == 0].index.tolist() # Only for later processing
+    zeros_states = diff_census[
+        diff_census == 0
+    ].index.tolist()  # Only for later processing
 
     pairs_adjust = list(itertools.product(neg_states, pos_states))
     random.shuffle(pairs_adjust)
@@ -215,6 +217,8 @@
         )
         final_resulted_syn[zone_field] = zone
         if len(final_resulted_syn) != ori_num_syn:
-            raise ValueError(f"Error processing at zone {zone}: expected {ori_num_syn} records, got {len(final_resulted_syn)}")
+            raise ValueError(
+                f"Error processing at zone {zone}: expected {ori_num_syn} records, got {len(final_resulted_syn)}"
+            )
         print(f"Finished zone {zone}")
     return final_resulted_syn
diff --git a/PopSynthesis/Methods/IPSF/SAA/run_hh.py b/PopSynthesis/Methods/IPSF/SAA/run_hh.py
index f38fa9d..0d8b9d3 100644
--- a/PopSynthesis/Methods/IPSF/SAA/run_hh.py
+++ b/PopSynthesis/Methods/IPSF/SAA/run_hh.py
@@ -2,14 +2,17 @@
 
 import pandas as pd
-import numpy as np
 from PopSynthesis.Methods.IPSF.const import (
     data_dir,
     output_dir,
     processed_dir,
     zone_field,
 )
-from PopSynthesis.Methods.IPSF.utils.synthetic_checked_census import adjust_kept_rec_match_census, get_diff_marg, convert_full_to_marg_count
+from PopSynthesis.Methods.IPSF.utils.synthetic_checked_census import (
+    adjust_kept_rec_match_census,
+    get_diff_marg,
+    convert_full_to_marg_count,
+)
 from PopSynthesis.Methods.IPSF.SAA.SAA import SAA
 import random
 import time
@@ -17,53 +20,56 @@
 
 def run_main() -> None:
     hh_marg = pd.read_csv(data_dir / "hh_marginals_ipu.csv", header=[0, 1])
-    hh_marg = hh_marg.drop(columns=hh_marg.columns[hh_marg.columns.get_level_values(0)=="sample_geog"][0])
-    
+    hh_marg = hh_marg.drop(
+        columns=hh_marg.columns[hh_marg.columns.get_level_values(0) == "sample_geog"][0]
+    )
+
     order_adjustment = [
         "hhsize",
         "hhinc",
         "totalvehs",
         "dwelltype",
         "owndwell",
-    ]
+    ]  # these must exist in both marg and syn
     considered_atts = [
         "hhsize",
         "hhinc",
         "totalvehs",
         "dwelltype",
         "owndwell",
-    ]
+    ]  # exist in final syn
 
-    hh_marg = hh_marg.head(2)
-    pool = pd.read_csv(processed_dir / "HH_pool_small_test.csv")
+    pool = pd.read_csv(processed_dir / "HH_pool.csv")
 
     start_time = time.time()
     n_run_time = 0
-    n_removed_err_hh = np.inf
+    # init with the total number of HHs we want
+    n_removed_err_hh = hh_marg.sum().sum() / len(order_adjustment)
     MAX_RUN_TIME = 30
     chosen_hhs = []
     err_rm_hh = []
     while n_run_time < MAX_RUN_TIME and n_removed_err_hh > 0:
         # randomly shuffle for each adjustment
         random.shuffle(order_adjustment)
-        print(f"For run {n_run_time}, order is: {order_adjustment}, aim for {n_removed_err_hh} HHs")
+        err_rm_hh.append(n_removed_err_hh)
+        print(
+            f"For run {n_run_time}, order is: {order_adjustment}, aim for {n_removed_err_hh} HHs"
+        )
         saa = SAA(hh_marg, considered_atts, order_adjustment, pool)
         ###
         final_syn_pop = saa.run(extra_name=f"_{n_run_time}")
         ###
         # error check
-        marg_from_kept_hh = convert_full_to_marg_count(
-            final_syn_pop, [zone_field]
+        marg_from_kept_hh = convert_full_to_marg_count(final_syn_pop, [zone_field])
+        converted_hh_marg = hh_marg.set_index(
+            hh_marg.columns[hh_marg.columns.get_level_values(0) == zone_field][0]
         )
-        converted_hh_marg = hh_marg.set_index(hh_marg.columns[hh_marg.columns.get_level_values(0)==zone_field][0])
         diff_marg = get_diff_marg(converted_hh_marg, marg_from_kept_hh)
-        
+
         kept_hh = adjust_kept_rec_match_census(final_syn_pop, diff_marg)
         # re-check the marginals from the kept households
-        kept_marg = convert_full_to_marg_count(
-            kept_hh, [zone_field]
-        )
+        kept_marg = convert_full_to_marg_count(kept_hh, [zone_field])
         new_diff_marg = get_diff_marg(converted_hh_marg, kept_marg)
         # verify there are indeed no negative cells left
         checking_not_neg = new_diff_marg < 0
@@ -75,7 +81,6 @@ def run_main() -> None:
 
         n_run_time += 1
         n_removed_err_hh = len(final_syn_pop) - len(kept_hh)
-        err_rm_hh.append(n_removed_err_hh)
         if n_run_time == MAX_RUN_TIME:
             # not adjusting anymore
             chosen_hhs.append(final_syn_pop)
diff --git a/PopSynthesis/Methods/IPSF/utils/synthetic_checked_census.py b/PopSynthesis/Methods/IPSF/utils/synthetic_checked_census.py
index 4fe73ca..7057c0d 100644
--- a/PopSynthesis/Methods/IPSF/utils/synthetic_checked_census.py
+++ b/PopSynthesis/Methods/IPSF/utils/synthetic_checked_census.py
@@ -15,6 +15,7 @@
 from PopSynthesis.Methods.IPSF.const import count_field, zone_field
 from typing import List, Literal, Tuple, Dict
 
+
 def segment_df(df: pd.DataFrame, chunk_sz: int) -> List[pd.DataFrame]:
     start = 0
     ls_df = []
@@ -25,7 +26,9 @@ def segment_df(df: pd.DataFrame, chunk_sz: int) -> List[pd.DataFrame]:
     return ls_df
 
 
-def convert_count_to_full(count_df: pd.DataFrame, count_col: str = count_field) -> pd.DataFrame:
+def convert_count_to_full(
+    count_df: pd.DataFrame, count_col: str = count_field
+) -> pd.DataFrame:
     assert count_col in count_df.columns
     repeated_idx = list(count_df.index.repeat(count_df[count_col]))
     fin = count_df.loc[repeated_idx]
@@ -35,10 +38,10 @@ def convert_count_to_full(count_df: pd.DataFrame, count_col: str = count_field)
 
 
 def convert_full_to_marg_count(
-    full_pop: pd.DataFrame, filter_ls: list[str]=[]
+    full_pop: pd.DataFrame, filter_ls: list[str] = []
 ) -> pd.DataFrame:
     assert zone_field in full_pop.columns
-    cols = [x for x in full_pop.columns if x not in filter_ls+[zone_field]]
+    cols = [x for x in full_pop.columns if x not in filter_ls + [zone_field]]
     ls_temp_hold = []
     for att in cols:
         full_pop[att] = full_pop[att].astype(str)
@@ -59,7 +62,9 @@ def convert_full_to_marg_count(
     return new_marg_hh
 
 
-def add_0_to_missing(df: pd.DataFrame, ls_missing: List[str], axis: Literal[0, 1]) -> pd.DataFrame:
+def add_0_to_missing(
+    df: pd.DataFrame, ls_missing: List[str], axis: Literal[0, 1]
+) -> pd.DataFrame:
     for missing in ls_missing:
         if axis == 1:  # by row
             df.loc[missing] = 0
@@ -68,7 +73,9 @@ def add_0_to_missing(df: pd.DataFrame, ls_missing: List[str], axis: Literal[0, 1
     return df
 
 
-def get_diff_marg(converted_census_marg: pd.DataFrame, converted_new_hh_marg: pd.DataFrame) -> pd.DataFrame:
+def get_diff_marg(
+    converted_census_marg: pd.DataFrame, converted_new_hh_marg: pd.DataFrame
+) -> pd.DataFrame:
     print("getting the diff marg df")
     converted_census_marg.index = converted_census_marg.index.astype(str)
     converted_new_hh_marg.index = converted_new_hh_marg.index.astype(str)
@@ -104,7 +111,9 @@ def convert_to_dict_ls(tup: Tuple[Tuple[str, str]]) -> Dict[str, str]:
     return di
 
 
-def adjust_kept_rec_match_census(syn_records: pd.DataFrame, diff_census: pd.DataFrame) -> pd.DataFrame:
+def adjust_kept_rec_match_census(
+    syn_records: pd.DataFrame, diff_census: pd.DataFrame
+) -> pd.DataFrame:
     # The point is to remove the chosen records that exceed the census counts
     syn_records = syn_records.astype(str)
     count_kept = syn_records.value_counts()
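
Reviewer note: much of the reformatting above centres on pair_by_id, which pairs the
household seed with a segment of the person seed via an inner merge on the shared id.
Below is a minimal, self-contained sketch of that idea; the ids, the relationship
labels, and the weight column are illustrative assumptions, not values from the
repository.

    import pandas as pd

    # Toy seeds (illustrative only): two frames sharing an id column.
    hh_seed = pd.DataFrame(
        {"hhid": ["1", "2", "3"], "hhsize": [2, 1, 3], "weight": [1.0, 1.0, 2.0]}
    )
    pp_seed = pd.DataFrame(
        {
            "hhid": ["1", "1", "2", "3", "3"],
            "relationship": ["Main", "Spouse", "Main", "Main", "Child"],
            "weight": [1.0, 1.0, 1.0, 2.0, 2.0],
        }
    )

    # Same shape as pair_by_id: inner merge on the shared id, so only matched
    # records are kept; suffixes disambiguate the overlapping "weight" column.
    main_pp = pp_seed[pp_seed["relationship"] == "Main"]
    pairs = hh_seed.merge(main_pp, on="hhid", how="inner", suffixes=["_HH", "_Main"])
    assert len(pairs) == min(len(hh_seed), len(main_pp))
    print(pairs[["hhid", "hhsize", "weight_HH", "weight_Main"]])

The min(len(df1), len(df2)) assert holds here because each household has exactly one
Main person; when ids repeat on one side, the merge size equals the number of matched
rows on that side, which need not equal the min, so the assert is a guard for the
one-to-at-most-one case rather than a general invariant.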