diff --git a/PopSynthesis/DataProcessor/DataProcessor.py b/PopSynthesis/DataProcessor/DataProcessor.py index 277c66d..7e64c34 100644 --- a/PopSynthesis/DataProcessor/DataProcessor.py +++ b/PopSynthesis/DataProcessor/DataProcessor.py @@ -4,25 +4,35 @@ from pathlib import Path from os import PathLike from typing import Union -from PopSynthesis.DataProcessor.utils.const_files import hh_seed_file, pp_seed_file +from PopSynthesis.DataProcessor.utils.const_files import ( + hh_seed_file, + pp_seed_file, + raw_data_dir, + processed_data_dir, + output_dir, +) from PopSynthesis.DataProcessor.utils.general_utils import find_file + class DataProcessorGeneric: - def __init__(self, raw_data_src:PathLike[Union[Path, str]], mid_processed_src: PathLike[Union[Path, str]], output_data_src: PathLike[Union[Path, str]]) -> None: + def __init__( + self, + raw_data_src: PathLike[Union[Path, str]], + mid_processed_src: PathLike[Union[Path, str]], + output_data_src: PathLike[Union[Path, str]], + ) -> None: self.raw_data_path = Path(raw_data_src) self.mid_process_path = Path(mid_processed_src) self.output_data_path = Path(output_data_src) def process_all_seed(self): NotImplemented - - + def process_households_seed(self): # Import the hh seed data hh_file = find_file(base_path=self.raw_data_path, filename=hh_seed_file) print(hh_file) - def process_persons_seed(self): NotImplemented @@ -34,4 +44,7 @@ def process_households_census(self): def process_persons_census(self): NotImplemented - \ No newline at end of file + + +if __name__ == "__main__": + a = DataProcessorGeneric() diff --git a/PopSynthesis/DataProcessor/utils/const_files.py b/PopSynthesis/DataProcessor/utils/const_files.py index 0e8e2ce..89287ce 100644 --- a/PopSynthesis/DataProcessor/utils/const_files.py +++ b/PopSynthesis/DataProcessor/utils/const_files.py @@ -1,4 +1,12 @@ hh_seed_file = "H_VISTA_1220_SA1.csv" pp_seed_file = "P_VISTA_1220_SA1.csv" seed_loc = "VISA" -census_loc = "CENSUSS" \ No newline at end of file +census_loc = "CENSUSS" + +raw_data_dir = ( + r"C:\Users\dlaa0001\Documents\PhD\PopSyn_Monash\PopSynthesis\DataProcessor\data" +) +processed_data_dir = r"C:\Users\dlaa0001\Documents\PhD\PopSyn_Monash\PopSynthesis\DataProcessor\processed_data" +output_dir = ( + r"C:\Users\dlaa0001\Documents\PhD\PopSyn_Monash\PopSynthesis\DataProcessor\output" +) diff --git a/PopSynthesis/DataProcessor/utils/const_process.py b/PopSynthesis/DataProcessor/utils/const_process.py index ecb1ebf..828f4fe 100644 --- a/PopSynthesis/DataProcessor/utils/const_process.py +++ b/PopSynthesis/DataProcessor/utils/const_process.py @@ -8,14 +8,14 @@ ct = str(ct).replace(".", "-").replace(":", "-").replace(" ", "-") # create logger -logging.basicConfig(format='%(asctime)s - %(levelname)s: %(message)s') -logger = logging.getLogger('process inputs data') +logging.basicConfig(format="%(asctime)s - %(levelname)s: %(message)s") +logger = logging.getLogger("process inputs data") logger.setLevel(logging.DEBUG) # create file handler which logs even debug messages -fh = logging.FileHandler(os.path.join(log_dir, f'process_data_{ct}.log')) +fh = logging.FileHandler(os.path.join(log_dir, f"process_data_{ct}.log")) fh.setLevel(logging.DEBUG) # create formatter and add it to the handlers -formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') +formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") fh.setFormatter(formatter) # add the handlers to the logger logger.addHandler(fh) @@ -43,14 +43,7 @@ "$8,000 or more ($416,000 or more)", ] -HH_ATTS = [ - "hhid", - "dwelltype", - "owndwell", - "hhinc", - "totalvehs", - "hhsize" -] +HH_ATTS = ["hhid", "dwelltype", "owndwell", "hhinc", "totalvehs", "hhsize"] PP_ATTS = [ "persid", @@ -60,10 +53,15 @@ "relationship", "persinc", "nolicence", - "anywork" + "anywork", ] -LS_GR_RELA = ["Self", "Spouse", "Child", "Grandchild"] # For the rest we will make them Others +LS_GR_RELA = [ + "Self", + "Spouse", + "Child", + "Grandchild", +] # For the rest we will make them Others HANDLE_THE_REST_RELA = "Others" ALL_RELA = LS_GR_RELA + [HANDLE_THE_REST_RELA] NOT_INCLUDED_IN_BN_LEARN = ["hhid", "persid", "relationship"] diff --git a/PopSynthesis/DataProcessor/utils/general_utils.py b/PopSynthesis/DataProcessor/utils/general_utils.py index d40edb5..00a126d 100644 --- a/PopSynthesis/DataProcessor/utils/general_utils.py +++ b/PopSynthesis/DataProcessor/utils/general_utils.py @@ -6,4 +6,4 @@ def find_file(base_path, filename): for file in base_path.rglob(filename): if file.is_file(): return file - return None \ No newline at end of file + return None diff --git a/PopSynthesis/DataProcessor/utils/seed/add_weghts.py b/PopSynthesis/DataProcessor/utils/seed/add_weghts.py index 3d215cc..0b681b4 100644 --- a/PopSynthesis/DataProcessor/utils/seed/add_weghts.py +++ b/PopSynthesis/DataProcessor/utils/seed/add_weghts.py @@ -1,14 +1,20 @@ import polars as pl + def get_weights_dict(hh_df_w: pl.DataFrame, pp_df_w: pl.DataFrame): re_dict = {} # Process HH weights - hh_df_w["_weight"] = hh_df_w["wdhhwgt_sa3"].fillna(0) + hh_df_w["wehhwgt_sa3"].fillna(0) - pp_df_w["_weight"] = pp_df_w["wdperswgt_sa3"].fillna(0) + pp_df_w["weperswgt_sa3"].fillna(0) + hh_df_w["_weight"] = hh_df_w["wdhhwgt_sa3"].fillna(0) + hh_df_w[ + "wehhwgt_sa3" + ].fillna(0) + pp_df_w["_weight"] = pp_df_w["wdperswgt_sa3"].fillna(0) + pp_df_w[ + "weperswgt_sa3" + ].fillna(0) re_dict["hh"] = dict(zip(hh_df_w["hhid"], hh_df_w["_weight"])) re_dict["pp"] = dict(zip(pp_df_w["persid"], pp_df_w["_weight"])) return re_dict + def add_weights_in_df(df, weights_dict, type="hh"): select_col = None dict_check = weights_dict[type] @@ -17,8 +23,10 @@ def add_weights_in_df(df, weights_dict, type="hh"): if len(check_cols) == 0: raise ValueError("No HHID to match with the weights") else: - select_col = check_cols[0] # Don't know there will be mutiple but just incase, will select the first col - + select_col = check_cols[ + 0 + ] # Don't know there will be mutiple but just incase, will select the first col + elif type == "pp": check_cols = [x for x in df.columns if "persid" in x] if len(check_cols) == 0: @@ -26,11 +34,11 @@ def add_weights_in_df(df, weights_dict, type="hh"): elif len(check_cols) == 1: select_col = check_cols[0] else: - pref_val = "persid_main" # We will now use the weights of the main person + pref_val = "persid_main" # We will now use the weights of the main person select_col = pref_val if pref_val in check_cols else check_cols[0] else: raise ValueError("You pick wrong type for dict check") - + assert select_col is not None df["_weight"] = df.apply(lambda row: dict_check[row[select_col]], axis=1) - return df \ No newline at end of file + return df diff --git a/PopSynthesis/DataProcessor/utils/seed/hh/add_pp_rela_to_hh.py b/PopSynthesis/DataProcessor/utils/seed/hh/add_pp_rela_to_hh.py index c28922a..eea85d6 100644 --- a/PopSynthesis/DataProcessor/utils/seed/hh/add_pp_rela_to_hh.py +++ b/PopSynthesis/DataProcessor/utils/seed/hh/add_pp_rela_to_hh.py @@ -1,4 +1,3 @@ - def adding_pp_related_atts(hh_df, pp_df): # This adding the persons-related atts to the hh df for later sampling # at the moment we will use to have the number of each relationship @@ -8,15 +7,17 @@ def adding_pp_related_atts(hh_df, pp_df): dict_count_rela = {} for hhid, rela_gr in zip(gb_df_pp.index, gb_df_pp): check_dict = {x: 0 for x in ls_rela} - for i in rela_gr: check_dict[i] += 1 + for i in rela_gr: + check_dict[i] += 1 dict_count_rela[hhid] = check_dict for rela in ls_rela: - hh_df[rela] = hh_df.apply(lambda row: dict_count_rela[row["hhid"]][rela], axis=1) + hh_df[rela] = hh_df.apply( + lambda row: dict_count_rela[row["hhid"]][rela], axis=1 + ) # check Self again assert len(hh_df["Main"].unique()) == 1 assert hh_df["Main"].unique()[0] == 1 return hh_df.drop(columns=["Main"]) - diff --git a/PopSynthesis/DataProcessor/utils/seed/hh/process_general_hh.py b/PopSynthesis/DataProcessor/utils/seed/hh/process_general_hh.py index 7bf073a..32ce50f 100644 --- a/PopSynthesis/DataProcessor/utils/seed/hh/process_general_hh.py +++ b/PopSynthesis/DataProcessor/utils/seed/hh/process_general_hh.py @@ -4,6 +4,7 @@ def convert_veh(row): return str(row["totalvehs"]) else: return f"{veh_limit}+" + hh_df["totalvehs"] = hh_df.apply(convert_veh, axis=1) return hh_df @@ -13,7 +14,7 @@ def con_inc(row): hh_inc = row["hhinc"] # Confime hhinc always exist, it's float if hh_inc < 0: - return "Negative income" #NOTE: None like this but exist in census, need to check whether this can be an issue + return "Negative income" # NOTE: None like this but exist in census, need to check whether this can be an issue elif hh_inc > 0: for state in check_states: bool_val = None @@ -33,14 +34,23 @@ def con_inc(row): return state else: return "Nil income" + hh_df["hhinc"] = hh_df.apply(con_inc, axis=1) return hh_df -def convert_hh_dwell(hh_df): # Removing the occupied rent free - hh_df["owndwell"] = hh_df.apply(lambda r: "Something Else" if r["owndwell"] == "Occupied Rent-Free" else r["owndwell"], axis=1) + +def convert_hh_dwell(hh_df): # Removing the occupied rent free + hh_df["owndwell"] = hh_df.apply( + lambda r: "Something Else" + if r["owndwell"] == "Occupied Rent-Free" + else r["owndwell"], + axis=1, + ) return hh_df def convert_hh_size(hh_df): - hh_df["hhsize"] = hh_df.apply(lambda r: "8+" if r["hhsize"] >= 8 else str(r["hhsize"]), axis=1) - return hh_df \ No newline at end of file + hh_df["hhsize"] = hh_df.apply( + lambda r: "8+" if r["hhsize"] >= 8 else str(r["hhsize"]), axis=1 + ) + return hh_df diff --git a/PopSynthesis/DataProcessor/utils/seed/hh/process_hh_main.py b/PopSynthesis/DataProcessor/utils/seed/hh/process_hh_main.py index f1db6a9..c9596e5 100644 --- a/PopSynthesis/DataProcessor/utils/seed/hh/process_hh_main.py +++ b/PopSynthesis/DataProcessor/utils/seed/hh/process_hh_main.py @@ -1,4 +1,6 @@ -def process_hh_main_person(hh_df, main_pp_df, to_csv=False, name_file="connect_hh_main", include_weights=True): +def process_hh_main_person( + hh_df, main_pp_df, to_csv=False, name_file="connect_hh_main", include_weights=True +): # they need to perfect match assert len(hh_df) == len(main_pp_df) combine_df = hh_df.merge(main_pp_df, on="hhid", how="inner") @@ -10,7 +12,7 @@ def process_hh_main_person(hh_df, main_pp_df, to_csv=False, name_file="connect_h if not include_weights: combine_df = combine_df.drop(columns="_weight") - + if to_csv: - combine_df.to_csv(os.path.join(processed_data ,f"{name_file}.csv"), index=False) + combine_df.to_csv(os.path.join(processed_data, f"{name_file}.csv"), index=False) return combine_df diff --git a/PopSynthesis/DataProcessor/utils/seed/pp/convert_age.py b/PopSynthesis/DataProcessor/utils/seed/pp/convert_age.py index 8904f43..fb927e2 100644 --- a/PopSynthesis/DataProcessor/utils/seed/pp/convert_age.py +++ b/PopSynthesis/DataProcessor/utils/seed/pp/convert_age.py @@ -1,16 +1,15 @@ - def get_main_max_age(pp_df): # add the dummy inc to rank ls_hh_id = pp_df["hhid"].unique() for hh_id in ls_hh_id: print(hh_id) - sub_df = pp_df[pp_df["hhid"]==hh_id] + sub_df = pp_df[pp_df["hhid"] == hh_id] idx_max_age = sub_df["age"].idxmax() rela_max_age = sub_df.loc[idx_max_age]["relationship"] # CONFIRMED this will be Spouse or Others only pp_df.at[idx_max_age, "relationship"] = "Main" if rela_max_age != "Self": - sub_sub_df = sub_df[sub_df["relationship"]=="Self"] + sub_sub_df = sub_df[sub_df["relationship"] == "Self"] idx_self = sub_sub_df.index[0] pp_df.at[idx_self, "relationship"] = rela_max_age return pp_df @@ -26,13 +25,12 @@ def convert_pp_age_gr(pp_df, range_age=10, age_limit=100): new_name = f"{hold_min}-{hold_min+range_age-1}" check_dict[i] = new_name check_dict["others"] = f"{age_limit}+" - + def convert_age(row): if row["age"] in check_dict: return check_dict[row["age"]] else: return check_dict["others"] - + pp_df["age"] = pp_df.apply(convert_age, axis=1) return pp_df - diff --git a/PopSynthesis/DataProcessor/utils/seed/pp/convert_inc.py b/PopSynthesis/DataProcessor/utils/seed/pp/convert_inc.py index f32f989..1fddeab 100644 --- a/PopSynthesis/DataProcessor/utils/seed/pp/convert_inc.py +++ b/PopSynthesis/DataProcessor/utils/seed/pp/convert_inc.py @@ -1,4 +1,3 @@ - def add_converted_inc(pp_df): def process_inc(row): r_check = row["persinc"] @@ -21,6 +20,6 @@ def process_inc(row): else: raise ValueError(f"Dunno I never seen this lol {r_check}") return val - + pp_df["inc_dummy"] = pp_df.apply(process_inc, axis=1) return pp_df diff --git a/PopSynthesis/DataProcessor/utils/seed/pp/process_main_others.py b/PopSynthesis/DataProcessor/utils/seed/pp/process_main_others.py index 52ae16b..d18a851 100644 --- a/PopSynthesis/DataProcessor/utils/seed/pp/process_main_others.py +++ b/PopSynthesis/DataProcessor/utils/seed/pp/process_main_others.py @@ -1,10 +1,11 @@ - def process_main_other(main_pp_df, sub_df, rela, to_csv=True, include_weights=True): - assert len(main_pp_df["relationship"].unique()) == 1 # It is Main - assert len(sub_df["relationship"].unique()) == 1 # It is the relationship we checking + assert len(main_pp_df["relationship"].unique()) == 1 # It is Main + assert ( + len(sub_df["relationship"].unique()) == 1 + ) # It is the relationship we checking # Change the name to avoid confusion - main_pp_df = main_pp_df.add_suffix('_main', axis=1) - sub_df = sub_df.add_suffix(f'_{rela}', axis=1) + main_pp_df = main_pp_df.add_suffix("_main", axis=1) + sub_df = sub_df.add_suffix(f"_{rela}", axis=1) main_pp_df = main_pp_df.rename(columns={"hhid_main": "hhid"}) sub_df = sub_df.rename(columns={f"hhid_{rela}": "hhid"}) @@ -17,9 +18,10 @@ def process_main_other(main_pp_df, sub_df, rela, to_csv=True, include_weights=Tr if not include_weights: combine_df = combine_df.drop(columns="_weight") - + if to_csv: - combine_df.to_csv(os.path.join(processed_data, f"connect_main_{rela}.csv"), index=False) - - return combine_df + combine_df.to_csv( + os.path.join(processed_data, f"connect_main_{rela}.csv"), index=False + ) + return combine_df diff --git a/PopSynthesis/DataProcessor/utils/seed/pp/process_relationships.py b/PopSynthesis/DataProcessor/utils/seed/pp/process_relationships.py index 25a755a..a305a98 100644 --- a/PopSynthesis/DataProcessor/utils/seed/pp/process_relationships.py +++ b/PopSynthesis/DataProcessor/utils/seed/pp/process_relationships.py @@ -1,10 +1,12 @@ from collections import defaultdict from PopSynthesis.Methods.connect_HH_PP.scripts.const import * + def check_rela_gb(gb_df): for hhid, rela_gr in zip(gb_df.index, gb_df): check_dict = defaultdict(lambda: 0) - for i in rela_gr: check_dict[i] += 1 + for i in rela_gr: + check_dict[i] += 1 if check_dict["Self"] == 0: # print(hhid) print([f"{x} - {y}" for x, y in check_dict.items() if x != "Self"]) @@ -25,17 +27,18 @@ def process_rela(pp_df): ls_to_replace = [] for hhid, rela_gr in zip(gb_df.index, gb_df): check_dict = defaultdict(lambda: 0) - for i in rela_gr: check_dict[i] += 1 + for i in rela_gr: + check_dict[i] += 1 if check_dict["Self"] == 0: replace_method = "oldest" if check_dict["Spouse"] == 0 else "spouse" ls_to_replace.append((hhid, replace_method)) # start to replace to fix errors for hhid, replace_method in ls_to_replace: - sub_df = pp_df[pp_df["hhid"]==hhid] + sub_df = pp_df[pp_df["hhid"] == hhid] idx_to_replace = None if replace_method == "spouse": - sub_sub_df = sub_df[sub_df["relationship"]=="Spouse"] + sub_sub_df = sub_df[sub_df["relationship"] == "Spouse"] idx_to_replace = sub_sub_df.index[0] elif replace_method == "oldest": idx_to_replace = sub_df["age"].idxmax() @@ -44,10 +47,12 @@ def process_rela(pp_df): # check again gb_df_2 = pp_df.groupby("hhid")["relationship"].apply(lambda x: list(x)) - check_rela_gb(gb_df_2) # Should print nothing + check_rela_gb(gb_df_2) # Should print nothing # replace values in columns - pp_df.loc[~pp_df["relationship"].isin(LS_GR_RELA), "relationship"] = HANDLE_THE_REST_RELA + pp_df.loc[ + ~pp_df["relationship"].isin(LS_GR_RELA), "relationship" + ] = HANDLE_THE_REST_RELA # print(pp_df["relationship"].unique()) - return pp_df \ No newline at end of file + return pp_df