Skip to content

Commit

Permalink
quick clean black and added the file dir
Browse files Browse the repository at this point in the history
  • Loading branch information
bobkatla committed Jul 14, 2024
1 parent 3bbd1cf commit c6bc2c2
Show file tree
Hide file tree
Showing 12 changed files with 109 additions and 65 deletions.
25 changes: 19 additions & 6 deletions PopSynthesis/DataProcessor/DataProcessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,35 @@
from pathlib import Path
from os import PathLike
from typing import Union
from PopSynthesis.DataProcessor.utils.const_files import hh_seed_file, pp_seed_file
from PopSynthesis.DataProcessor.utils.const_files import (
hh_seed_file,
pp_seed_file,
raw_data_dir,
processed_data_dir,
output_dir,
)
from PopSynthesis.DataProcessor.utils.general_utils import find_file


class DataProcessorGeneric:
def __init__(self, raw_data_src:PathLike[Union[Path, str]], mid_processed_src: PathLike[Union[Path, str]], output_data_src: PathLike[Union[Path, str]]) -> None:
def __init__(
self,
raw_data_src: PathLike[Union[Path, str]],
mid_processed_src: PathLike[Union[Path, str]],
output_data_src: PathLike[Union[Path, str]],
) -> None:
self.raw_data_path = Path(raw_data_src)
self.mid_process_path = Path(mid_processed_src)
self.output_data_path = Path(output_data_src)

def process_all_seed(self):
NotImplemented



def process_households_seed(self):
# Import the hh seed data
hh_file = find_file(base_path=self.raw_data_path, filename=hh_seed_file)
print(hh_file)


def process_persons_seed(self):
NotImplemented

Expand All @@ -34,4 +44,7 @@ def process_households_census(self):

def process_persons_census(self):
NotImplemented



if __name__ == "__main__":
    # DataProcessorGeneric requires the three data directories; calling it with
    # no arguments (as before) raised TypeError. The directory constants are
    # already imported from const_files at the top of this module.
    a = DataProcessorGeneric(raw_data_dir, processed_data_dir, output_dir)
10 changes: 9 additions & 1 deletion PopSynthesis/DataProcessor/utils/const_files.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,12 @@
# File names of the VISTA seed survey tables (households / persons).
hh_seed_file = "H_VISTA_1220_SA1.csv"
pp_seed_file = "P_VISTA_1220_SA1.csv"
# NOTE(review): "VISA" / "CENSUSS" look like typos for "VISTA" / "CENSUS", but
# they are runtime values that may be matched elsewhere — confirm before renaming.
seed_loc = "VISA"
census_loc = "CENSUSS"

# NOTE(review): hard-coded absolute, user-specific Windows paths; consider
# deriving these from Path(__file__) or a config file so other machines work.
raw_data_dir = (
    r"C:\Users\dlaa0001\Documents\PhD\PopSyn_Monash\PopSynthesis\DataProcessor\data"
)
processed_data_dir = r"C:\Users\dlaa0001\Documents\PhD\PopSyn_Monash\PopSynthesis\DataProcessor\processed_data"
output_dir = (
    r"C:\Users\dlaa0001\Documents\PhD\PopSyn_Monash\PopSynthesis\DataProcessor\output"
)
26 changes: 12 additions & 14 deletions PopSynthesis/DataProcessor/utils/const_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,14 @@
ct = str(ct).replace(".", "-").replace(":", "-").replace(" ", "-")

# create logger
logging.basicConfig(format='%(asctime)s - %(levelname)s: %(message)s')
logger = logging.getLogger('process inputs data')
logging.basicConfig(format="%(asctime)s - %(levelname)s: %(message)s")
logger = logging.getLogger("process inputs data")
logger.setLevel(logging.DEBUG)
# create file handler which logs even debug messages
fh = logging.FileHandler(os.path.join(log_dir, f'process_data_{ct}.log'))
fh = logging.FileHandler(os.path.join(log_dir, f"process_data_{ct}.log"))
fh.setLevel(logging.DEBUG)
# create formatter and add it to the handlers
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
fh.setFormatter(formatter)
# add the handlers to the logger
logger.addHandler(fh)
Expand Down Expand Up @@ -43,14 +43,7 @@
"$8,000 or more ($416,000 or more)",
]

HH_ATTS = [
"hhid",
"dwelltype",
"owndwell",
"hhinc",
"totalvehs",
"hhsize"
]
HH_ATTS = ["hhid", "dwelltype", "owndwell", "hhinc", "totalvehs", "hhsize"]

PP_ATTS = [
"persid",
Expand All @@ -60,10 +53,15 @@
"relationship",
"persinc",
"nolicence",
"anywork"
"anywork",
]

# Relationship labels kept as-is; every other relationship is collapsed
# into HANDLE_THE_REST_RELA ("Others").
LS_GR_RELA = [
    "Self",
    "Spouse",
    "Child",
    "Grandchild",
]  # For the rest we will make them Others
HANDLE_THE_REST_RELA = "Others"
# Full label set used downstream.
ALL_RELA = LS_GR_RELA + [HANDLE_THE_REST_RELA]
# Identifier / label columns excluded from Bayesian-network learning.
NOT_INCLUDED_IN_BN_LEARN = ["hhid", "persid", "relationship"]
2 changes: 1 addition & 1 deletion PopSynthesis/DataProcessor/utils/general_utils.py
Original file line number Diff line number Diff line change
def find_file(base_path, filename):
    """Recursively search *base_path* for *filename*.

    Returns the first matching regular file as a Path, or None if absent.
    """
    for candidate in base_path.rglob(filename):
        if candidate.is_file():
            return candidate
    return None
22 changes: 15 additions & 7 deletions PopSynthesis/DataProcessor/utils/seed/add_weghts.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
import polars as pl


def get_weights_dict(hh_df_w: pl.DataFrame, pp_df_w: pl.DataFrame):
    """Build id->weight lookup dicts for households and persons.

    The weight of a record is its weekday weight plus its weekend weight,
    with missing weights treated as 0. Returns
    {"hh": {hhid: weight}, "pp": {persid: weight}}.
    """
    # NOTE(review): `fillna` and `df[col] = ...` are the pandas API, yet the
    # annotations say polars — confirm which frame type is actually passed.
    re_dict = {}
    # Process HH weights (mutates the input frames by adding a "_weight" column).
    hh_df_w["_weight"] = hh_df_w["wdhhwgt_sa3"].fillna(0) + hh_df_w["wehhwgt_sa3"].fillna(0)
    pp_df_w["_weight"] = pp_df_w["wdperswgt_sa3"].fillna(0) + pp_df_w["weperswgt_sa3"].fillna(0)
    re_dict["hh"] = dict(zip(hh_df_w["hhid"], hh_df_w["_weight"]))
    re_dict["pp"] = dict(zip(pp_df_w["persid"], pp_df_w["_weight"]))
    return re_dict


def add_weights_in_df(df, weights_dict, type="hh"):
select_col = None
dict_check = weights_dict[type]
Expand All @@ -17,20 +23,22 @@ def add_weights_in_df(df, weights_dict, type="hh"):
if len(check_cols) == 0:
raise ValueError("No HHID to match with the weights")
else:
select_col = check_cols[0] # Don't know there will be mutiple but just incase, will select the first col

select_col = check_cols[
0
]  # Don't know if there will be multiple, but just in case, select the first col

elif type == "pp":
check_cols = [x for x in df.columns if "persid" in x]
if len(check_cols) == 0:
raise ValueError("No persid to match with the weights")
elif len(check_cols) == 1:
select_col = check_cols[0]
else:
pref_val = "persid_main" # We will now use the weights of the main person
pref_val = "persid_main" # We will now use the weights of the main person
select_col = pref_val if pref_val in check_cols else check_cols[0]
else:
raise ValueError("You pick wrong type for dict check")

assert select_col is not None
df["_weight"] = df.apply(lambda row: dict_check[row[select_col]], axis=1)
return df
return df
9 changes: 5 additions & 4 deletions PopSynthesis/DataProcessor/utils/seed/hh/add_pp_rela_to_hh.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

def adding_pp_related_atts(hh_df, pp_df):
# This adding the persons-related atts to the hh df for later sampling
# at the moment we will use to have the number of each relationship
Expand All @@ -8,15 +7,17 @@ def adding_pp_related_atts(hh_df, pp_df):
dict_count_rela = {}
for hhid, rela_gr in zip(gb_df_pp.index, gb_df_pp):
check_dict = {x: 0 for x in ls_rela}
for i in rela_gr: check_dict[i] += 1
for i in rela_gr:
check_dict[i] += 1
dict_count_rela[hhid] = check_dict

for rela in ls_rela:
hh_df[rela] = hh_df.apply(lambda row: dict_count_rela[row["hhid"]][rela], axis=1)
hh_df[rela] = hh_df.apply(
lambda row: dict_count_rela[row["hhid"]][rela], axis=1
)

# check Self again
assert len(hh_df["Main"].unique()) == 1
assert hh_df["Main"].unique()[0] == 1

return hh_df.drop(columns=["Main"])

20 changes: 15 additions & 5 deletions PopSynthesis/DataProcessor/utils/seed/hh/process_general_hh.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ def convert_veh(row):
return str(row["totalvehs"])
else:
return f"{veh_limit}+"

hh_df["totalvehs"] = hh_df.apply(convert_veh, axis=1)
return hh_df

Expand All @@ -13,7 +14,7 @@ def con_inc(row):
hh_inc = row["hhinc"]
# Confime hhinc always exist, it's float
if hh_inc < 0:
return "Negative income" #NOTE: None like this but exist in census, need to check whether this can be an issue
return "Negative income" # NOTE: None like this but exist in census, need to check whether this can be an issue
elif hh_inc > 0:
for state in check_states:
bool_val = None
Expand All @@ -33,14 +34,23 @@ def con_inc(row):
return state
else:
return "Nil income"

hh_df["hhinc"] = hh_df.apply(con_inc, axis=1)
return hh_df

def convert_hh_dwell(hh_df):  # Removing the occupied rent free
    """Collapse the "Occupied Rent-Free" owndwell category into "Something Else".

    Mutates and returns *hh_df*.
    """
    hh_df["owndwell"] = hh_df.apply(
        lambda r: "Something Else"
        if r["owndwell"] == "Occupied Rent-Free"
        else r["owndwell"],
        axis=1,
    )
    return hh_df


def convert_hh_size(hh_df):
    """Convert the numeric hhsize column to string categories, capped at "8+".

    Mutates and returns *hh_df*.
    """
    hh_df["hhsize"] = hh_df.apply(
        lambda r: "8+" if r["hhsize"] >= 8 else str(r["hhsize"]), axis=1
    )
    return hh_df
8 changes: 5 additions & 3 deletions PopSynthesis/DataProcessor/utils/seed/hh/process_hh_main.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
def process_hh_main_person(hh_df, main_pp_df, to_csv=False, name_file="connect_hh_main", include_weights=True):
def process_hh_main_person(
hh_df, main_pp_df, to_csv=False, name_file="connect_hh_main", include_weights=True
):
# they need to perfect match
assert len(hh_df) == len(main_pp_df)
combine_df = hh_df.merge(main_pp_df, on="hhid", how="inner")
Expand All @@ -10,7 +12,7 @@ def process_hh_main_person(hh_df, main_pp_df, to_csv=False, name_file="connect_h

if not include_weights:
combine_df = combine_df.drop(columns="_weight")

if to_csv:
combine_df.to_csv(os.path.join(processed_data ,f"{name_file}.csv"), index=False)
combine_df.to_csv(os.path.join(processed_data, f"{name_file}.csv"), index=False)
return combine_df
10 changes: 4 additions & 6 deletions PopSynthesis/DataProcessor/utils/seed/pp/convert_age.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@

def get_main_max_age(pp_df):
    """Relabel, per household, the oldest person's relationship as "Main".

    If the oldest person was not "Self", the original "Self" row inherits the
    oldest person's former relationship, so each household keeps one row per
    label. Mutates and returns *pp_df*.
    """
    for hh_id in pp_df["hhid"].unique():
        # Removed leftover per-household debug print(hh_id) that spammed stdout.
        sub_df = pp_df[pp_df["hhid"] == hh_id]
        idx_max_age = sub_df["age"].idxmax()
        rela_max_age = sub_df.loc[idx_max_age]["relationship"]
        # CONFIRMED this will be Spouse or Others only
        pp_df.at[idx_max_age, "relationship"] = "Main"
        if rela_max_age != "Self":
            # Hand the old "Self" row the relationship the oldest person had.
            sub_sub_df = sub_df[sub_df["relationship"] == "Self"]
            idx_self = sub_sub_df.index[0]
            pp_df.at[idx_self, "relationship"] = rela_max_age
    return pp_df
Expand All @@ -26,13 +25,12 @@ def convert_pp_age_gr(pp_df, range_age=10, age_limit=100):
new_name = f"{hold_min}-{hold_min+range_age-1}"
check_dict[i] = new_name
check_dict["others"] = f"{age_limit}+"

def convert_age(row):
if row["age"] in check_dict:
return check_dict[row["age"]]
else:
return check_dict["others"]

pp_df["age"] = pp_df.apply(convert_age, axis=1)
return pp_df

3 changes: 1 addition & 2 deletions PopSynthesis/DataProcessor/utils/seed/pp/convert_inc.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

def add_converted_inc(pp_df):
def process_inc(row):
r_check = row["persinc"]
Expand All @@ -21,6 +20,6 @@ def process_inc(row):
else:
raise ValueError(f"Dunno I never seen this lol {r_check}")
return val

pp_df["inc_dummy"] = pp_df.apply(process_inc, axis=1)
return pp_df
20 changes: 11 additions & 9 deletions PopSynthesis/DataProcessor/utils/seed/pp/process_main_others.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@

def process_main_other(main_pp_df, sub_df, rela, to_csv=True, include_weights=True):
assert len(main_pp_df["relationship"].unique()) == 1 # It is Main
assert len(sub_df["relationship"].unique()) == 1 # It is the relationship we checking
assert len(main_pp_df["relationship"].unique()) == 1 # It is Main
assert (
len(sub_df["relationship"].unique()) == 1
) # It is the relationship we checking
# Change the name to avoid confusion
main_pp_df = main_pp_df.add_suffix('_main', axis=1)
sub_df = sub_df.add_suffix(f'_{rela}', axis=1)
main_pp_df = main_pp_df.add_suffix("_main", axis=1)
sub_df = sub_df.add_suffix(f"_{rela}", axis=1)
main_pp_df = main_pp_df.rename(columns={"hhid_main": "hhid"})
sub_df = sub_df.rename(columns={f"hhid_{rela}": "hhid"})

Expand All @@ -17,9 +18,10 @@ def process_main_other(main_pp_df, sub_df, rela, to_csv=True, include_weights=Tr

if not include_weights:
combine_df = combine_df.drop(columns="_weight")

if to_csv:
combine_df.to_csv(os.path.join(processed_data, f"connect_main_{rela}.csv"), index=False)

return combine_df
combine_df.to_csv(
os.path.join(processed_data, f"connect_main_{rela}.csv"), index=False
)

return combine_df
Loading

0 comments on commit c6bc2c2

Please sign in to comment.