From 9e26c541cbf73a85dbe68510edf30c5f196dba1e Mon Sep 17 00:00:00 2001 From: Duc Minh La Date: Sun, 14 Jul 2024 19:18:11 +1000 Subject: [PATCH] quick clean --- PopSynthesis/DataProcessor/DataProcessor.py | 16 +++++++-- .../utils/seed/hh/process_general_hh.py | 34 +++++++++++++------ 2 files changed, 37 insertions(+), 13 deletions(-) diff --git a/PopSynthesis/DataProcessor/DataProcessor.py b/PopSynthesis/DataProcessor/DataProcessor.py index 0baa1d9..286c845 100644 --- a/PopSynthesis/DataProcessor/DataProcessor.py +++ b/PopSynthesis/DataProcessor/DataProcessor.py @@ -11,9 +11,18 @@ processed_data_dir, output_dir, ) -from PopSynthesis.DataProcessor.utils.const_process import HH_ATTS, LS_GR_RELA, LS_HH_INC +from PopSynthesis.DataProcessor.utils.const_process import ( + HH_ATTS, + LS_GR_RELA, + LS_HH_INC, +) from PopSynthesis.DataProcessor.utils.general_utils import find_file -from PopSynthesis.DataProcessor.utils.seed.hh.process_general_hh import convert_hh_totvehs, convert_hh_size, convert_hh_dwell, convert_hh_inc +from PopSynthesis.DataProcessor.utils.seed.hh.process_general_hh import ( + convert_hh_totvehs, + convert_hh_size, + convert_hh_dwell, + convert_hh_inc, +) import polars as pl @@ -35,7 +44,8 @@ def process_households_seed(self): # Import the hh seed data hh_file = find_file(base_path=self.raw_data_path, filename=hh_seed_file) raw_hh_seed = pl.read_csv(hh_file) - hh_df = convert_hh_totvehs(raw_hh_seed) + hh_df = raw_hh_seed[HH_ATTS] + hh_df = convert_hh_totvehs(hh_df) hh_df = convert_hh_inc(hh_df, check_states=LS_HH_INC) hh_df = convert_hh_dwell(hh_df) hh_df = convert_hh_size(hh_df) diff --git a/PopSynthesis/DataProcessor/utils/seed/hh/process_general_hh.py b/PopSynthesis/DataProcessor/utils/seed/hh/process_general_hh.py index a655e4e..1a92e09 100644 --- a/PopSynthesis/DataProcessor/utils/seed/hh/process_general_hh.py +++ b/PopSynthesis/DataProcessor/utils/seed/hh/process_general_hh.py @@ -4,7 +4,11 @@ def convert_hh_totvehs(hh_df: pl.DataFrame, veh_limit=4): # Define the conditional operation def convert_veh(col, veh_limit): - return pl.when(col < veh_limit).then(col.cast(pl.Utf8)).otherwise(pl.lit(f"{veh_limit}+")) + return ( + pl.when(col < veh_limit) + .then(col.cast(pl.Utf8)) + .otherwise(pl.lit(f"{veh_limit}+")) + ) hh_df = hh_df.with_columns( convert_veh(pl.col("totalvehs"), veh_limit).alias("totalvehs") @@ -15,11 +19,11 @@ def convert_veh(col, veh_limit): def convert_hh_inc(hh_df, check_states): # Note there can be null hhinc_col = pl.col("hhinc") - + # Base expression expr = pl.when(hhinc_col < 0).then(pl.lit("Negative income")) expr = expr.when(hhinc_col == 0).then(pl.lit("Nil income")) - + # Generate conditions and results for each state in check_states for state in check_states: state_clean = state.replace(",", "").replace("$", "").split(" ")[0] @@ -28,29 +32,39 @@ def convert_hh_inc(hh_df, check_states): expr = expr.when(hhinc_col >= val).then(pl.lit(f"{val}+")) elif "-" in state: a, b = map(int, state_clean.split("-")) - expr = expr.when((hhinc_col >= a) & (hhinc_col <= b)).then(pl.lit(f"{a}-{b}")) + expr = expr.when((hhinc_col >= a) & (hhinc_col <= b)).then( + pl.lit(f"{a}-{b}") + ) else: raise ValueError(f"Dunno I never seen this lol {state}") - + # Final otherwise to retain the original value if no conditions match expr = expr.otherwise(hhinc_col) - + # Apply the transformation hh_df = hh_df.with_columns(expr.alias("hhinc")) return hh_df - + def convert_hh_dwell(hh_df: pl.DataFrame): # Removing the occupied rent free col_owndwell = pl.col("owndwell") - expr = pl.when(col_owndwell=="Occupied Rent-Free").then(pl.lit("Something Else")).otherwise(col_owndwell) + expr = ( + pl.when(col_owndwell == "Occupied Rent-Free") + .then(pl.lit("Something Else")) + .otherwise(col_owndwell) + ) hh_df = hh_df.with_columns(expr.alias("owndwell")) return hh_df def convert_hh_size(hh_df): col_hhsz = pl.col("hhsize") - max_hhsz = 8 # const, based on census - expr = pl.when(col_hhsz >= max_hhsz).then(pl.lit(f"{max_hhsz}+")).otherwise(col_hhsz.cast(pl.Utf8)) + max_hhsz = 8 # const, based on census + expr = ( + pl.when(col_hhsz >= max_hhsz) + .then(pl.lit(f"{max_hhsz}+")) + .otherwise(col_hhsz.cast(pl.Utf8)) + ) hh_df = hh_df.with_columns(expr.alias("hhsize")) return hh_df