Skip to content

Commit

Permalink
quick clean
Browse files Browse the repository at this point in the history
  • Loading branch information
bobkatla committed Jul 14, 2024
1 parent 09fe8b9 commit 9e26c54
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 13 deletions.
16 changes: 13 additions & 3 deletions PopSynthesis/DataProcessor/DataProcessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,18 @@
processed_data_dir,
output_dir,
)
from PopSynthesis.DataProcessor.utils.const_process import HH_ATTS, LS_GR_RELA, LS_HH_INC
from PopSynthesis.DataProcessor.utils.const_process import (
HH_ATTS,
LS_GR_RELA,
LS_HH_INC,
)
from PopSynthesis.DataProcessor.utils.general_utils import find_file
from PopSynthesis.DataProcessor.utils.seed.hh.process_general_hh import convert_hh_totvehs, convert_hh_size, convert_hh_dwell, convert_hh_inc
from PopSynthesis.DataProcessor.utils.seed.hh.process_general_hh import (
convert_hh_totvehs,
convert_hh_size,
convert_hh_dwell,
convert_hh_inc,
)
import polars as pl


Expand All @@ -35,7 +44,8 @@ def process_households_seed(self):
# Import the hh seed data
hh_file = find_file(base_path=self.raw_data_path, filename=hh_seed_file)
raw_hh_seed = pl.read_csv(hh_file)
hh_df = convert_hh_totvehs(raw_hh_seed)
hh_df = raw_hh_seed[HH_ATTS]
hh_df = convert_hh_totvehs(hh_df)
hh_df = convert_hh_inc(hh_df, check_states=LS_HH_INC)
hh_df = convert_hh_dwell(hh_df)
hh_df = convert_hh_size(hh_df)
Expand Down
34 changes: 24 additions & 10 deletions PopSynthesis/DataProcessor/utils/seed/hh/process_general_hh.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@
def convert_hh_totvehs(hh_df: pl.DataFrame, veh_limit=4):
# Define the conditional operation
def convert_veh(col, veh_limit):
return pl.when(col < veh_limit).then(col.cast(pl.Utf8)).otherwise(pl.lit(f"{veh_limit}+"))
return (
pl.when(col < veh_limit)
.then(col.cast(pl.Utf8))
.otherwise(pl.lit(f"{veh_limit}+"))
)

hh_df = hh_df.with_columns(
convert_veh(pl.col("totalvehs"), veh_limit).alias("totalvehs")
Expand All @@ -15,11 +19,11 @@ def convert_veh(col, veh_limit):
def convert_hh_inc(hh_df, check_states):
# Note there can be null
hhinc_col = pl.col("hhinc")

# Base expression
expr = pl.when(hhinc_col < 0).then(pl.lit("Negative income"))
expr = expr.when(hhinc_col == 0).then(pl.lit("Nil income"))

# Generate conditions and results for each state in check_states
for state in check_states:
state_clean = state.replace(",", "").replace("$", "").split(" ")[0]
Expand All @@ -28,29 +32,39 @@ def convert_hh_inc(hh_df, check_states):
expr = expr.when(hhinc_col >= val).then(pl.lit(f"{val}+"))
elif "-" in state:
a, b = map(int, state_clean.split("-"))
expr = expr.when((hhinc_col >= a) & (hhinc_col <= b)).then(pl.lit(f"{a}-{b}"))
expr = expr.when((hhinc_col >= a) & (hhinc_col <= b)).then(
pl.lit(f"{a}-{b}")
)
else:
raise ValueError(f"Dunno I never seen this lol {state}")

# Final otherwise to retain the original value if no conditions match
expr = expr.otherwise(hhinc_col)

# Apply the transformation
hh_df = hh_df.with_columns(expr.alias("hhinc"))

return hh_df


def convert_hh_dwell(hh_df: pl.DataFrame):
    """Recode the "Occupied Rent-Free" category of the ``owndwell`` column.

    Rows whose ``owndwell`` equals "Occupied Rent-Free" are relabelled as
    "Something Else"; all other rows keep their current ``owndwell`` value.

    Returns:
        The frame produced by ``with_columns`` with ``owndwell`` replaced.
    """
    dwelling = pl.col("owndwell")
    is_rent_free = dwelling == "Occupied Rent-Free"
    # Only the rent-free label is rewritten; anything else passes through.
    recoded = (
        pl.when(is_rent_free).then(pl.lit("Something Else")).otherwise(dwelling)
    )
    return hh_df.with_columns(recoded.alias("owndwell"))


def convert_hh_size(hh_df, max_hhsz=8):
    """Convert ``hhsize`` to string categories, top-coding large households.

    Args:
        hh_df: Frame containing an ``hhsize`` column (presumably numeric,
            since it is compared against an integer -- confirm with caller).
        max_hhsz: Household sizes at or above this value are merged into a
            single ``"<max_hhsz>+"`` category. Defaults to 8, which the
            original code noted as a constant based on census.

    Returns:
        The frame produced by ``with_columns`` in which ``hhsize`` holds
        strings: the size itself when below the cap, ``"<max_hhsz>+"``
        otherwise.
    """
    col_hhsz = pl.col("hhsize")
    expr = (
        pl.when(col_hhsz >= max_hhsz)
        .then(pl.lit(f"{max_hhsz}+"))
        # Cast so both branches yield strings and the column has one dtype.
        .otherwise(col_hhsz.cast(pl.Utf8))
    )
    return hh_df.with_columns(expr.alias("hhsize"))

0 comments on commit 9e26c54

Please sign in to comment.