Skip to content

Commit 3c49690

Browse files
committed
black refactoring
1 parent 1b26274 commit 3c49690

23 files changed: +384 additions, -147 deletions

PopSynthesis/DataProcessor/DataProcessor.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,9 @@ def process_all_seed(self) -> None:
5858
filtered_hh = hh_df[hh_df["hhid"].isin(hhid_in_pp)]
5959
hhid_in_hh = list(filtered_hh["hhid"].unique())
6060
filtered_pp = pp_df[pp_df["hhid"].isin(hhid_in_hh)]
61-
print(f"Removed {len(hh_df) - len(filtered_hh)} households due to mismatch with pp")
61+
print(
62+
f"Removed {len(hh_df) - len(filtered_hh)} households due to mismatch with pp"
63+
)
6264
print(f"Removed {len(pp_df) - len(filtered_pp)} people due to mismatch with hh")
6365

6466
# households size equal number of persons
@@ -81,7 +83,6 @@ def check_match_hhsz(r):
8183

8284
self.hh_seed_data = filtered_hh
8385
self.pp_seed_data = filtered_pp
84-
8586

8687
def process_households_seed(self) -> pd.DataFrame:
8788
# Import the hh seed data
@@ -91,7 +92,9 @@ def process_households_seed(self) -> pd.DataFrame:
9192
# Next we add weights, we combine weights of both wd and we
9293
hh_df = hh_df.with_columns(pl.col("wdhhwgt_sa3").fill_null(strategy="zero"))
9394
hh_df = hh_df.with_columns(pl.col("wehhwgt_sa3").fill_null(strategy="zero"))
94-
hh_df = hh_df.with_columns(_weight = pl.col("wdhhwgt_sa3") + pl.col("wehhwgt_sa3"))
95+
hh_df = hh_df.with_columns(
96+
_weight=pl.col("wdhhwgt_sa3") + pl.col("wehhwgt_sa3")
97+
)
9598
hh_df = hh_df.drop(["wdhhwgt_sa3", "wehhwgt_sa3"])
9699
hh_df = hh_df.drop_nulls()
97100

@@ -109,15 +112,19 @@ def process_persons_seed(self) -> pd.DataFrame:
109112
# Next we add weights, we combine weights of both wd and we
110113
pp_df = pp_df.with_columns(pl.col("wdperswgt_sa3").fill_null(strategy="zero"))
111114
pp_df = pp_df.with_columns(pl.col("weperswgt_sa3").fill_null(strategy="zero"))
112-
pp_df = pp_df.with_columns(_weight = pl.col("wdperswgt_sa3") + pl.col("weperswgt_sa3"))
115+
pp_df = pp_df.with_columns(
116+
_weight=pl.col("wdperswgt_sa3") + pl.col("weperswgt_sa3")
117+
)
113118
pp_df = pp_df.drop(["wdperswgt_sa3", "weperswgt_sa3"])
114119

115120
pp_df = process_not_accept_values(pp_df)
116121
pp_df = process_rela(pp_df)
117122
pp_df = convert_pp_age_gr(pp_df)
118123
return pp_df.to_pandas()
119-
120-
def output_seed(self, name_pp_seed:str = "pp_seed", name_hh_seed:str = "hh_seed") -> None:
124+
125+
def output_seed(
126+
self, name_pp_seed: str = "pp_seed", name_hh_seed: str = "hh_seed"
127+
) -> None:
121128
pp_loc = self.output_data_path / f"{name_pp_seed}.csv"
122129
hh_loc = self.output_data_path / f"{name_hh_seed}.csv"
123130
self.pp_seed_data.to_csv(pp_loc, index=False)
@@ -131,7 +138,7 @@ def process_households_census(self):
131138

132139
def process_persons_census(self):
133140
NotImplemented
134-
141+
135142
def output_all_files(self):
136143
NotImplemented
137144

PopSynthesis/DataProcessor/utils/const_process.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,16 @@
4343
"$8,000 or more ($416,000 or more)",
4444
]
4545

46-
HH_ATTS = ["hhid", "dwelltype", "owndwell", "hhinc", "totalvehs", "hhsize", "wdhhwgt_sa3", "wehhwgt_sa3"]
46+
HH_ATTS = [
47+
"hhid",
48+
"dwelltype",
49+
"owndwell",
50+
"hhinc",
51+
"totalvehs",
52+
"hhsize",
53+
"wdhhwgt_sa3",
54+
"wehhwgt_sa3",
55+
]
4756

4857
PP_ATTS = [
4958
"persid",
@@ -55,6 +64,6 @@
5564
"nolicence",
5665
"anywork",
5766
"wdperswgt_sa3",
58-
"weperswgt_sa3"
67+
"weperswgt_sa3",
5968
]
6069
NOT_INCLUDED_IN_BN_LEARN = ["hhid", "persid", "relationship"]

PopSynthesis/DataProcessor/utils/seed/hh/process_general_hh.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,9 @@ def convert_hh_inc(hh_df: pl.DataFrame, check_states: str) -> pl.DataFrame:
4747
return hh_df
4848

4949

50-
def convert_hh_dwell(hh_df: pl.DataFrame) -> pl.DataFrame: # Removing the occupied rent free
50+
def convert_hh_dwell(
51+
hh_df: pl.DataFrame,
52+
) -> pl.DataFrame: # Removing the occupied rent free
5153
col_owndwell = pl.col("owndwell")
5254
expr = (
5355
pl.when(col_owndwell == "Occupied Rent-Free")

PopSynthesis/DataProcessor/utils/seed/hh/process_hh_main.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
import pandas as pd
55

6+
67
def process_hh_main_person(
78
hh_df, main_pp_df, to_csv=False, name_file="connect_hh_main", include_weights=True
89
):

PopSynthesis/DataProcessor/utils/seed/pp/process_relationships.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,18 @@
66
MIN_PARENT_CHILD_GAP = 15
77
MIN_GRANDPARENT_GRANDCHILD_GAP = 33
88
# This only apply when we do the conversion for Child and Grandchild
9-
MAX_COUPLE_GAP = 20
9+
MAX_COUPLE_GAP = 20
1010
MIN_PERMITTED_AGE_MARRIED = 16
1111
AVAILABLE_RELATIONSHIPS = [
12-
"Main",
13-
"Spouse",
14-
"Child",
15-
"Grandchild",
16-
"Sibling",
17-
"Others",
18-
"Parent",
19-
"Grandparent",
20-
]
12+
"Main",
13+
"Spouse",
14+
"Child",
15+
"Grandchild",
16+
"Sibling",
17+
"Others",
18+
"Parent",
19+
"Grandparent",
20+
]
2121

2222

2323
class Person:
@@ -385,6 +385,6 @@ def process_rela(pp_df: pl.DataFrame) -> pl.DataFrame:
385385
pp_df["relationship"] = pp_df["persid"].map(result_mapping)
386386

387387
# The households with implausible combinations will have None value for relationship
388-
pp_df =pp_df[~pp_df["relationship"].isna()]
388+
pp_df = pp_df[~pp_df["relationship"].isna()]
389389

390390
return pl.from_pandas(pp_df)

PopSynthesis/Methods/IPSF/SAA/main.py

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,27 +8,42 @@
88
import pandas as pd
99

1010
from PopSynthesis.Methods.IPSF.const import POOL_SIZE
11-
from PopSynthesis.Methods.IPSF.SAA.operations.general import process_raw_ipu_init, adjust_atts_state_match_census
11+
from PopSynthesis.Methods.IPSF.SAA.operations.general import (
12+
process_raw_ipu_init,
13+
adjust_atts_state_match_census,
14+
)
1215
from typing import List, Dict
1316

17+
1418
class SAA:
15-
def __init__(self, marginal_raw: pd.DataFrame, seed_raw: pd.DataFrame, ordered_to_adjust_atts:List[str], att_states: Dict[str, List[str]], pool_sz: int = POOL_SIZE) -> None:
19+
def __init__(
20+
self,
21+
marginal_raw: pd.DataFrame,
22+
seed_raw: pd.DataFrame,
23+
ordered_to_adjust_atts: List[str],
24+
att_states: Dict[str, List[str]],
25+
pool_sz: int = POOL_SIZE,
26+
) -> None:
1627
self.ordered_atts = ordered_to_adjust_atts
1728
self.known_att_states = att_states
1829
self.init_required_inputs(marginal_raw, seed_raw)
1930

2031
def init_required_inputs(self, marginal_raw: pd.DataFrame, seed_raw: pd.DataFrame):
21-
converted_segment_marg, converted_seed = process_raw_ipu_init(marginal_raw, seed_raw)
32+
converted_segment_marg, converted_seed = process_raw_ipu_init(
33+
marginal_raw, seed_raw
34+
)
2235
self.seed = converted_seed
2336
self.segmented_marg = converted_segment_marg
2437

2538
def run(self) -> pd.DataFrame:
2639
# Output the synthetic population, the main point
2740
curr_syn_pop = None
2841
adjusted_atts = []
29-
pool = self.seed # change later
42+
pool = self.seed # change later
3043
for att in self.ordered_atts:
3144
sub_census = self.segmented_marg[att].reset_index()
32-
curr_syn_pop = adjust_atts_state_match_census(att, curr_syn_pop, sub_census, adjusted_atts, pool)
45+
curr_syn_pop = adjust_atts_state_match_census(
46+
att, curr_syn_pop, sub_census, adjusted_atts, pool
47+
)
3348
adjusted_atts.append(att)
3449
return curr_syn_pop

PopSynthesis/Methods/IPSF/SAA/operations/compare_census.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,14 @@
88
from PopSynthesis.Methods.IPSF.const import zone_field, count_field
99

1010

11-
def calculate_states_diff(att:str, syn_pop: pd.DataFrame, sub_census: pd.DataFrame) -> pd.DataFrame:
11+
def calculate_states_diff(
12+
att: str, syn_pop: pd.DataFrame, sub_census: pd.DataFrame
13+
) -> pd.DataFrame:
1214
""" This calculate the differences between current syn_pop and the census at a specific geo_lev """
1315
sub_syn_pop_count = syn_pop[[zone_field, att]].value_counts().reset_index()
14-
tranformed_sub_syn_count = sub_syn_pop_count.pivot(index=zone_field, columns=att, values=count_field).fillna(0)
16+
tranformed_sub_syn_count = sub_syn_pop_count.pivot(
17+
index=zone_field, columns=att, values=count_field
18+
).fillna(0)
1519
sub_census = sub_census.set_index(zone_field)
1620
# Always census is the ground truth, check for missing and fill
1721
missing_zones = set(sub_census.index) - set(tranformed_sub_syn_count.index)
@@ -26,10 +30,3 @@ def calculate_states_diff(att:str, syn_pop: pd.DataFrame, sub_census: pd.DataFra
2630
# no nan values
2731
assert not results.isna().any().any()
2832
return results
29-
30-
31-
32-
33-
34-
35-

PopSynthesis/Methods/IPSF/SAA/operations/general.py

Lines changed: 41 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,25 @@
66

77
from typing import List, Union, Tuple, Dict
88
from PopSynthesis.Methods.IPSF.const import count_field, zone_field, data_dir
9-
from PopSynthesis.Methods.IPSF.SAA.operations.compare_census import calculate_states_diff
9+
from PopSynthesis.Methods.IPSF.SAA.operations.compare_census import (
10+
calculate_states_diff,
11+
)
1012
from PopSynthesis.Methods.IPSF.SAA.operations.zone_adjustment import zone_adjustment
11-
from PopSynthesis.Methods.IPSF.utils.condensed_tools import CondensedDF, sample_from_condensed
13+
from PopSynthesis.Methods.IPSF.utils.condensed_tools import (
14+
CondensedDF,
15+
sample_from_condensed,
16+
)
1217

1318

14-
def process_raw_ipu_init(marg: pd.DataFrame, seed: pd.DataFrame) -> Tuple[Dict[str, pd.DataFrame], pd.DataFrame]:
15-
atts = [x for x in seed.columns if x not in ["serialno", "sample_geog"] ]
19+
def process_raw_ipu_init(
20+
marg: pd.DataFrame, seed: pd.DataFrame
21+
) -> Tuple[Dict[str, pd.DataFrame], pd.DataFrame]:
22+
atts = [x for x in seed.columns if x not in ["serialno", "sample_geog"]]
1623
segmented_marg = {}
17-
zones = marg[marg.columns[marg.columns.get_level_values(0)==zone_field]].values
24+
zones = marg[marg.columns[marg.columns.get_level_values(0) == zone_field]].values
1825
zones = [z[0] for z in zones]
1926
for att in atts:
20-
sub_marg = marg[marg.columns[marg.columns.get_level_values(0)==att]]
27+
sub_marg = marg[marg.columns[marg.columns.get_level_values(0) == att]]
2128
if sub_marg.empty:
2229
print(f"Don't have this att {att} in census")
2330
continue
@@ -29,15 +36,21 @@ def process_raw_ipu_init(marg: pd.DataFrame, seed: pd.DataFrame) -> Tuple[Dict[s
2936
return segmented_marg, new_seed
3037

3138

32-
def sample_from_pl(df: pl.DataFrame, n: int, count_field:str = count_field, with_replacement=True) -> pl.DataFrame:
39+
def sample_from_pl(
40+
df: pl.DataFrame, n: int, count_field: str = count_field, with_replacement=True
41+
) -> pl.DataFrame:
3342
# Normalize weights to sum to 1
3443
weights = df[count_field].to_numpy()
35-
weights = weights/weights.sum()
36-
sample_indices = np.random.choice(df.height, size=n, replace=with_replacement, p=weights)
44+
weights = weights / weights.sum()
45+
sample_indices = np.random.choice(
46+
df.height, size=n, replace=with_replacement, p=weights
47+
)
3748
return df[sample_indices.tolist()]
3849

3950

40-
def init_syn_pop_saa(att:str, marginal_data: pd.DataFrame, pool: pd.DataFrame) -> pl.DataFrame:
51+
def init_syn_pop_saa(
52+
att: str, marginal_data: pd.DataFrame, pool: pd.DataFrame
53+
) -> pl.DataFrame:
4154
pool = pl.from_pandas(pool)
4255
marginal_data = pl.from_pandas(marginal_data)
4356
assert zone_field in marginal_data
@@ -51,13 +64,11 @@ def init_syn_pop_saa(att:str, marginal_data: pd.DataFrame, pool: pd.DataFrame) -
5164
for state in states:
5265
sub_pool = pool.filter(pl.col(att) == state)
5366
if len(sub_pool) == 0:
54-
print(
55-
f"WARNING: cannot see {att}_{state} in the pool, sample by the rest"
56-
)
67+
print(f"WARNING: cannot see {att}_{state} in the pool, sample by the rest")
5768
sub_pool = pool # if there are none, we take all
5869
for zone in marginal_data[zone_field]:
5970
condition = marginal_data.filter(pl.col(zone_field) == zone)
60-
census_val = condition.select(state).to_numpy()[0,0]
71+
census_val = condition.select(state).to_numpy()[0, 0]
6172

6273
sub_syn_pop = sample_from_pl(sub_pool, census_val)
6374

@@ -68,23 +79,33 @@ def init_syn_pop_saa(att:str, marginal_data: pd.DataFrame, pool: pd.DataFrame) -
6879
return pl.concat(sub_pops)
6980

7081

71-
def adjust_atts_state_match_census(att: str, curr_syn_pop: Union[None, pd.DataFrame], census_data_by_att: pd.DataFrame, adjusted_atts: List[str], pool: pd.DataFrame) -> pd.DataFrame:
82+
def adjust_atts_state_match_census(
83+
att: str,
84+
curr_syn_pop: Union[None, pd.DataFrame],
85+
census_data_by_att: pd.DataFrame,
86+
adjusted_atts: List[str],
87+
pool: pd.DataFrame,
88+
) -> pd.DataFrame:
7289
if curr_syn_pop is None:
7390
updated_syn_pop = init_syn_pop_saa(att, census_data_by_att, pool).to_pandas()
7491
else:
7592
updated_syn_pop = curr_syn_pop
7693

77-
states_diff_census = calculate_states_diff(att, curr_syn_pop, census_data_by_att)
94+
states_diff_census = calculate_states_diff(
95+
att, curr_syn_pop, census_data_by_att
96+
)
7897
assert (states_diff_census.sum(axis=1) == 0).all()
7998
# With state diff we can now do adjustment for each zone, can parallel it?
8099
pop_syn_across_zones = []
81100
for zid, zone_states_diff in states_diff_census.iterrows():
82101
print(f"Processing zone {zid}")
83-
sub_syn_pop = updated_syn_pop[updated_syn_pop[zone_field]==zid]
84-
zone_adjusted_syn_pop = zone_adjustment(att, sub_syn_pop, zone_states_diff, pool, adjusted_atts)
102+
sub_syn_pop = updated_syn_pop[updated_syn_pop[zone_field] == zid]
103+
zone_adjusted_syn_pop = zone_adjustment(
104+
att, sub_syn_pop, zone_states_diff, pool, adjusted_atts
105+
)
85106
if zone_adjusted_syn_pop is not None:
86107
pop_syn_across_zones.append(zone_adjusted_syn_pop)
87-
108+
88109
updated_syn_pop = pd.concat(pop_syn_across_zones)
89110

90-
return updated_syn_pop
111+
return updated_syn_pop

0 commit comments

Comments (0)