Minor setup for census check (in progress) and sequential run
bobkatla committed Oct 13, 2024
1 parent 2e37d81 commit 33d230d
Showing 3 changed files with 172 additions and 6 deletions.
2 changes: 1 addition & 1 deletion PopSynthesis/Methods/IPSF/SAA/run_hh.py
@@ -41,7 +41,7 @@ def run_main() -> None:
print(f"Processing took {int(hours)}h-{int(minutes)}m-{seconds:.2f}s")

print(final_syn_pop)
final_syn_pop.to_csv(output_dir / "SAA_output_HH.csv")
final_syn_pop.to_csv(output_dir / "SAA_output_HH_again.csv")


if __name__ == "__main__":
174 changes: 170 additions & 4 deletions PopSynthesis/Methods/IPSF/utils/synthetic_checked_census.py
@@ -8,8 +8,174 @@
Output will be part of synthetic we keep and part we want to remove
"""

-def convert_synthetic_to_census():
-    NotImplemented
import pandas as pd
import numpy as np
from itertools import combinations, product
from PopSynthesis.Methods.IPSF.const import count_field, zone_field
from typing import List, Literal, Tuple, Dict

-def check_synthetic_against_census():
-    NotImplemented

def segment_df(df: pd.DataFrame, chunk_sz: int) -> List[pd.DataFrame]:
    """Split a DataFrame into consecutive chunks of at most chunk_sz rows."""
    start = 0
    ls_df = []
    while start < len(df):
        sub_df = df.iloc[start : start + chunk_sz]
        ls_df.append(sub_df)
        start += chunk_sz
    return ls_df
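
A minimal usage sketch (toy data, not from this repository); the final chunk may be shorter than chunk_sz:

import pandas as pd

df = pd.DataFrame({"a": range(5)})
chunks = segment_df(df, chunk_sz=2)
print([len(c) for c in chunks])  # [2, 2, 1]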


def convert_count_to_full(count_df: pd.DataFrame, count_col: str = count_field) -> pd.DataFrame:
    """Expand a count table into one row per record and drop the count column."""
    assert count_col in count_df.columns
    repeated_idx = list(count_df.index.repeat(count_df[count_col]))
    fin = count_df.loc[repeated_idx]
    fin = fin.drop(columns=[count_col])
    fin = fin.reset_index(drop=True)
    return fin
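
A hedged sketch of the expansion; count_field is imported from const, so the count column is passed explicitly here rather than assuming its value:

counts = pd.DataFrame({"hhsize": ["1", "2"], "n": [2, 3]})
full = convert_count_to_full(counts, count_col="n")
print(len(full))  # 5 rows: "1" twice, "2" three times, with "n" dropped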


def convert_full_to_marg_count(
    full_pop: pd.DataFrame, filter_ls: List[str]
) -> pd.DataFrame:
    """Count, per zone, how many records hold each state of each attribute."""
    cols = [x for x in full_pop.columns if x not in filter_ls]
    ls_temp_hold = []
    for att in cols:
        full_pop[att] = full_pop[att].astype(str)
        temp_hold = full_pop.groupby(zone_field)[att].value_counts().unstack().fillna(0)
        temp_hold.columns = [(temp_hold.columns.name, x) for x in temp_hold.columns]
        temp_hold = temp_hold.astype(int)
        ls_temp_hold.append(temp_hold)
    marg_new_raw = pd.concat(ls_temp_hold, axis=1)
    convert_marg_dict = {
        com_col: marg_new_raw[com_col] for com_col in marg_new_raw.columns
    }
    convert_marg_dict[(zone_field, None)] = marg_new_raw.index
    new_marg_hh = pd.DataFrame(convert_marg_dict)
    ls_drop_m = list(
        new_marg_hh.columns[new_marg_hh.columns.get_level_values(0).isin([zone_field])]
    )
    new_marg_hh = new_marg_hh.drop(columns=ls_drop_m)
    return new_marg_hh
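
A toy sketch, assuming zone_field resolves to "zone" (it is imported from const):

pop = pd.DataFrame({
    "zone": ["z1", "z1", "z2"],
    "hhsize": ["1", "2", "1"],
})
marg = convert_full_to_marg_count(pop, filter_ls=["zone"])
# one row per zone, one (attribute, state) column per state:
#     (hhsize, 1)  (hhsize, 2)
# z1            1            1
# z2            1            0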


def add_0_to_missing(df: pd.DataFrame, ls_missing: List[str], axis: Literal[0, 1]) -> pd.DataFrame:
    """Insert zero-filled rows (axis=1) or zero-filled columns (axis=0) for missing labels."""
    for missing in ls_missing:
        if axis == 1:  # append a zero row
            df.loc[missing] = 0
        elif axis == 0:  # append a zero column
            df[missing] = 0
    return df
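
Note the axis convention is the reverse of pandas' usual one: axis=1 appends rows, axis=0 appends columns. A tiny sketch:

df = pd.DataFrame({"x": [1]}, index=["z1"])
df = add_0_to_missing(df, ["y"], axis=0)   # new zero column "y"
df = add_0_to_missing(df, ["z2"], axis=1)  # new zero row "z2"
#     x  y
# z1  1  0
# z2  0  0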


def get_diff_marg(converted_census_marg: pd.DataFrame, converted_new_hh_marg: pd.DataFrame) -> pd.DataFrame:
    """Align two marginal tables on rows and columns and return census minus synthetic."""
    print("getting the diff marg df")
    converted_census_marg.index = converted_census_marg.index.astype(str)
    converted_new_hh_marg.index = converted_new_hh_marg.index.astype(str)
    # make sure they both have the same rows and cols; a missing label means a count of 0
    missing_cols_ori = set(converted_new_hh_marg.columns) - set(
        converted_census_marg.columns
    )
    missing_cols_kept = set(converted_census_marg.columns) - set(
        converted_new_hh_marg.columns
    )
    missing_rows_ori = set(converted_new_hh_marg.index) - set(
        converted_census_marg.index
    )
    missing_rows_kept = set(converted_census_marg.index) - set(
        converted_new_hh_marg.index
    )

    converted_new_hh_marg = add_0_to_missing(
        converted_new_hh_marg, missing_cols_kept, 0
    )
    converted_new_hh_marg = add_0_to_missing(
        converted_new_hh_marg, missing_rows_kept, 1
    )
    converted_census_marg = add_0_to_missing(converted_census_marg, missing_cols_ori, 0)
    converted_census_marg = add_0_to_missing(converted_census_marg, missing_rows_ori, 1)
    return converted_census_marg - converted_new_hh_marg
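
A hedged sketch with (attribute, state) columns; a label missing from one side is treated as a zero count before subtracting:

cen = pd.DataFrame({("hhsize", "1"): [3], ("hhsize", "2"): [2]}, index=["z1"])
syn = pd.DataFrame({("hhsize", "1"): [4]}, index=["z1"])
diff = get_diff_marg(cen, syn)
# diff is [[-1, 2]]: one size-1 household too many, two size-2 short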


def convert_to_dict_ls(tup: Tuple[Tuple[str, str], ...]) -> Dict[str, List[Tuple[str, str]]]:
    """Group (attribute, state) pairs by attribute."""
    di = {}
    for a, b in tup:
        di.setdefault(a, []).append((a, b))
    return di
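
For example, it groups the over-produced (attribute, state) cells by attribute:

cells = (("hhsize", "1"), ("hhsize", "2"), ("income", "low"))
print(convert_to_dict_ls(cells))
# {'hhsize': [('hhsize', '1'), ('hhsize', '2')], 'income': [('income', 'low')]}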


def adjust_kept_rec_match_census(syn_records: pd.DataFrame, diff_census: pd.DataFrame) -> pd.DataFrame:
    """Delete just enough kept records, zone by zone, so the synthetic marginals no longer exceed the census."""
    # The point is to remove the chosen records that over-fill census cells
    count_kept = syn_records.value_counts()
    # diff_census = diff_census.head(10)  # sample to check smaller
    # diff_census = diff_census.set_index(diff_census.columns[diff_census.columns.get_level_values(0)==zone_field])
    for zone, r in diff_census.iterrows():
        print(f"DOING deleting to match census diff for {zone}")
        before_sum = count_kept.loc[
            count_kept.index.get_level_values(zone_field) == zone
        ].sum()
        sub_count_kept = count_kept.loc[
            count_kept.index.get_level_values(zone_field) == zone
        ]
        prev_indexs = sub_count_kept.index
        neg_cols = r[r < 0]
        # resolve the negative (over-produced) cells
        dict_neg_v = convert_to_dict_ls(neg_cols.index)
        for i in range(len(dict_neg_v)):
            raws_before_comb = combinations(dict_neg_v.values(), len(dict_neg_v) - i)
            for raw in raws_before_comb:
                if neg_cols.sum() == 0:
                    break
                ls_pos_neg_comb = list(product(*raw))
                for comb in ls_pos_neg_comb:
                    # loop through every combination of negative cells
                    condi_check = True
                    to_del_n = np.inf
                    # select the kept records matching the combination and find the deletion cap
                    for att, state in comb:
                        condi_check &= (
                            sub_count_kept.index.get_level_values(att) == state
                        )
                        if att != zone_field:
                            check_v = neg_cols.loc[(att, state)] * -1
                            if check_v < to_del_n:
                                to_del_n = check_v
                    filtered_combs_from_kept = sub_count_kept.loc[condi_check]

                    if len(filtered_combs_from_kept) == 0 or to_del_n == 0:
                        continue

                    sum_val = filtered_combs_from_kept.sum()
                    if sum_val < to_del_n:
                        to_del_n = sum_val

                    # spread the deletions across records in proportion to their counts
                    temp_hold_combs = filtered_combs_from_kept / sum_val
                    temp_hold_combs = temp_hold_combs * to_del_n

                    # first delete by simple rounding down
                    to_del_first = np.floor(temp_hold_combs)
                    filtered_combs_from_kept = filtered_combs_from_kept - to_del_first
                    remaining_to_del = to_del_n - to_del_first.sum()
                    # spread the remaining deletions over the largest counts
                    filtered_combs_from_kept.sort_values(ascending=False, inplace=True)
                    filtered_combs_from_kept.iloc[: int(remaining_to_del)] -= 1
                    # make sure there are no negatives, then return to integer counts
                    # (the float result of the rounding step would otherwise break
                    # Index.repeat in convert_count_to_full)
                    assert not any(filtered_combs_from_kept < 0)
                    filtered_combs_from_kept = filtered_combs_from_kept.astype(int)

                    # update the kept counts
                    sub_count_kept.loc[
                        filtered_combs_from_kept.index
                    ] = filtered_combs_from_kept
                    neg_cols.loc[list(comb)] += to_del_n
                    sub_count_kept = sub_count_kept[sub_count_kept > 0]
        zero_indexes = set(prev_indexs) - set(sub_count_kept.index)
        count_kept.loc[sub_count_kept.index] = sub_count_kept
        count_kept.loc[list(zero_indexes)] = 0
        assert neg_cols.sum() == 0
        after_sum = count_kept.loc[
            count_kept.index.get_level_values(zone_field) == zone
        ].sum()
        print(
            f"FINISHED deleting {before_sum - after_sum} records to match census diff for {zone}"
        )
    return convert_count_to_full(count_kept.reset_index())
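
An end-to-end toy sketch, assuming zone_field resolves to "zone" and count_field to "count" (the column name pandas >= 2.0 gives a value_counts() result after reset_index()): five kept records in zone z1, where the census diff says there is one size-1 household too many:

syn = pd.DataFrame({"zone": ["z1"] * 5, "hhsize": ["1"] * 3 + ["2"] * 2})
diff = pd.DataFrame({("hhsize", "1"): [-1], ("hhsize", "2"): [0]}, index=["z1"])
kept = adjust_kept_rec_match_census(syn, diff)
print(kept["hhsize"].value_counts().to_dict())  # {'1': 2, '2': 2}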
2 changes: 1 addition & 1 deletion job.script
@@ -4,7 +4,7 @@
#SBATCH --nodes=1
#SBATCH --cpus-per-task=2
#SBATCH --ntasks=1
-#SBATCH --time=150:00:00
+#SBATCH --time=200:00:00
#SBATCH --output=run_hh_sequential.out

source .venv/bin/activate
