From e9010319c9b67edbcbca1de833883cf3d9542fba Mon Sep 17 00:00:00 2001 From: Janosh Riebesell Date: Wed, 29 Nov 2023 13:55:20 -0800 Subject: [PATCH] fix outdated pandas checksums in fetch_process_wbm_dataset.py (closes #66) --- data/wbm/fetch_process_wbm_dataset.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/data/wbm/fetch_process_wbm_dataset.py b/data/wbm/fetch_process_wbm_dataset.py index 5f78db0c..debed349 100644 --- a/data/wbm/fetch_process_wbm_dataset.py +++ b/data/wbm/fetch_process_wbm_dataset.py @@ -86,16 +86,16 @@ assert len(json_paths) == len(step_lens), "Mismatch in WBM steps and JSON files" -wbm_struct_json_checksums = ( - -7815922250032563359, - -86268461085685423, - -7707371069320539066, - -3579196048285845088, - -248039116266365352, +wbm_structs_index_checksums = ( + 10630821823676988257, + 18360475612623866193, + 10739373004389012550, + 14867548025423706528, + 18198704957443186264, ) - -dfs_wbm_structs = {} +if "dfs_wbm_structs" not in locals(): + dfs_wbm_structs = {} for json_path in json_paths: step = int(json_path.split(".json.bz2")[0][-1]) assert step in range(1, 6) @@ -110,7 +110,11 @@ # we hash index only for speed # could use joblib.hash(df) to hash whole df but it's slow checksum = pd.util.hash_pandas_object(df.index).sum() - assert checksum == wbm_struct_json_checksums[step - 1], "bad JSON file checksum" + expected = wbm_structs_index_checksums[step - 1] + assert checksum == expected, ( + f"bad df.index checksum for {step=}, {expected=}, got {checksum=}\n" + f"\n{json_path=}" + ) if step == 3: df = df.drop(index=[f"step_3_{id}" for id in bad_struct_ids])