From e9010319c9b67edbcbca1de833883cf3d9542fba Mon Sep 17 00:00:00 2001
From: Janosh Riebesell <janosh.riebesell@gmail.com>
Date: Wed, 29 Nov 2023 13:55:20 -0800
Subject: [PATCH] fix outdated pandas checksums in fetch_process_wbm_dataset.py
 (closes #66)

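---
Note: each new expected checksum equals the corresponding old value plus
2**64 (e.g. -7815922250032563359 + 2**64 == 10630821823676988257), i.e. the
same 64-bit sum of index hashes read as unsigned instead of signed. The
underlying index data is presumably unchanged; only the integer
representation of the sum differs.
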
 data/wbm/fetch_process_wbm_dataset.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/data/wbm/fetch_process_wbm_dataset.py b/data/wbm/fetch_process_wbm_dataset.py
index 5f78db0c..debed349 100644
--- a/data/wbm/fetch_process_wbm_dataset.py
+++ b/data/wbm/fetch_process_wbm_dataset.py
@@ -86,16 +86,16 @@
 
 
 assert len(json_paths) == len(step_lens), "Mismatch in WBM steps and JSON files"
-wbm_struct_json_checksums = (
-    -7815922250032563359,
-    -86268461085685423,
-    -7707371069320539066,
-    -3579196048285845088,
-    -248039116266365352,
+wbm_structs_index_checksums = (
+    10630821823676988257,
+    18360475612623866193,
+    10739373004389012550,
+    14867548025423706528,
+    18198704957443186264,
 )
 
-
-dfs_wbm_structs = {}
+if "dfs_wbm_structs" not in locals():
+    dfs_wbm_structs = {}
 for json_path in json_paths:
     step = int(json_path.split(".json.bz2")[0][-1])
     assert step in range(1, 6)
@@ -110,7 +110,11 @@
     # we hash index only for speed
     # could use joblib.hash(df) to hash whole df but it's slow
     checksum = pd.util.hash_pandas_object(df.index).sum()
-    assert checksum == wbm_struct_json_checksums[step - 1], "bad JSON file checksum"
+    expected = wbm_structs_index_checksums[step - 1]
+    assert checksum == expected, (
+        f"bad df.index checksum for {step=}, {expected=}, got {checksum=}\n"
+        f"\n{json_path=}"
+    )
 
     if step == 3:
         df = df.drop(index=[f"step_3_{id}" for id in bad_struct_ids])