Skip to content

Commit

Permalink
fix outdated pandas checksums in fetch_process_wbm_dataset.py (closes #66)
Browse files Browse the repository at this point in the history
  • Loading branch information
janosh committed Nov 29, 2023
1 parent 82d07f1 commit e901031
Showing 1 changed file with 13 additions and 9 deletions.
22 changes: 13 additions & 9 deletions data/wbm/fetch_process_wbm_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,16 +86,16 @@


 assert len(json_paths) == len(step_lens), "Mismatch in WBM steps and JSON files"
-wbm_struct_json_checksums = (
-    -7815922250032563359,
-    -86268461085685423,
-    -7707371069320539066,
-    -3579196048285845088,
-    -248039116266365352,
+wbm_structs_index_checksums = (
+    10630821823676988257,
+    18360475612623866193,
+    10739373004389012550,
+    14867548025423706528,
+    18198704957443186264,
 )


-dfs_wbm_structs = {}
+if "dfs_wbm_structs" not in locals():
+    dfs_wbm_structs = {}
 for json_path in json_paths:
     step = int(json_path.split(".json.bz2")[0][-1])
     assert step in range(1, 6)
Expand All @@ -110,7 +110,11 @@
     # we hash index only for speed
     # could use joblib.hash(df) to hash whole df but it's slow
     checksum = pd.util.hash_pandas_object(df.index).sum()
-    assert checksum == wbm_struct_json_checksums[step - 1], "bad JSON file checksum"
+    expected = wbm_structs_index_checksums[step - 1]
+    assert checksum == expected, (
+        f"bad df.index checksum for {step=}, {expected=}, got {checksum=}\n"
+        f"\n{json_path=}"
+    )

     if step == 3:
         df = df.drop(index=[f"step_3_{id}" for id in bad_struct_ids])
Expand Down

0 comments on commit e901031

Please sign in to comment.