From b5ae0480418f245313173f5f48440be9a26ec554 Mon Sep 17 00:00:00 2001 From: Ben Welsh Date: Thu, 28 Dec 2023 13:40:58 +0000 Subject: [PATCH] Simplify how we get the deleted records --- src/transform.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/transform.py b/src/transform.py index 61842c7..8928b32 100644 --- a/src/transform.py +++ b/src/transform.py @@ -34,6 +34,8 @@ def transform(verbose: bool) -> None: df_list.append(df) # Combine them all + if verbose: + print(f"Concatenating {len(df_list)} dataframes") master_df = pd.concat(df_list).apply(parse_row, axis=1) # Count by scrape @@ -104,22 +106,19 @@ def transform(verbose: bool) -> None: ) new_df.to_csv(new_path, index=False) - # Identify deleted records - deleted_ids = prev_ids - latest_ids - print(f"Found [bold]{len(deleted_ids)}[/bold] deleted records") + # Identify past datasets that didn't appear in the latest scrape + deleted_df = latest_df[latest_df.scrape_date < latest_scrape_date] + if verbose: + print( + f"Found [bold]{len(deleted_df)}[/bold] past records that do not appear in the latest scrape" + ) # Append to the deleted records file - deleted_df = prev_df[prev_df["id"].isin(deleted_ids)] deleted_path = clean_dir / "deleted.csv" if verbose: print( f"Writing [bold]{len(deleted_df)}[/bold] deleted records to [bold]{deleted_path}[/bold]" ) - deleted_df.to_csv(deleted_path, index=False, mode="a", header=False) - - # Deduplicate the deleted file - deleted_df = pd.read_csv(deleted_path) - deleted_df = deleted_df.drop_duplicates(subset=["id"], keep="first") deleted_df.to_csv(deleted_path, index=False) # Write out the full dataset