From ec804ef3ee94aec720d4a48f2cec6e372a57a367 Mon Sep 17 00:00:00 2001
From: Jennifer Melot
Date: Tue, 23 Jan 2024 11:15:30 -0500
Subject: [PATCH] Ensure done files cleared out in case of retry

Delete the ids_are_done and simhash_is_done marker objects under
gs://airflow-data-exchange/article_linkage/tmp/done_files/ as the first
step after the cd in each script, before the Python step runs, so that a
marker left over from an earlier attempt is removed before the work is
redone.

gsutil is invoked as /snap/bin/gsutil, the same path the existing upload
commands in run_ids_scripts.sh already use.
---
NOTE(review): the post-image blob id on the first "index" line was
generated before the switch to /snap/bin/gsutil; regenerate the patch
with git format-patch after applying if exact ids matter.
NOTE(review): when no marker exists yet, gsutil rm is expected to report
an error; neither hunk shows set -e, so the scripts presumably continue
past it -- confirm against the full scripts.

 utils/run_ids_scripts.sh     | 1 +
 utils/run_simhash_scripts.sh | 1 +
 2 files changed, 2 insertions(+)

diff --git a/utils/run_ids_scripts.sh b/utils/run_ids_scripts.sh
index 72ccaac..db0ad75 100644
--- a/utils/run_ids_scripts.sh
+++ b/utils/run_ids_scripts.sh
@@ -1,4 +1,5 @@
 cd /mnt/disks/data/run
+/snap/bin/gsutil rm gs://airflow-data-exchange/article_linkage/tmp/done_files/ids_are_done
 python3 create_merge_ids.py --match_dir usable_ids --prev_id_mapping_dir prev_id_mapping --merge_file id_mapping.jsonl --current_ids_dir article_pairs
 /snap/bin/gsutil -m cp id_mapping.jsonl gs://airflow-data-exchange/article_linkage/tmp/
 /snap/bin/gsutil -m cp simhash_results/* gs://airflow-data-exchange/article_linkage/simhash_results/
diff --git a/utils/run_simhash_scripts.sh b/utils/run_simhash_scripts.sh
index 78b649d..7b76579 100644
--- a/utils/run_simhash_scripts.sh
+++ b/utils/run_simhash_scripts.sh
@@ -1,4 +1,5 @@
 cd /mnt/disks/data/run
+/snap/bin/gsutil rm gs://airflow-data-exchange/article_linkage/tmp/done_files/simhash_is_done
 python3 run_simhash.py simhash_input simhash_results --simhash_indexes simhash_indexes --new_simhash_indexes new_simhash_indexes
 cp -r article_pairs usable_ids
 cp simhash_results/* article_pairs/