Skip to content

Commit

Permalink
Merge pull request #49 from georgetown-cset/dataflow-idempotence
Browse files (browse the repository at this point in the history)
Clear out dataflow output dir on retry
  • Loading branch information
jmelot authored Oct 11, 2024
2 parents fe795c0 + 5c6c64d commit 4f3e24f
Showing 1 changed file with 5 additions and 0 deletions.
5 changes: 5 additions & 0 deletions linkage_dag.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -38,6 +38,7 @@
get_default_args,
get_post_success,
)
from dataloader.airflow_utils.utils import clear_gcs_dir
from dataloader.scripts.populate_documentation import update_table_descriptions

production_dataset = "literature"
Expand Down Expand Up @@ -190,6 +191,8 @@
"fields_to_clean": "title,abstract,last_names",
"region": "us-east1",
},
on_retry_callback=clear_gcs_dir(DATA_BUCKET, f"{tmp_dir}/cleaned_meta/clean"),
on_execute_callback=clear_gcs_dir(DATA_BUCKET, f"{tmp_dir}/cleaned_meta/clean"),
)

import_clean_metadata = GCSToBigQueryOperator(
Expand Down Expand Up @@ -438,6 +441,8 @@
table_id="all_metadata_with_cld2_lid",
)
],
on_retry_callback=clear_gcs_dir(DATA_BUCKET, f"{tmp_dir}/lid_output/lid"),
on_execute_callback=clear_gcs_dir(DATA_BUCKET, f"{tmp_dir}/lid_output/lid"),
)

# turn off the expensive godzilla of article linkage when we're done with it, then import the id mappings and
Expand Down

0 comments on commit 4f3e24f

Please sign in to comment.