Skip to content

Commit

Permalink
Add variable to disable removing sql source files during ingestion (#…
Browse files Browse the repository at this point in the history
…4216)

* Add variable to disable removing SQL source files for ingestion workflows.

* Change default for removed parameter, allow either flag to remove

* Whitespace change

---------

Co-authored-by: Madison Swain-Bowden <bowdenm@spu.edu>
  • Loading branch information
madewithkode and AetherUnbound authored May 6, 2024
1 parent e6fa7b3 commit 12e7c87
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 4 deletions.
9 changes: 6 additions & 3 deletions catalog/dags/providers/provider_api_scripts/inaturalist.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,7 @@ def create_preingestion_tasks():
python_callable=INaturalistDataIngester.load_catalog_of_life_names,
doc_md="Load vernacular taxon names from Catalog of Life",
op_kwargs={
"remove_api_files": "{{params.sql_rm_source_data_after_ingesting}}"
"remove_api_files": "{{ params.sql_rm_source_data_after_ingesting or var.json.SQL_RM_SOURCE_DATA_AFTER_INGESTION }}",
},
execution_timeout=timedelta(minutes=15),
)
Expand All @@ -347,8 +347,11 @@ def create_postingestion_tasks():
check_drop_parameter = ShortCircuitOperator(
task_id="check_drop_parameter",
doc_md="Skip post-ingestion if NOT sql_rm_source_data_after_ingesting.",
op_args=["{{ params.sql_rm_source_data_after_ingesting }}"],
python_callable=(lambda x: x),
op_args=[
"{{ params.sql_rm_source_data_after_ingesting }}",
"{{ var.json.SQL_RM_SOURCE_DATA_AFTER_INGESTION }}",
],
python_callable=(lambda *x: any(x)),
trigger_rule=TriggerRule.NONE_SKIPPED,
# just skip the drop steps, not the final reporting step in the dag
ignore_downstream_trigger_rules=False,
Expand Down
2 changes: 1 addition & 1 deletion catalog/dags/providers/provider_dag_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,7 +449,7 @@ def create_provider_api_workflow_dag(provider_conf: ProviderWorkflow):
),
),
"sql_rm_source_data_after_ingesting": Param(
default=True,
default=False,
type="boolean",
description=(
"Whether to delete source data from airflow and DB once ingestion"
Expand Down
4 changes: 4 additions & 0 deletions catalog/env.template
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,7 @@ AIRFLOW_VAR_AIRFLOW_RDS_SNAPSHOTS_TO_RETAIN=7
# Whether to toggle production CloudWatch alarms when running a data refresh DAG.
# Used to prevent requiring AWS credentials when running locally.
AIRFLOW_VAR_TOGGLE_CLOUDWATCH_ALARMS=false

# Whether to delete source data from Airflow and the DB once ingestion is complete.
# This is used to support data quality testing in SQL-only DAGs like iNaturalist.
AIRFLOW_VAR_SQL_RM_SOURCE_DATA_AFTER_INGESTION=false

0 comments on commit 12e7c87

Please sign in to comment.