Merge pull request #32 from georgetown-cset/other-table-renaming
Rename remaining tables
jmelot authored Aug 1, 2023
2 parents 4b5c296 + 3b070f2 commit caec1f7
Showing 15 changed files with 22 additions and 209 deletions.
51 changes: 17 additions & 34 deletions linkage_dag.py
@@ -20,7 +20,7 @@
     DAGS_DIR, get_default_args, get_post_success
 
 
-production_dataset = "gcp_cset_links_v3"
+production_dataset = "literature"
 staging_dataset = f"staging_{production_dataset}"
 
 with DAG("article_linkage_updater",
@@ -410,8 +410,7 @@
     # we're about to copy tables from staging to production, so do checks to make sure we haven't broken anything
     # along the way
     check_queries = []
-    production_tables = ["all_metadata_with_cld2_lid", "article_links", "article_links_with_dataset",
-                         "article_merged_meta", "references", "article_links_nested"]
+    production_tables = ["sources", "references", "all_metadata_with_cld2_lid"]
     for table_name in production_tables:
         check_queries.append(BigQueryCheckOperator(
             task_id="check_monotonic_increase_"+table_name.lower(),
@@ -420,15 +419,12 @@
             use_legacy_sql=False
         ))
 
-    for table_name, pk in [("article_links", "orig_id"), ("article_links_with_dataset", "orig_id"),
-                           ("article_merged_meta", "merged_id")]:
-        check_queries.append(BigQueryCheckOperator(
-            task_id="check_pks_are_unique_"+table_name.lower(),
-            sql=f"select count({pk}) = count(distinct({pk})) from {staging_dataset}.{table_name}",
-            use_legacy_sql=False
-        ))
-
     check_queries.extend([
+        BigQueryCheckOperator(
+            task_id="check_pks_are_unique_sources",
+            sql=f"select count(orig_id) = count(distinct(orig_id)) from {staging_dataset}.sources",
+            use_legacy_sql=False
+        ),
         BigQueryCheckOperator(
             task_id="all_ids_survived",
             sql=(f"select count(0) = 0 from (select id from {staging_dataset}.union_ids "
@@ -463,38 +459,26 @@
 
     # We're done! Checks passed, so copy to production and post success to slack
     start_production_cp = DummyOperator(task_id="start_production_cp")
+    success_alert = get_post_success("Article linkage update succeeded!", dag)
+    curr_date = datetime.now().strftime("%Y%m%d")
+    with open(f"{os.environ.get('DAGS_FOLDER')}/schemas/{gcs_folder}/table_descriptions.json") as f:
+        table_desc = json.loads(f.read())
 
-    push_to_production = []
     for table in production_tables:
-        push_to_production.append(BigQueryToBigQueryOperator(
+        push_to_production = BigQueryToBigQueryOperator(
             task_id="copy_"+table.lower(),
             source_project_dataset_tables=[f"{staging_dataset}.{table}"],
             destination_project_dataset_table=f"{production_dataset}.{table}",
             create_disposition="CREATE_IF_NEEDED",
             write_disposition="WRITE_TRUNCATE"
-        ))
-
-    wait_for_production_copy = DummyOperator(task_id="wait_for_production_copy")
-
-    snapshots = []
-    curr_date = datetime.now().strftime("%Y%m%d")
-    for table in ["article_links", "article_links_nested"]:
-        # mk the snapshot predictions table
-        snapshots.append(BigQueryToBigQueryOperator(
+        )
+        snapshot = BigQueryToBigQueryOperator(
             task_id=f"snapshot_{table}",
             source_project_dataset_tables=[f"{production_dataset}.{table}"],
             destination_project_dataset_table=f"{backup_dataset}.{table}_{curr_date}",
             create_disposition="CREATE_IF_NEEDED",
             write_disposition="WRITE_TRUNCATE"
-        ))
-
-    wait_for_snapshots = DummyOperator(task_id="wait_for_snapshots")
-
-    success_alert = get_post_success("Article linkage update succeeded!", dag)
-
-    with open(f"{os.environ.get('DAGS_FOLDER')}/schemas/{gcs_folder}/table_descriptions.json") as f:
-        table_desc = json.loads(f.read())
-    for table in production_tables:
+        )
         pop_descriptions = PythonOperator(
             task_id="populate_column_documentation_for_" + table,
             op_kwargs={
@@ -504,7 +488,7 @@
             },
             python_callable=update_table_descriptions
         )
-        wait_for_snapshots >> pop_descriptions >> success_alert
+        start_production_cp >> push_to_production >> snapshot >> pop_descriptions >> success_alert
 
     downstream_tasks = [
         TriggerDagRunOperator(task_id="trigger_article_classification", trigger_dag_id="article_classification"),
@@ -520,7 +504,6 @@
     (last_combination_query >> heavy_compute_inputs >> gce_instance_start >> [create_cset_ids, run_lid] >>
         gce_instance_stop >> [import_id_mapping, import_lid] >> start_final_transform_queries)
 
-    (last_transform_query >> check_queries >> start_production_cp >> push_to_production >> wait_for_production_copy >>
-        snapshots >> wait_for_snapshots)
+    last_transform_query >> check_queries >> start_production_cp
 
     success_alert >> downstream_tasks
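
Note on the checks in this DAG: BigQueryCheckOperator runs the given query and fails the task unless every value in the first returned row is truthy, which is why each check is written as a single boolean aggregate. A minimal sketch of the pattern, using a placeholder dataset name and the Airflow 1.x import path (illustrative only, not code from this repository):

    from airflow.contrib.operators.bigquery_check_operator import BigQueryCheckOperator  # Airflow 1.x import path

    # The query returns one row with one boolean; the operator marks the task
    # failed if that value is falsy, so duplicates in the primary key fail the run.
    check_pks_are_unique_sources = BigQueryCheckOperator(
        task_id="check_pks_are_unique_sources",
        sql="select count(orig_id) = count(distinct(orig_id)) from staging_literature.sources",  # placeholder dataset
        use_legacy_sql=False,
    )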
57 changes: 0 additions & 57 deletions schemas/all_metadata_norm.json

This file was deleted.

14 changes: 0 additions & 14 deletions schemas/article_links.json

This file was deleted.

14 changes: 0 additions & 14 deletions schemas/article_links_nested.json

This file was deleted.

38 changes: 0 additions & 38 deletions schemas/article_merged_meta.json

This file was deleted.

14 changes: 0 additions & 14 deletions schemas/paper_references_merged.json

This file was deleted.

File renamed without changes.
5 changes: 1 addition & 4 deletions schemas/table_descriptions.json
@@ -1,8 +1,5 @@
 {
     "all_metadata_with_cld2_lid": "All metadata for the articles used in linkage.",
-    "article_links": "Maps orig_ids to merged_ids",
-    "article_links_nested": "Maps one merged_id to all its constitutent orig_ids",
-    "article_links_with_dataset": "Maps a merged id to each of its orig_ids and their dataset",
-    "article_merged_meta": "*Deprecated* -- use gcp_cset_links_v2.corpus_merged",
+    "sources": "Maps a merged id to each of its orig_ids and their dataset",
     "references": "Maps a paper's merged id to the merged ids of papers it references"
 }
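
The DAG loads this table_descriptions.json map and hands it to update_table_descriptions in the populate_column_documentation tasks. As a hedged illustration only (not the repository's actual helper), applying such a name-to-description map with the google-cloud-bigquery client could look like this:

    import json

    from google.cloud import bigquery


    def apply_table_descriptions(project: str, dataset: str, path: str) -> None:
        """Set each table's description from a {table_name: description} JSON map."""
        client = bigquery.Client(project=project)
        with open(path) as f:
            descriptions = json.load(f)
        for table_name, description in descriptions.items():
            table = client.get_table(f"{project}.{dataset}.{table_name}")
            table.description = description
            client.update_table(table, ["description"])  # update only the description field


    # Example call with placeholder values:
    # apply_table_descriptions("my-gcp-project", "literature", "schemas/table_descriptions.json")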
7 changes: 2 additions & 5 deletions sequences/generate_merged_metadata.tsv
@@ -1,5 +1,2 @@
-article_links_with_meta
-article_links_nested
-article_merged_meta
-article_links_with_dataset
-references
+sources
+references
6 changes: 0 additions & 6 deletions sql/article_links_nested.sql

This file was deleted.

12 changes: 0 additions & 12 deletions sql/article_links_with_meta.sql

This file was deleted.

9 changes: 0 additions & 9 deletions sql/article_merged_meta.sql

This file was deleted.

2 changes: 1 addition & 1 deletion sql/references.sql
@@ -18,7 +18,7 @@ FROM (
   SELECT
     orig_id
   FROM
-    {{ staging_dataset }}.article_links_with_dataset )) AS references
+    {{ staging_dataset }}.sources )) AS references
 LEFT JOIN
   {{ staging_dataset }}.article_links AS links1
 ON
File renamed without changes.
2 changes: 1 addition & 1 deletion sql/union_metadata.sql
@@ -33,7 +33,7 @@ mapped_references as (
     meta
     cross join unnest(split(references, ",")) as orig_id_ref
     inner join
-    {{ production_dataset }}.article_links
+    {{ production_dataset }}.sources
     on orig_id_ref = orig_id
     group by id
 )
