Rename references table and remove table duplication

Closes georgetown-cset/cset_article_schema#51
georgetown-cset · Jul 19, 2023 · 4b5c296 · 4b5c296
1 parent 5a6bcc0
commit 4b5c296
Show file tree

Hide file tree

Showing 5 changed files with 9 additions and 32 deletions.
diff --git a/linkage_dag.py b/linkage_dag.py
@@ -411,7 +411,7 @@
     # along the way
     check_queries = []
     production_tables = ["all_metadata_with_cld2_lid", "article_links", "article_links_with_dataset",
-                         "article_merged_meta", "mapped_references", "article_links_nested"]
+                         "article_merged_meta", "references", "article_links_nested"]
     for table_name in production_tables:
         check_queries.append(BigQueryCheckOperator(
             task_id="check_monotonic_increase_"+table_name.lower(),
@@ -456,7 +456,7 @@
         ),
         BigQueryCheckOperator(
             task_id="no_null_references",
-            sql=f"select count(0) = 0 from {staging_dataset}.mapped_references where id is null or ref_id is null",
+            sql=f"select count(0) = 0 from {staging_dataset}.references where merged_id is null or ref_id is null",
             use_legacy_sql = False
         ),
     ])
@@ -474,33 +474,11 @@
             write_disposition="WRITE_TRUNCATE"
         ))
 
-    # this query is essentially just copying mapped_references to paper_references_merged, so
-    # putting this in the push_to_production array is not risky
-    push_to_production.append(
-        BigQueryInsertJobOperator(
-            task_id="copy_mapped_references_to_paper_references_merged",
-            configuration={
-                "query": {
-                    "query": f"select id as merged_id, ref_id from {staging_dataset}.mapped_references",
-                    "useLegacySql": False,
-                    "destinationTable": {
-                        "projectId": project_id,
-                        "datasetId": production_dataset,
-                        "tableId": "paper_references_merged"
-                    },
-                    "allowLargeResults": True,
-                    "createDisposition": "CREATE_IF_NEEDED",
-                    "writeDisposition": "WRITE_TRUNCATE"
-                }
-            },
-        )
-    )
-
     wait_for_production_copy = DummyOperator(task_id="wait_for_production_copy")
 
     snapshots = []
     curr_date = datetime.now().strftime("%Y%m%d")
-    for table in ["article_links", "article_links_nested", "paper_references_merged"]:
+    for table in ["article_links", "article_links_nested"]:
         # mk the snapshot predictions table
         snapshots.append(BigQueryToBigQueryOperator(
             task_id=f"snapshot_{table}",
@@ -516,7 +494,7 @@
 
     with open(f"{os.environ.get('DAGS_FOLDER')}/schemas/{gcs_folder}/table_descriptions.json") as f:
         table_desc = json.loads(f.read())
-    for table in production_tables + ["paper_references_merged"]:
+    for table in production_tables:
         pop_descriptions = PythonOperator(
             task_id="populate_column_documentation_for_" + table,
             op_kwargs={

diff --git a/schemas/mapped_references.json → schemas/references.json b/schemas/mapped_references.json → schemas/references.json
@@ -1,7 +1,7 @@
 [
   {
     "mode": "REQUIRED",
-    "name": "id",
+    "name": "merged_id",
     "type": "STRING",
     "description": "CSET merged id of a set of articles"
   },
@@ -11,4 +11,4 @@
     "type": "STRING",
-    "description": "Articles referenced by `id`"
+    "description": "Articles referenced by `merged_id`"
   }
-]
+]
diff --git a/schemas/table_descriptions.json b/schemas/table_descriptions.json
@@ -4,6 +4,5 @@
-  "article_links_nested": "Maps one merged_id to all its constitutent orig_ids",
+  "article_links_nested": "Maps one merged_id to all its constituent orig_ids",
   "article_links_with_dataset": "Maps a merged id to each of its orig_ids and their dataset",
   "article_merged_meta": "*Deprecated* -- use gcp_cset_links_v2.corpus_merged",
-  "mapped_references": "Maps a paper's id to the id of papers it references",
-  "paper_references_merged": "Maps a paper's id to the id of papers it references. This table is the same as mapped_references."
+  "references": "Maps a paper's merged id to the merged ids of papers it references"
 }
diff --git a/sequences/generate_merged_metadata.tsv b/sequences/generate_merged_metadata.tsv
@@ -2,4 +2,4 @@ article_links_with_meta
 article_links_nested
 article_merged_meta
 article_links_with_dataset
-mapped_references
+references
diff --git a/sql/mapped_references.sql → sql/references.sql b/sql/mapped_references.sql → sql/references.sql
@@ -3,7 +3,7 @@
 -- For each merged_id, we take all its orig_ids' references, and look up the merged_ids of the references.
 -- We exclude references that appear outside our merged corpus.
 SELECT
-  DISTINCT links1.merged_id AS id,
+  DISTINCT links1.merged_id AS merged_id,
   links2.merged_id AS ref_id
 FROM (
   SELECT