Skip to content

Commit

Permalink
Rename references table and remove table duplication
Browse files Browse the repository at this point in the history
  • Loading branch information
jmelot committed Jul 19, 2023
1 parent 5a6bcc0 commit 4b5c296
Show file tree
Hide file tree
Showing 5 changed files with 9 additions and 32 deletions.
30 changes: 4 additions & 26 deletions linkage_dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,7 @@
# along the way
check_queries = []
production_tables = ["all_metadata_with_cld2_lid", "article_links", "article_links_with_dataset",
"article_merged_meta", "mapped_references", "article_links_nested"]
"article_merged_meta", "references", "article_links_nested"]
for table_name in production_tables:
check_queries.append(BigQueryCheckOperator(
task_id="check_monotonic_increase_"+table_name.lower(),
Expand Down Expand Up @@ -456,7 +456,7 @@
),
BigQueryCheckOperator(
task_id="no_null_references",
sql=f"select count(0) = 0 from {staging_dataset}.mapped_references where id is null or ref_id is null",
sql=f"select count(0) = 0 from {staging_dataset}.references where id is null or ref_id is null",
use_legacy_sql = False
),
])
Expand All @@ -474,33 +474,11 @@
write_disposition="WRITE_TRUNCATE"
))

# this query is essentially just copying mapped_references to paper_references_merged, so
# putting this in the push_to_production array is not risky
push_to_production.append(
BigQueryInsertJobOperator(
task_id="copy_mapped_references_to_paper_references_merged",
configuration={
"query": {
"query": f"select id as merged_id, ref_id from {staging_dataset}.mapped_references",
"useLegacySql": False,
"destinationTable": {
"projectId": project_id,
"datasetId": production_dataset,
"tableId": "paper_references_merged"
},
"allowLargeResults": True,
"createDisposition": "CREATE_IF_NEEDED",
"writeDisposition": "WRITE_TRUNCATE"
}
},
)
)

wait_for_production_copy = DummyOperator(task_id="wait_for_production_copy")

snapshots = []
curr_date = datetime.now().strftime("%Y%m%d")
for table in ["article_links", "article_links_nested", "paper_references_merged"]:
for table in ["article_links", "article_links_nested"]:
# mk the snapshot predictions table
snapshots.append(BigQueryToBigQueryOperator(
task_id=f"snapshot_{table}",
Expand All @@ -516,7 +494,7 @@

with open(f"{os.environ.get('DAGS_FOLDER')}/schemas/{gcs_folder}/table_descriptions.json") as f:
table_desc = json.loads(f.read())
for table in production_tables + ["paper_references_merged"]:
for table in production_tables:
pop_descriptions = PythonOperator(
task_id="populate_column_documentation_for_" + table,
op_kwargs={
Expand Down
4 changes: 2 additions & 2 deletions schemas/mapped_references.json → schemas/references.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[
{
"mode": "REQUIRED",
"name": "id",
"name": "merged_id",
"type": "STRING",
"description": "CSET merged id of a set of articles"
},
Expand All @@ -11,4 +11,4 @@
"type": "STRING",
"description": "Articles referenced by `merged_id`"
}
]
]
3 changes: 1 addition & 2 deletions schemas/table_descriptions.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,5 @@
"article_links_nested": "Maps one merged_id to all its constituent orig_ids",
"article_links_with_dataset": "Maps a merged id to each of its orig_ids and their dataset",
"article_merged_meta": "*Deprecated* -- use gcp_cset_links_v2.corpus_merged",
"mapped_references": "Maps a paper's id to the id of papers it references",
"paper_references_merged": "Maps a paper's id to the id of papers it references. This table is the same as mapped_references."
"references": "Maps a paper's merged id to the merged ids of papers it references"
}
2 changes: 1 addition & 1 deletion sequences/generate_merged_metadata.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ article_links_with_meta
article_links_nested
article_merged_meta
article_links_with_dataset
mapped_references
references
2 changes: 1 addition & 1 deletion sql/mapped_references.sql → sql/references.sql
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
-- For each merged_id, we take all its orig_ids' references, and look up the merged_ids of the references.
-- We exclude references that appear outside our merged corpus.
SELECT
DISTINCT links1.merged_id AS id,
DISTINCT links1.merged_id AS merged_id,
links2.merged_id AS ref_id
FROM (
SELECT
Expand Down

0 comments on commit 4b5c296

Please sign in to comment.