diff --git a/linkage_dag.py b/linkage_dag.py index 4711aa8..a16a6c4 100644 --- a/linkage_dag.py +++ b/linkage_dag.py @@ -258,7 +258,7 @@ heavy_compute_inputs = [ BigQueryToGCSOperator( task_id="export_old_cset_ids", - source_project_dataset_table=f"{production_dataset}.article_links", + source_project_dataset_table=f"{production_dataset}.sources", destination_cloud_storage_uris=f"gs://{bucket}/{tmp_dir}/prev_id_mapping/prev_id_mapping*.jsonl", export_format="NEWLINE_DELIMITED_JSON" ), @@ -361,8 +361,8 @@ task_id="import_id_mapping", bucket=bucket, source_objects=[f"{tmp_dir}/id_mapping.jsonl"], - schema_object=f"{schema_dir}/article_links.json", - destination_project_dataset_table=f"{staging_dataset}.article_links", + schema_object=f"{schema_dir}/sources.json", + destination_project_dataset_table=f"{staging_dataset}.sources", source_format="NEWLINE_DELIMITED_JSON", create_disposition="CREATE_IF_NEEDED", write_disposition="WRITE_TRUNCATE" @@ -429,7 +429,7 @@ BigQueryCheckOperator( task_id="all_ids_survived", sql=(f"select count(0) = 0 from (select id from {staging_dataset}.union_ids " - f"where id not in (select orig_id from {staging_dataset}.article_links))"), + f"where id not in (select orig_id from {staging_dataset}.sources))"), use_legacy_sql=False ), BigQueryCheckOperator( @@ -443,9 +443,9 @@ select concat(links1.orig_id, " ", links2.orig_id) from - {staging_dataset}.article_links links1 + {staging_dataset}.sources links1 left join - {staging_dataset}.article_links links2 + {staging_dataset}.sources links2 on links1.merged_id = links2.merged_id ) """, diff --git a/sql/references.sql b/sql/references.sql index a3f514d..51b003e 100644 --- a/sql/references.sql +++ b/sql/references.sql @@ -15,18 +15,18 @@ WITH references AS ( SELECT orig_id FROM - {{ staging_dataset }}.article_links ) + {{ staging_dataset }}.sources ) ) SELECT DISTINCT referencing_papers.merged_id AS merged_id, referenced_papers.merged_id AS ref_id FROM references LEFT JOIN - {{ staging_dataset }}.article_links AS referencing_papers + {{ staging_dataset }}.sources AS referencing_papers ON references.id = referencing_papers.orig_id LEFT JOIN - {{ staging_dataset }}.article_links AS referenced_papers + {{ staging_dataset }}.sources AS referenced_papers ON references.reference = referenced_papers.orig_id WHERE diff --git a/sql/sources.sql b/sql/sources.sql index c7985c6..5287ec1 100644 --- a/sql/sources.sql +++ b/sql/sources.sql @@ -1,9 +1,9 @@ --- add orig_id dataset to the article_links table +-- add orig_id dataset to the sources table select distinct a.merged_id, a.orig_id, b.dataset -from {{staging_dataset}}.article_links a +from {{staging_dataset}}.sources a left join {{staging_dataset}}.union_metadata b -on a.orig_id = b.id \ No newline at end of file +on a.orig_id = b.id