Skip to content

Commit

Permalink
Remove more stray references to article_links
Browse files Browse the repository at this point in the history
  • Loading branch information
jmelot committed Aug 16, 2023
1 parent 2d2f835 commit d10f871
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 12 deletions.
12 changes: 6 additions & 6 deletions linkage_dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@
heavy_compute_inputs = [
BigQueryToGCSOperator(
task_id="export_old_cset_ids",
- source_project_dataset_table=f"{production_dataset}.article_links",
+ source_project_dataset_table=f"{production_dataset}.sources",
destination_cloud_storage_uris=f"gs://{bucket}/{tmp_dir}/prev_id_mapping/prev_id_mapping*.jsonl",
export_format="NEWLINE_DELIMITED_JSON"
),
Expand Down Expand Up @@ -361,8 +361,8 @@
task_id="import_id_mapping",
bucket=bucket,
source_objects=[f"{tmp_dir}/id_mapping.jsonl"],
- schema_object=f"{schema_dir}/article_links.json",
- destination_project_dataset_table=f"{staging_dataset}.article_links",
+ schema_object=f"{schema_dir}/sources.json",
+ destination_project_dataset_table=f"{staging_dataset}.sources",
source_format="NEWLINE_DELIMITED_JSON",
create_disposition="CREATE_IF_NEEDED",
write_disposition="WRITE_TRUNCATE"
Expand Down Expand Up @@ -429,7 +429,7 @@
BigQueryCheckOperator(
task_id="all_ids_survived",
sql=(f"select count(0) = 0 from (select id from {staging_dataset}.union_ids "
- f"where id not in (select orig_id from {staging_dataset}.article_links))"),
+ f"where id not in (select orig_id from {staging_dataset}.sources))"),
use_legacy_sql=False
),
BigQueryCheckOperator(
Expand All @@ -443,9 +443,9 @@
select
concat(links1.orig_id, " ", links2.orig_id)
from
- {staging_dataset}.article_links links1
+ {staging_dataset}.sources links1
left join
- {staging_dataset}.article_links links2
+ {staging_dataset}.sources links2
on links1.merged_id = links2.merged_id
)
""",
Expand Down
6 changes: 3 additions & 3 deletions sql/references.sql
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,18 @@ WITH references AS (
SELECT
orig_id
FROM
- {{ staging_dataset }}.article_links )
+ {{ staging_dataset }}.sources )
)
SELECT
DISTINCT referencing_papers.merged_id AS merged_id,
referenced_papers.merged_id AS ref_id
FROM references
LEFT JOIN
- {{ staging_dataset }}.article_links AS referencing_papers
+ {{ staging_dataset }}.sources AS referencing_papers
ON
references.id = referencing_papers.orig_id
LEFT JOIN
- {{ staging_dataset }}.article_links AS referenced_papers
+ {{ staging_dataset }}.sources AS referenced_papers
ON
references.reference = referenced_papers.orig_id
WHERE
Expand Down
6 changes: 3 additions & 3 deletions sql/sources.sql
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
- -- add orig_id dataset to the article_links table
+ -- add orig_id dataset to the sources table
select distinct
a.merged_id,
a.orig_id,
b.dataset
- from {{staging_dataset}}.article_links a
+ from {{staging_dataset}}.sources a
left join
{{staging_dataset}}.union_metadata b
- on a.orig_id = b.id
+ on a.orig_id = b.id

0 comments on commit d10f871

Please sign in to comment.