Skip to content

Commit

Permalink
Merge pull request #41 from georgetown-cset/improve-match-check-performance
Browse files Browse the repository at this point in the history

Improve performance of query to check that trivial article matches survive
  • Loading branch information
katnquinn authored Apr 24, 2024
2 parents 9750867 + c43a0c7 commit 97aa649
Showing 1 changed file with 18 additions and 14 deletions.
32 changes: 18 additions & 14 deletions linkage_dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -533,20 +533,24 @@
BigQueryCheckOperator(
task_id="all_trivial_matches_survived",
sql=f"""
select
count(concat(all1_id, " ", all2_id)) = 0
from
{staging_dataset}.metadata_match
where concat(all1_id, " ", all2_id) not in (
select
concat(links1.orig_id, " ", links2.orig_id)
from
{staging_dataset}.sources links1
left join
{staging_dataset}.sources links2
on links1.merged_id = links2.merged_id
)
""",
-- check that all article pairs generated by exact matches make it through the simhash and
-- merged id assignment, except ones we've deliberately unlinked
select
count(0) = 0
from
{staging_dataset}.metadata_match
left join
{staging_dataset}.sources as links1
on all1_id = links1.orig_id
left join
{staging_dataset}.sources as links2
on (links1.merged_id = links2.merged_id) and (all2_id = links2.orig_id)
-- don't count pairs which we've deliberately unlinked
left join
{staging_dataset}.unlink
on (all1_id = id1) and (all2_id = id2)
where ((links1.orig_id is null) or (links2.orig_id is null)) and ((id1 is null) and (id2 is null))
""",
use_legacy_sql=False,
),
BigQueryCheckOperator(
Expand Down

0 comments on commit 97aa649

Please sign in to comment.