Skip to content

Commit

Permalink
Split merged ids if they lose orig ids
Browse files Browse the repository at this point in the history
Closes #45
  • Loading branch information
jmelot committed Aug 14, 2024
1 parent c046654 commit 017ac4d
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 16 deletions.
4 changes: 4 additions & 0 deletions tests/static/test_create_match_keys/input/input.jsonl
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,7 @@
{"orig_id": "F", "merged_id": "carticle_0000000001"}
{"orig_id": "I", "merged_id": "carticle_0000000003"}
{"orig_id": "J", "merged_id": "carticle_0000000003"}
{"orig_id": "K", "merged_id": "carticle_0000000004"}
{"orig_id": "L", "merged_id": "carticle_0000000004"}
{"orig_id": "M", "merged_id": "carticle_0000000005"}
{"orig_id": "N", "merged_id": "carticle_0000000005"}
47 changes: 31 additions & 16 deletions tests/test_create_merge_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,12 +75,15 @@ def test_skip_matches(self):
)

def test_create_match_keys(self):
# The first set (A, B, C) contains two old elts from the same match set and one new elt; should keep its id.
# The next (D, E, F) contains one elt from one match set, two from another; should change ids.
# Another (G, H) contains only new ids; should get a new id.
# The last two (I and J) are two different match sets that share an old id and are in ids_to_drop;
# each should get a new id (this is in case of unlinking).
match_sets = [{"A", "B", "C"}, {"D", "E", "F"}, {"G", "H"}, {"I"}, {"J"}]
match_sets = [
{"A", "B", "C"},
{"D", "E", "F"},
{"G", "H"},
{"I"},
{"J"},
{"K", "L"},
{"M", "N", "O"},
]
out_dir = os.path.join(static_dir, "test_create_match_keys", "output")
if os.path.exists(out_dir):
shutil.rmtree(out_dir)
Expand All @@ -89,16 +92,28 @@ def test_create_match_keys(self):
id_mapping_dir = os.path.join(static_dir, "test_create_match_keys", "input")
ids_to_drop = os.path.join(static_dir, "test_create_match_keys", "ids_to_drop")
expected_output = [
{"orig_id": "A", "merged_id": "carticle_0000000001"},
{"orig_id": "B", "merged_id": "carticle_0000000001"},
{"orig_id": "C", "merged_id": "carticle_0000000001"},
{"orig_id": "D", "merged_id": "carticle_0000000004"},
{"orig_id": "E", "merged_id": "carticle_0000000004"},
{"orig_id": "F", "merged_id": "carticle_0000000004"},
{"orig_id": "G", "merged_id": "carticle_0000000005"},
{"orig_id": "H", "merged_id": "carticle_0000000005"},
{"orig_id": "I", "merged_id": "carticle_0000000006"},
{"orig_id": "J", "merged_id": "carticle_0000000007"},
            # F was removed from this match set, so A, B, and C should get a new merged id
{"orig_id": "A", "merged_id": "carticle_0000000006"},
{"orig_id": "B", "merged_id": "carticle_0000000006"},
{"orig_id": "C", "merged_id": "carticle_0000000006"},
# D, E, F contains one elt from one match set, two from another; should change ids
{"orig_id": "D", "merged_id": "carticle_0000000007"},
{"orig_id": "E", "merged_id": "carticle_0000000007"},
{"orig_id": "F", "merged_id": "carticle_0000000007"},
# G, H is a completely new match set with new ids, should get a new id
{"orig_id": "G", "merged_id": "carticle_0000000008"},
{"orig_id": "H", "merged_id": "carticle_0000000008"},
# The last two (I and J) are two different match sets that share an old id and are in ids_to_drop;
# each should get a new id
{"orig_id": "I", "merged_id": "carticle_0000000009"},
{"orig_id": "J", "merged_id": "carticle_0000000010"},
# Nothing changed for this match set so the merged id stays the same
{"orig_id": "K", "merged_id": "carticle_0000000004"},
{"orig_id": "L", "merged_id": "carticle_0000000004"},
            # This match set got one new article, so the merged id stays the same
{"orig_id": "M", "merged_id": "carticle_0000000005"},
{"orig_id": "N", "merged_id": "carticle_0000000005"},
{"orig_id": "O", "merged_id": "carticle_0000000005"},
]
print(expected_output)
create_match_keys(match_sets, out_fi, ids_to_drop, id_mapping_dir)
Expand Down
9 changes: 9 additions & 0 deletions utils/create_merge_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,10 @@ def create_match_keys(
:param prev_id_mapping_dir: optional dir containing previous id mappings in jsonl form
:return: None
"""
print("Creating merged ids")
with open(match_file, mode="w") as out:
prev_orig_to_merg = {}
merg_to_orig = {}
max_merg = "carticle_0"
if prev_id_mapping_dir is not None:
for fi in os.listdir(prev_id_mapping_dir):
Expand All @@ -151,6 +153,9 @@ def create_match_keys(
merg_id = js["merged_id"]
assert orig_id not in prev_orig_to_merg
prev_orig_to_merg[orig_id] = merg_id
if merg_id not in merg_to_orig:
merg_to_orig[merg_id] = set()
merg_to_orig[merg_id].add(orig_id)
if merg_id > max_merg:
max_merg = merg_id
ignore_ids = set()
Expand All @@ -171,6 +176,10 @@ def create_match_keys(
)
if len(existing_ids) == 1 and list(existing_ids)[0] not in ignore_ids:
cset_article_id = existing_ids.pop()
                # In some cases, merged ids can "split apart" if their constituent articles no longer
                # match. We'll detect this case by checking whether the old set of articles assigned to
                # this merged id contains any entries missing from our current set
if cset_article_id and (len(merg_to_orig[cset_article_id] - set(ms)) == 0):
num_old += 1
else:
cset_article_id = create_cset_article_id(match_id)
Expand Down

0 comments on commit 017ac4d

Please sign in to comment.