Skip to content

Commit

Permalink
Split merged ids if they lose orig ids
Browse files Browse the repository at this point in the history
Closes #45
  • Loading branch information
jmelot committed Aug 14, 2024
1 parent c046654 commit 017ac4d
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 16 deletions.
4 changes: 4 additions & 0 deletions tests/static/test_create_match_keys/input/input.jsonl
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,7 @@
{"orig_id": "F", "merged_id": "carticle_0000000001"}
{"orig_id": "I", "merged_id": "carticle_0000000003"}
{"orig_id": "J", "merged_id": "carticle_0000000003"}
{"orig_id": "K", "merged_id": "carticle_0000000004"}
{"orig_id": "L", "merged_id": "carticle_0000000004"}
{"orig_id": "M", "merged_id": "carticle_0000000005"}
{"orig_id": "N", "merged_id": "carticle_0000000005"}
47 changes: 31 additions & 16 deletions tests/test_create_merge_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,12 +75,15 @@ def test_skip_matches(self):
)

def test_create_match_keys(self):
# The first set (A, B, C) contains two old elts from the same match set and one new elt; should keep its id.
# The next (D, E, F) contains one elt from one match set, two from another; should change ids.
# Another (G, H) contains only new ids; should get a new id.
# The last two (I and J) are two different match sets that share an old id and are in ids_to_drop;
# each should get a new id (this is in case of unlinking).
match_sets = [{"A", "B", "C"}, {"D", "E", "F"}, {"G", "H"}, {"I"}, {"J"}]
match_sets = [
{"A", "B", "C"},
{"D", "E", "F"},
{"G", "H"},
{"I"},
{"J"},
{"K", "L"},
{"M", "N", "O"},
]
out_dir = os.path.join(static_dir, "test_create_match_keys", "output")
if os.path.exists(out_dir):
shutil.rmtree(out_dir)
Expand All @@ -89,16 +92,28 @@ def test_create_match_keys(self):
id_mapping_dir = os.path.join(static_dir, "test_create_match_keys", "input")
ids_to_drop = os.path.join(static_dir, "test_create_match_keys", "ids_to_drop")
expected_output = [
{"orig_id": "A", "merged_id": "carticle_0000000001"},
{"orig_id": "B", "merged_id": "carticle_0000000001"},
{"orig_id": "C", "merged_id": "carticle_0000000001"},
{"orig_id": "D", "merged_id": "carticle_0000000004"},
{"orig_id": "E", "merged_id": "carticle_0000000004"},
{"orig_id": "F", "merged_id": "carticle_0000000004"},
{"orig_id": "G", "merged_id": "carticle_0000000005"},
{"orig_id": "H", "merged_id": "carticle_0000000005"},
{"orig_id": "I", "merged_id": "carticle_0000000006"},
{"orig_id": "J", "merged_id": "carticle_0000000007"},
            # F was removed from this match set, so A, B, and C should get a new merged id
{"orig_id": "A", "merged_id": "carticle_0000000006"},
{"orig_id": "B", "merged_id": "carticle_0000000006"},
{"orig_id": "C", "merged_id": "carticle_0000000006"},
# D, E, F contains one elt from one match set, two from another; should change ids
{"orig_id": "D", "merged_id": "carticle_0000000007"},
{"orig_id": "E", "merged_id": "carticle_0000000007"},
{"orig_id": "F", "merged_id": "carticle_0000000007"},
# G, H is a completely new match set with new ids, should get a new id
{"orig_id": "G", "merged_id": "carticle_0000000008"},
{"orig_id": "H", "merged_id": "carticle_0000000008"},
# The last two (I and J) are two different match sets that share an old id and are in ids_to_drop;
# each should get a new id
{"orig_id": "I", "merged_id": "carticle_0000000009"},
{"orig_id": "J", "merged_id": "carticle_0000000010"},
# Nothing changed for this match set so the merged id stays the same
{"orig_id": "K", "merged_id": "carticle_0000000004"},
{"orig_id": "L", "merged_id": "carticle_0000000004"},
            # This match set got one new article, so the merged id stays the same
{"orig_id": "M", "merged_id": "carticle_0000000005"},
{"orig_id": "N", "merged_id": "carticle_0000000005"},
{"orig_id": "O", "merged_id": "carticle_0000000005"},
]
print(expected_output)
create_match_keys(match_sets, out_fi, ids_to_drop, id_mapping_dir)
Expand Down
9 changes: 9 additions & 0 deletions utils/create_merge_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,10 @@ def create_match_keys(
:param prev_id_mapping_dir: optional dir containing previous id mappings in jsonl form
:return: None
"""
print("Creating merged ids")
with open(match_file, mode="w") as out:
prev_orig_to_merg = {}
merg_to_orig = {}
max_merg = "carticle_0"
if prev_id_mapping_dir is not None:
for fi in os.listdir(prev_id_mapping_dir):
Expand All @@ -151,6 +153,9 @@ def create_match_keys(
merg_id = js["merged_id"]
assert orig_id not in prev_orig_to_merg
prev_orig_to_merg[orig_id] = merg_id
if merg_id not in merg_to_orig:
merg_to_orig[merg_id] = set()
merg_to_orig[merg_id].add(orig_id)
if merg_id > max_merg:
max_merg = merg_id
ignore_ids = set()
Expand All @@ -171,6 +176,10 @@ def create_match_keys(
)
if len(existing_ids) == 1 and list(existing_ids)[0] not in ignore_ids:
cset_article_id = existing_ids.pop()
                # In some cases, merged ids can "split apart" if their constituent articles no longer
                # match. We'll detect this case by checking whether the old set of articles assigned to
                # this merged id contains any entries missing from our current set
if cset_article_id and (len(merg_to_orig[cset_article_id] - set(ms)) == 0):
num_old += 1
else:
cset_article_id = create_cset_article_id(match_id)
Expand Down

0 comments on commit 017ac4d

Please sign in to comment.