diff --git a/tests/static/test_create_match_keys/input/input.jsonl b/tests/static/test_create_match_keys/input/input.jsonl
index 1876580..8bcbf43 100644
--- a/tests/static/test_create_match_keys/input/input.jsonl
+++ b/tests/static/test_create_match_keys/input/input.jsonl
@@ -5,3 +5,7 @@
 {"orig_id": "F", "merged_id": "carticle_0000000001"}
 {"orig_id": "I", "merged_id": "carticle_0000000003"}
 {"orig_id": "J", "merged_id": "carticle_0000000003"}
+{"orig_id": "K", "merged_id": "carticle_0000000004"}
+{"orig_id": "L", "merged_id": "carticle_0000000004"}
+{"orig_id": "M", "merged_id": "carticle_0000000005"}
+{"orig_id": "N", "merged_id": "carticle_0000000005"}
diff --git a/tests/test_create_merge_ids.py b/tests/test_create_merge_ids.py
index ee4b8cd..67be887 100644
--- a/tests/test_create_merge_ids.py
+++ b/tests/test_create_merge_ids.py
@@ -75,12 +75,15 @@ def test_skip_matches(self):
         )
 
     def test_create_match_keys(self):
-        # The first set (A, B, C) contains two old elts from the same match set and one new elt; should keep its id.
-        # The next (D, E, F) contains one elt from one match set, two from another; should change ids.
-        # Another (G, H) contains only new ids; should get a new id.
-        # The last two (I and J) are two different match sets that share an old id and are in ids_to_drop;
-        # each should get a new id (this is in case of unlinking).
-        match_sets = [{"A", "B", "C"}, {"D", "E", "F"}, {"G", "H"}, {"I"}, {"J"}]
+        match_sets = [
+            {"A", "B", "C"},
+            {"D", "E", "F"},
+            {"G", "H"},
+            {"I"},
+            {"J"},
+            {"K", "L"},
+            {"M", "N", "O"},
+        ]
         out_dir = os.path.join(static_dir, "test_create_match_keys", "output")
         if os.path.exists(out_dir):
             shutil.rmtree(out_dir)
@@ -89,16 +92,28 @@ def test_create_match_keys(self):
         id_mapping_dir = os.path.join(static_dir, "test_create_match_keys", "input")
         ids_to_drop = os.path.join(static_dir, "test_create_match_keys", "ids_to_drop")
         expected_output = [
-            {"orig_id": "A", "merged_id": "carticle_0000000001"},
-            {"orig_id": "B", "merged_id": "carticle_0000000001"},
-            {"orig_id": "C", "merged_id": "carticle_0000000001"},
-            {"orig_id": "D", "merged_id": "carticle_0000000004"},
-            {"orig_id": "E", "merged_id": "carticle_0000000004"},
-            {"orig_id": "F", "merged_id": "carticle_0000000004"},
-            {"orig_id": "G", "merged_id": "carticle_0000000005"},
-            {"orig_id": "H", "merged_id": "carticle_0000000005"},
-            {"orig_id": "I", "merged_id": "carticle_0000000006"},
-            {"orig_id": "J", "merged_id": "carticle_0000000007"},
+            # F was removed from this match set, so A, B, and C should get a new merged id
+            {"orig_id": "A", "merged_id": "carticle_0000000006"},
+            {"orig_id": "B", "merged_id": "carticle_0000000006"},
+            {"orig_id": "C", "merged_id": "carticle_0000000006"},
+            # D, E, F contains one elt from one match set, two from another; should change ids
+            {"orig_id": "D", "merged_id": "carticle_0000000007"},
+            {"orig_id": "E", "merged_id": "carticle_0000000007"},
+            {"orig_id": "F", "merged_id": "carticle_0000000007"},
+            # G, H is a completely new match set with new ids, so it should get a new id
+            {"orig_id": "G", "merged_id": "carticle_0000000008"},
+            {"orig_id": "H", "merged_id": "carticle_0000000008"},
+            # The last two (I and J) are two different match sets that share an old id and are in ids_to_drop;
+            # each should get a new id
+            {"orig_id": "I", "merged_id": "carticle_0000000009"},
+            {"orig_id": "J", "merged_id": "carticle_0000000010"},
+            # Nothing changed for this match set, so the merged id stays the same
+            {"orig_id": "K", "merged_id": "carticle_0000000004"},
+            {"orig_id": "L", "merged_id": "carticle_0000000004"},
+            # This match set got one new article, so the merged id stays the same
+            {"orig_id": "M", "merged_id": "carticle_0000000005"},
+            {"orig_id": "N", "merged_id": "carticle_0000000005"},
+            {"orig_id": "O", "merged_id": "carticle_0000000005"},
         ]
         print(expected_output)
         create_match_keys(match_sets, out_fi, ids_to_drop, id_mapping_dir)
diff --git a/utils/create_merge_ids.py b/utils/create_merge_ids.py
index 299d133..06a5472 100644
--- a/utils/create_merge_ids.py
+++ b/utils/create_merge_ids.py
@@ -139,8 +139,10 @@ def create_match_keys(
     :param prev_id_mapping_dir: optional dir containing previous id mappings in jsonl form
     :return: None
     """
+    print("Creating merged ids")
     with open(match_file, mode="w") as out:
         prev_orig_to_merg = {}
+        merg_to_orig = {}
         max_merg = "carticle_0"
         if prev_id_mapping_dir is not None:
             for fi in os.listdir(prev_id_mapping_dir):
@@ -151,6 +153,9 @@ def create_match_keys(
                         merg_id = js["merged_id"]
                         assert orig_id not in prev_orig_to_merg
                         prev_orig_to_merg[orig_id] = merg_id
+                        if merg_id not in merg_to_orig:
+                            merg_to_orig[merg_id] = set()
+                        merg_to_orig[merg_id].add(orig_id)
                         if merg_id > max_merg:
                             max_merg = merg_id
         ignore_ids = set()
@@ -171,6 +176,10 @@ def create_match_keys(
             )
             if len(existing_ids) == 1 and list(existing_ids)[0] not in ignore_ids:
                 cset_article_id = existing_ids.pop()
+            # In some cases, merged ids can "split apart" if their constituent articles no longer
+            # match. We'll detect this case by checking whether the old set of articles assigned to
+            # this merged id contains any entries missing from our current set
+            if cset_article_id and (len(merg_to_orig[cset_article_id] - set(ms)) == 0):
                 num_old += 1
             else:
                 cset_article_id = create_cset_article_id(match_id)