@@ -13,10 +13,22 @@
 from definitions import TIMESTAMP_FMT
 
 MATCH_THRESHOLD = 0.85
-FN_WEIGHT = 0.2
-PHONE_WEIGHT = 0.15
-ADDR_WEIGHT = 0.35
-ZIP_WEIGHT = 0.3
+FN_WEIGHT = 0.25
+PHONE_WEIGHT = 0.2
+ADDR_WEIGHT = 0.55
+# ZIP_WEIGHT = 0.25
+# zip is not used in weighting since all candidate pairs match on zip
+
+# a separate address threshold so that pairs with medium-low scores across all fields
+# don't wind up getting matched anyway
+ADDR_THRESHOLD = 0.95
+# using address_distance() below:
+# "205 GARDEN ST" v "206 GARDEN ST" --> 0.8333
+# "205 GARDEN ST" v "205 GAREDN ST" --> 0.98444
+# "205 GARDEN STREET" v "205 GAREDN ST" --> 0.9666
+# "205 GARDEN ST APT 5F" v "205 GARDEN ST APT 5J" --> 0.9472
+# so 0.95 should give us a good balance of not linking all apartments together
+# while still allowing some room for typos and variation
 
 
 def addr_parse(addr):
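A quick sanity check on the new weights: with zip dropped, FN_WEIGHT + PHONE_WEIGHT + ADDR_WEIGHT still sums to 1.0, and a perfect address plus a perfect family name alone is not enough to clear MATCH_THRESHOLD. A minimal sketch (the field scores here are invented for illustration):

```python
# each field similarity score is in [0, 1]; values are hypothetical
addr, fn, phone = 1.0, 1.0, 0.0
score = addr * 0.55 + fn * 0.25 + phone * 0.2
print(score, score > 0.85)  # 0.8 False: some phone similarity is also needed
```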
@@ -53,6 +65,23 @@ def address_distance(addr1, addr2):
     score = 0
     secondary_score = 0
 
+    a1 = addr1["household_street_address"]
+    a2 = addr2["household_street_address"]
+
+    if not a1 or not a2:
+        # if either is blank they get a score of 0
+        # this matches textdistance.jaro_winkler("", x)
+        # but textdistance.jaro_winkler("", "") is normally 1
+        # without this, 2 missing addresses could be a "perfect match"
+        # which is not what we want
+        return 0
+
+    if a1 == a2:
+        # if the strings are exactly identical,
+        # don't waste time with detailed comparisons
+        # this matches textdistance.jaro_winkler(x, x)
+        return 1
+
     # Change weights based on existence of second level address
     if (
         not addr1["prefix"]
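For reference, the edge cases this guard works around can be reproduced directly with the textdistance package (a minimal sketch, not part of the diff):

```python
import textdistance

# two blank strings are "identical", which would otherwise score as a perfect match
print(textdistance.jaro_winkler("", ""))               # 1.0
# one blank string scores 0 against anything non-blank
print(textdistance.jaro_winkler("", "205 GARDEN ST"))  # 0.0
```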
@@ -213,15 +242,10 @@ def address_distance(addr1, addr2):
 
     # See if simple string compare of all things combined
     # with a 0.6 adjustment is better
-    a1 = addr1["household_street_address"]
-    a2 = addr2["household_street_address"]
-    if a1 and a2:
-        score = max(
-            score,
-            textdistance.jaro_winkler(a1, a2)
-            * (weight_number + weight_street_name)
-            * 0.6,
-        ) + (secondary_score * weight_secondary)
+    score = max(
+        score,
+        textdistance.jaro_winkler(a1, a2) * (weight_number + weight_street_name) * 0.6,
+    ) + (secondary_score * weight_secondary)
 
     return score
 
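The simplified fallback reads more clearly now that a1 and a2 are guaranteed non-blank by the early returns added above. Roughly, the 0.6 adjustment caps the whole-string comparison at 60% of the number-plus-street weight, so it only wins when the component-wise score was low; a sketch with invented weights (the real weight_number and weight_street_name are set earlier in address_distance):

```python
import textdistance

weight_number, weight_street_name = 0.25, 0.5  # hypothetical values
fallback = (
    textdistance.jaro_winkler("205 GARDEN ST", "205 GAREDN ST")
    * (weight_number + weight_street_name)
    * 0.6
)
print(fallback)  # at most 0.45 here, no matter how similar the two strings are
```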
@@ -271,7 +295,9 @@ def explode_address(row):
     return parsed
 
 
-def get_household_matches(pii_lines, split_factor=4, debug=False, pairsfile=None):
+def get_household_matches(
+    pii_lines, split_factor=4, debug=False, exact_addresses=False, pairsfile=None
+):
     if pairsfile:
         if debug:
             print(f"[{datetime.now()}] Loading matching pairs file")
@@ -283,28 +309,42 @@ def get_household_matches(pii_lines, split_factor=4, debug=False, pairsfile=None
             print(f"[{datetime.now()}] Done loading matching pairs")
 
     else:
-        # break out the address into number, street, suffix, etc,
-        # so we can prefilter matches based on those
-        addr_cols = pii_lines.apply(
-            explode_address,
-            axis="columns",
-            result_type="expand",
-        )
-        pii_lines_exploded = pd.concat([pii_lines, addr_cols], axis="columns")
+
+        if exact_addresses:
+            pii_lines_exploded = pii_lines
+        else:
+            # break out the address into number, street, suffix, etc,
+            # so we can prefilter matches based on those
+            addr_cols = pii_lines.apply(
+                explode_address,
+                axis="columns",
+                result_type="expand",
+            )
+            pii_lines_exploded = pd.concat([pii_lines, addr_cols], axis="columns")
 
         if debug:
             print(f"[{datetime.now()}] Done pre-processing PII file")
 
-        candidate_links = get_candidate_links(pii_lines_exploded, split_factor, debug)
-        gc.collect()
-
-        matching_pairs = get_matching_pairs(
-            pii_lines_exploded, candidate_links, split_factor, debug
+        candidate_links = get_candidate_links(
+            pii_lines_exploded, split_factor, exact_addresses, debug
         )
-        del candidate_links
-        del pii_lines_exploded
         gc.collect()
 
+        if exact_addresses:
+            # the candidate links are already all the pairs with matching [address, zip]
+            matching_pairs = candidate_links
+        else:
+            matching_pairs = get_matching_pairs(
+                pii_lines_exploded,
+                candidate_links,
+                split_factor,
+                exact_addresses,
+                debug,
+            )
+            del pii_lines_exploded
+            del candidate_links
+            gc.collect()
+
     if debug:
         timestamp = datetime.now().strftime(TIMESTAMP_FMT)
         pairs_path = Path("temp-data") / f"households_pairs-{timestamp}.csv"
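The exact_addresses short-circuit skips the explode step entirely. For anyone unfamiliar with the expand idiom in the else branch, this is roughly how the extra columns get built (explode_address_stub is a hypothetical stand-in; the real explode_address is defined earlier in this file):

```python
import pandas as pd

pii_lines = pd.DataFrame({"household_street_address": ["205 GARDEN ST"]})

def explode_address_stub(row):
    # hypothetical stand-in: the real parser handles far messier input
    number, street, suffix = row["household_street_address"].split(" ", 2)
    return {"number": number, "street": street, "suffix": suffix}

addr_cols = pii_lines.apply(explode_address_stub, axis="columns", result_type="expand")
pii_lines_exploded = pd.concat([pii_lines, addr_cols], axis="columns")
print(pii_lines_exploded.columns.tolist())
# ['household_street_address', 'number', 'street', 'suffix']
```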
@@ -347,21 +387,25 @@ def get_household_matches(pii_lines, split_factor=4, debug=False, pairsfile=None
     return pos_to_pairs
 
 
-def get_candidate_links(pii_lines, split_factor=4, debug=False):
+def get_candidate_links(pii_lines, split_factor=4, exact_addresses=False, debug=False):
     # indexing step defines the pairs of records for comparison
     # indexer.full() does a full n^2 comparison, but we can do better
     indexer = recordlinkage.Index()
-    # use two block indexes to reduce the number of candidates
+    # use block indexes to reduce the number of candidates
     # while still retaining enough candidates to identify real households.
     # a block only on zip could work, but seems to run into memory issues
     # note sortedneighborhood on zip probably doesn't make sense
     # (zip codes in a geographic area will be too similar)
     # but if data is dirty then blocks may discard typos
 
-    indexer.block(["household_zip", "street", "number"])
-    indexer.block(["household_zip", "family_name"])
+    if exact_addresses:
+        indexer.block(["household_zip", "household_street_address"])
+    else:
+        indexer.block(["household_zip", "street", "number"])
+        indexer.block(["household_zip", "family_name"])
 
-    candidate_links = None
+    # start with an empty index we can append to
+    candidate_links = pd.MultiIndex.from_tuples([], names=[0, 1])
 
     # break up the dataframe into subframes,
     # and iterate over every pair of subframes.
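To make the exact_addresses branch concrete: a block index only pairs rows that agree exactly on every listed column. A toy run (data invented):

```python
import pandas as pd
import recordlinkage

df = pd.DataFrame({
    "household_zip": ["02139", "02139", "02139"],
    "household_street_address": ["205 GARDEN ST", "205 GARDEN ST", "1 MAIN ST"],
})

indexer = recordlinkage.Index()
indexer.block(["household_zip", "household_street_address"])
pairs = indexer.index(df)
print(len(pairs))   # 1: only the two exact duplicates form a candidate pair
print(list(pairs))  # a single (row, row) pair linking rows 0 and 1
```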
@@ -404,20 +448,26 @@ def get_candidate_links(pii_lines, split_factor=4, debug=False):
             pairs_subset = pairs_subset[pairs_subset[0] < pairs_subset[1]]
             pairs_subset = pd.MultiIndex.from_frame(pairs_subset)
 
-            if candidate_links is None:
-                candidate_links = pairs_subset
-            else:
-                candidate_links = candidate_links.append(pairs_subset)
+            candidate_links = candidate_links.append(pairs_subset)
 
             gc.collect()
 
+    # rows with blank address match ("" == "") so drop those here
+    # TODO: ideally we wouldn't compare blank address lines in the first place
+    # but the indexing and splitting bits get complicated if we drop them earlier
+    blank_addresses = pii_lines[pii_lines["household_street_address"] == ""].index
+    candidate_links = candidate_links.drop(blank_addresses, level=0, errors="ignore")
+    candidate_links = candidate_links.drop(blank_addresses, level=1, errors="ignore")
+
     if debug:
         print(f"[{datetime.now()}] Found {len(candidate_links)} candidate pairs")
 
     return candidate_links
 
 
-def get_matching_pairs(pii_lines, candidate_links, split_factor, debug):
+def get_matching_pairs(
+    pii_lines, candidate_links, split_factor, exact_addresses, debug
+):
     # Comparison step performs the defined comparison algorithms
     # against the candidate pairs
     compare_cl = recordlinkage.Compare()
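Dropping labels from one level of a MultiIndex with errors="ignore" is a bit obscure; here is the behavior on a toy index (values invented):

```python
import pandas as pd

links = pd.MultiIndex.from_tuples([(0, 1), (0, 2), (1, 2)], names=[0, 1])
blanks = pd.Index([0])

# removes every pair whose level-0 entry is a blank-address row;
# errors="ignore" keeps this from raising when a label is absent
links = links.drop(blanks, level=0, errors="ignore")
print(list(links))  # [(1, 2)]
```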
@@ -428,24 +478,35 @@ def get_matching_pairs(pii_lines, candidate_links, split_factor, debug):
     compare_cl.string(
         "phone_number", "phone_number", method="jarowinkler", label="phone_number"
     )
-    compare_cl.add(
-        AddressComparison(
-            "exploded_address",
-            "exploded_address",
+    if exact_addresses:
+        compare_cl.string(
+            "household_street_address",
+            "household_street_address",
+            method="jarowinkler",
             label="household_street_address",
         )
-    )
-    compare_cl.string(
-        "household_zip", "household_zip", method="levenshtein", label="household_zip"
-    )
+    else:
+        compare_cl.add(
+            AddressComparison(
+                "exploded_address",
+                "exploded_address",
+                label="household_street_address",
+            )
+        )
+
+    # NOTE: zip code is DISABLED because our indexes block on zip code
+    # compare_cl.string(
+    #     "household_zip", "household_zip", method="levenshtein", label="household_zip"
+    # )
     # note: hamming distance is not implemented in this library,
     # but levenshtein is. the two metrics are likely similar enough
     # that it's not worth implementing hamming again
 
     if debug:
         print(f"[{datetime.now()}] Starting detailed comparison of indexed pairs")
 
-    matching_pairs = None
+    # start with an empty index we can append to
+    matching_pairs = pd.MultiIndex.from_tuples([], names=[0, 1])
     # we know that we could support len(subset_A) in memory above,
     # so use the same amount here
     len_subset_A = int(len(pii_lines) / split_factor)
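For reviewers unfamiliar with the library: compute() produces one similarity column per label, indexed by the candidate pairs, which is what the weighting below operates on. A minimal sketch with a toy frame:

```python
import pandas as pd
import recordlinkage

df = pd.DataFrame({"phone_number": ["555-1234", "555-1235"]})
pairs = pd.MultiIndex.from_tuples([(0, 1)])

compare = recordlinkage.Compare()
compare.string("phone_number", "phone_number", method="jarowinkler", label="phone_number")
print(compare.compute(pairs, df))  # one row for pair (0, 1), score near 1.0
```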
@@ -470,18 +531,18 @@ def get_matching_pairs(pii_lines, candidate_links, split_factor, debug):
 
         features = compare_cl.compute(subset_links, relevant_pii_lines)
 
+        # first filter by address similarity
+        features = features[features["household_street_address"] > ADDR_THRESHOLD]
+
         features["family_name"] *= FN_WEIGHT
         features["phone_number"] *= PHONE_WEIGHT
         features["household_street_address"] *= ADDR_WEIGHT
-        features["household_zip"] *= ZIP_WEIGHT
+        # features["household_zip"] *= ZIP_WEIGHT
 
         # filter the matches down based on the cumulative score
         matches = features[features.sum(axis=1) > MATCH_THRESHOLD]
 
-        if matching_pairs is None:
-            matching_pairs = matches.index
-        else:
-            matching_pairs = matching_pairs.append(matches.index)
+        matching_pairs = matching_pairs.append(matches.index)
 
         # matching pairs are bi-directional and not duplicated,
         # ex if (1,9) is in the list then (9,1) won't be
 
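A worked example of the two-stage filter (similarity values invented): a pair whose address similarity is 0.94 is now discarded up front by ADDR_THRESHOLD, even though its weighted sum would have squeaked past MATCH_THRESHOLD:

```python
import pandas as pd

features = pd.DataFrame({
    "family_name": [1.0, 0.9],
    "phone_number": [0.9, 1.0],
    "household_street_address": [0.97, 0.94],
})

# stage 1: address similarity must clear ADDR_THRESHOLD (0.95); drops row 1
features = features[features["household_street_address"] > 0.95]

# stage 2: weighted sum must clear MATCH_THRESHOLD (0.85)
features["family_name"] *= 0.25
features["phone_number"] *= 0.2
features["household_street_address"] *= 0.55
print(features.sum(axis=1) > 0.85)  # row 0: 0.25 + 0.18 + 0.5335 = 0.9635 -> True
```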
@@ -492,9 +553,6 @@ def get_matching_pairs(pii_lines, candidate_links, split_factor, debug):
         del matches
         gc.collect()
 
-    # drop exploded address because it's not used past this point
-    pii_lines.drop(columns=["exploded_address"], inplace=True)
-
     if debug:
         print(f"[{datetime.now()}] Found {len(matching_pairs)} matching pairs")
 