Skip to content

Commit 6ec0195

Browse files
authored
Merge pull request #58 from mitre/further_household_fixes
Additional household fixes based on testing
2 parents f6d11fc + 4b9256a commit 6ec0195

File tree

3 files changed

+148
-64
lines changed

3 files changed

+148
-64
lines changed

households.py

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,19 @@ def parse_arguments():
7575
" Smaller numbers may result in out of memory errors. Larger numbers"
7676
" may increase runtime. Default is 4",
7777
)
78+
parser.add_argument(
79+
"--exact_addresses",
80+
action="store_true",
81+
help="Use exact matches on address as the definition of a household."
82+
" By default the inference process will split up addresses into"
83+
" street, number, suffix, etc, and considers phone # and family name"
84+
" when making a determination which records belong to which household."
85+
" Enabling this feature causes the process to use the entire address"
86+
" as a single string for comparisons, and only the address. "
87+
" If addresses have not been standardized/validated, this setting"
88+
" will likely increase false negatives (records not being included "
89+
" in households where they should be).",
90+
)
7891
parser.add_argument(
7992
"--pairsfile",
8093
help="Location of matching pairs file",
@@ -179,7 +192,7 @@ def write_pii_and_mapping_file(pos_pid_rows, hid_pat_id_rows, household_time, ar
179192
# so it can be traversed sort of like a graph from any given patient
180193
# note the key is patient position within the pii_lines dataframe
181194
pos_to_pairs = get_household_matches(
182-
pii_lines, args.split_factor, args.debug, args.pairsfile
195+
pii_lines, args.split_factor, args.debug, args.exact_addresses, args.pairsfile
183196
)
184197

185198
mapping_file = Path(args.mappingfile)
@@ -207,12 +220,13 @@ def write_pii_and_mapping_file(pos_pid_rows, hid_pat_id_rows, household_time, ar
207220
pii_lines["written_to_file"] = False
208221
hclk_position = 0
209222
lines_processed = 0
223+
hh_sizes = []
210224
five_percent = int(len(pii_lines) / 20)
211225
# Match households
212-
for position, line in pii_lines.sample(frac=1).iterrows():
226+
for position, _line in pii_lines.sample(frac=1).iterrows():
213227
# sample(frac=1) shuffles the entire dataframe
214228
# note that "position" is the index and still relative to the original
215-
229+
line = pii_lines.loc[position]
216230
lines_processed += 1
217231

218232
if args.debug and (lines_processed % five_percent) == 0:
@@ -223,20 +237,22 @@ def write_pii_and_mapping_file(pos_pid_rows, hid_pat_id_rows, household_time, ar
223237

224238
if line["written_to_file"]:
225239
continue
226-
line["written_to_file"] = True
227240

228241
if position in pos_to_pairs:
229242
pat_positions = bfs_traverse_matches(pos_to_pairs, position)
230243
# map those row numbers to PATIDs
231244
pat_ids = list(
232245
map(lambda p: pii_lines.at[p, "record_id"], pat_positions)
233246
)
234-
# mark all these rows as written to file
235-
pii_lines.loc[pat_positions, ["written_to_file"]] = True
236247
else:
237248
pat_positions = [position]
238249
pat_ids = [line[0]]
239250

251+
# mark all these rows as written to file
252+
pii_lines.loc[pat_positions, ["written_to_file"]] = True
253+
254+
hh_sizes.append(len(pat_positions))
255+
240256
string_pat_positions = [str(p) for p in pat_positions]
241257
pat_string = ",".join(string_pat_positions)
242258
mapping_writer.writerow([hclk_position, pat_string])
@@ -258,6 +274,12 @@ def write_pii_and_mapping_file(pos_pid_rows, hid_pat_id_rows, household_time, ar
258274
]
259275
hclk_position += 1
260276
pii_writer.writerow(output_row)
277+
278+
hh_sizes_series = pd.Series(hh_sizes, dtype=int)
279+
280+
print("Household size stats:")
281+
print(hh_sizes_series.describe())
282+
261283
return n_households
262284

263285

households/matching.py

Lines changed: 114 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,22 @@
1313
from definitions import TIMESTAMP_FMT
1414

1515
MATCH_THRESHOLD = 0.85
16-
FN_WEIGHT = 0.2
17-
PHONE_WEIGHT = 0.15
18-
ADDR_WEIGHT = 0.35
19-
ZIP_WEIGHT = 0.3
16+
FN_WEIGHT = 0.25
17+
PHONE_WEIGHT = 0.2
18+
ADDR_WEIGHT = 0.55
19+
# ZIP_WEIGHT = 0.25
20+
# zip is not used in weighting since all candidate pairs match on zip
21+
22+
# a separate address threshold so that pairs with medium-low scores across all fields
23+
# don't wind up getting matched anyway
24+
ADDR_THRESHOLD = 0.95
25+
# using address_distance() below:
26+
# "205 GARDEN ST" v "206 GARDEN ST" --> 0.8333
27+
# "205 GARDEN ST" v "205 GAREDN ST" --> 0.98444
28+
# "205 GARDEN STREET" v "205 GAREDN ST" --> 0.9666
29+
# "205 GARDEN ST APT 5F" v "205 GARDEN ST APT 5J" --> 0.9472
30+
# so 0.95 should give us a good balance of not linking all apartments together
31+
# while still allowing some room for typos and variation
2032

2133

2234
def addr_parse(addr):
@@ -53,6 +65,23 @@ def address_distance(addr1, addr2):
5365
score = 0
5466
secondary_score = 0
5567

68+
a1 = addr1["household_street_address"]
69+
a2 = addr2["household_street_address"]
70+
71+
if not a1 or not a2:
72+
# if either is blank they get a score of 0
73+
# this matches textdistance.jaro_winkler("", x)
74+
# but textdistance.jaro_winkler("", "") is normally 1
75+
# without this, 2 missing addresses could be a "perfect match"
76+
# which is not what we want
77+
return 0
78+
79+
if a1 == a2:
80+
# if the strings are exactly identical,
81+
# don't waste time with detailed comparisons
82+
# this matches textdistance.jaro_winkler(x, x)
83+
return 1
84+
5685
# Change weights based on existence of second level address
5786
if (
5887
not addr1["prefix"]
@@ -213,15 +242,10 @@ def address_distance(addr1, addr2):
213242

214243
# See if simple string compare of all things combined
215244
# with a 0.6 adjustment is better
216-
a1 = addr1["household_street_address"]
217-
a2 = addr2["household_street_address"]
218-
if a1 and a2:
219-
score = max(
220-
score,
221-
textdistance.jaro_winkler(a1, a2)
222-
* (weight_number + weight_street_name)
223-
* 0.6,
224-
) + (secondary_score * weight_secondary)
245+
score = max(
246+
score,
247+
textdistance.jaro_winkler(a1, a2) * (weight_number + weight_street_name) * 0.6,
248+
) + (secondary_score * weight_secondary)
225249
return score
226250

227251

@@ -271,7 +295,9 @@ def explode_address(row):
271295
return parsed
272296

273297

274-
def get_household_matches(pii_lines, split_factor=4, debug=False, pairsfile=None):
298+
def get_household_matches(
299+
pii_lines, split_factor=4, debug=False, exact_addresses=False, pairsfile=None
300+
):
275301
if pairsfile:
276302
if debug:
277303
print(f"[{datetime.now()}] Loading matching pairs file")
@@ -283,28 +309,42 @@ def get_household_matches(pii_lines, split_factor=4, debug=False, pairsfile=None
283309
print(f"[{datetime.now()}] Done loading matching pairs")
284310

285311
else:
286-
# break out the address into number, street, suffix, etc,
287-
# so we can prefilter matches based on those
288-
addr_cols = pii_lines.apply(
289-
explode_address,
290-
axis="columns",
291-
result_type="expand",
292-
)
293-
pii_lines_exploded = pd.concat([pii_lines, addr_cols], axis="columns")
312+
313+
if exact_addresses:
314+
pii_lines_exploded = pii_lines
315+
else:
316+
# break out the address into number, street, suffix, etc,
317+
# so we can prefilter matches based on those
318+
addr_cols = pii_lines.apply(
319+
explode_address,
320+
axis="columns",
321+
result_type="expand",
322+
)
323+
pii_lines_exploded = pd.concat([pii_lines, addr_cols], axis="columns")
294324

295325
if debug:
296326
print(f"[{datetime.now()}] Done pre-processing PII file")
297327

298-
candidate_links = get_candidate_links(pii_lines_exploded, split_factor, debug)
299-
gc.collect()
300-
301-
matching_pairs = get_matching_pairs(
302-
pii_lines_exploded, candidate_links, split_factor, debug
328+
candidate_links = get_candidate_links(
329+
pii_lines_exploded, split_factor, exact_addresses, debug
303330
)
304-
del candidate_links
305-
del pii_lines_exploded
306331
gc.collect()
307332

333+
if exact_addresses:
334+
# the candidate links are already all the pairs with matching [address, zip]
335+
matching_pairs = candidate_links
336+
else:
337+
matching_pairs = get_matching_pairs(
338+
pii_lines_exploded,
339+
candidate_links,
340+
split_factor,
341+
exact_addresses,
342+
debug,
343+
)
344+
del pii_lines_exploded
345+
del candidate_links
346+
gc.collect()
347+
308348
if debug:
309349
timestamp = datetime.now().strftime(TIMESTAMP_FMT)
310350
pairs_path = Path("temp-data") / f"households_pairs-{timestamp}.csv"
@@ -347,21 +387,25 @@ def get_household_matches(pii_lines, split_factor=4, debug=False, pairsfile=None
347387
return pos_to_pairs
348388

349389

350-
def get_candidate_links(pii_lines, split_factor=4, debug=False):
390+
def get_candidate_links(pii_lines, split_factor=4, exact_addresses=False, debug=False):
351391
# indexing step defines the pairs of records for comparison
352392
# indexer.full() does a full n^2 comparison, but we can do better
353393
indexer = recordlinkage.Index()
354-
# use two block indexes to reduce the number of candidates
394+
# use block indexes to reduce the number of candidates
355395
# while still retaining enough candidates to identify real households.
356396
# a block only on zip could work, but seems to run into memory issues
357397
# note sortedneighborhood on zip probably doesn't make sense
358398
# (zip codes in a geographic area will be too similar)
359399
# but if data is dirty then blocks may discard typos
360400

361-
indexer.block(["household_zip", "street", "number"])
362-
indexer.block(["household_zip", "family_name"])
401+
if exact_addresses:
402+
indexer.block(["household_zip", "household_street_address"])
403+
else:
404+
indexer.block(["household_zip", "street", "number"])
405+
indexer.block(["household_zip", "family_name"])
363406

364-
candidate_links = None
407+
# start with an empty index we can append to
408+
candidate_links = pd.MultiIndex.from_tuples([], names=[0, 1])
365409

366410
# break up the dataframe into subframes,
367411
# and iterate over every pair of subframes.
@@ -404,20 +448,26 @@ def get_candidate_links(pii_lines, split_factor=4, debug=False):
404448
pairs_subset = pairs_subset[pairs_subset[0] < pairs_subset[1]]
405449
pairs_subset = pd.MultiIndex.from_frame(pairs_subset)
406450

407-
if candidate_links is None:
408-
candidate_links = pairs_subset
409-
else:
410-
candidate_links = candidate_links.append(pairs_subset)
451+
candidate_links = candidate_links.append(pairs_subset)
411452

412453
gc.collect()
413454

455+
# rows with blank address match ("" == "") so drop those here
456+
# TODO: ideally we wouldn't compare blank address lines in the first place
457+
# but the indexing and splitting bits get complicated if we drop them earlier
458+
blank_addresses = pii_lines[pii_lines["household_street_address"] == ""].index
459+
candidate_links = candidate_links.drop(blank_addresses, level=0, errors="ignore")
460+
candidate_links = candidate_links.drop(blank_addresses, level=1, errors="ignore")
461+
414462
if debug:
415463
print(f"[{datetime.now()}] Found {len(candidate_links)} candidate pairs")
416464

417465
return candidate_links
418466

419467

420-
def get_matching_pairs(pii_lines, candidate_links, split_factor, debug):
468+
def get_matching_pairs(
469+
pii_lines, candidate_links, split_factor, exact_addresses, debug
470+
):
421471
# Comparison step performs the defined comparison algorithms
422472
# against the candidate pairs
423473
compare_cl = recordlinkage.Compare()
@@ -428,24 +478,35 @@ def get_matching_pairs(pii_lines, candidate_links, split_factor, debug):
428478
compare_cl.string(
429479
"phone_number", "phone_number", method="jarowinkler", label="phone_number"
430480
)
431-
compare_cl.add(
432-
AddressComparison(
433-
"exploded_address",
434-
"exploded_address",
481+
if exact_addresses:
482+
compare_cl.string(
483+
"household_street_address",
484+
"household_street_address",
485+
method="jarowinkler",
435486
label="household_street_address",
436487
)
437-
)
438-
compare_cl.string(
439-
"household_zip", "household_zip", method="levenshtein", label="household_zip"
440-
)
488+
else:
489+
compare_cl.add(
490+
AddressComparison(
491+
"exploded_address",
492+
"exploded_address",
493+
label="household_street_address",
494+
)
495+
)
496+
497+
# NOTE: zip code is DISABLED because our indexes block on zip code
498+
# compare_cl.string(
499+
# "household_zip", "household_zip", method="levenshtein", label="household_zip"
500+
# )
441501
# note: hamming distance is not implemented in this library,
442502
# but levenshtein is. the two metrics are likely similar enough
443503
# that it's not worth implementing hamming again
444504

445505
if debug:
446506
print(f"[{datetime.now()}] Starting detailed comparison of indexed pairs")
447507

448-
matching_pairs = None
508+
# start with an empty index we can append to
509+
matching_pairs = pd.MultiIndex.from_tuples([], names=[0, 1])
449510
# we know that we could support len(subset_A) in memory above,
450511
# so use the same amount here
451512
len_subset_A = int(len(pii_lines) / split_factor)
@@ -470,18 +531,18 @@ def get_matching_pairs(pii_lines, candidate_links, split_factor, debug):
470531

471532
features = compare_cl.compute(subset_links, relevant_pii_lines)
472533

534+
# first filter by address similarity
535+
features = features[features["household_street_address"] > ADDR_THRESHOLD]
536+
473537
features["family_name"] *= FN_WEIGHT
474538
features["phone_number"] *= PHONE_WEIGHT
475539
features["household_street_address"] *= ADDR_WEIGHT
476-
features["household_zip"] *= ZIP_WEIGHT
540+
# features["household_zip"] *= ZIP_WEIGHT
477541

478542
# filter the matches down based on the cumulative score
479543
matches = features[features.sum(axis=1) > MATCH_THRESHOLD]
480544

481-
if matching_pairs is None:
482-
matching_pairs = matches.index
483-
else:
484-
matching_pairs = matching_pairs.append(matches.index)
545+
matching_pairs = matching_pairs.append(matches.index)
485546
# matching pairs are bi-directional and not duplicated,
486547
# ex if (1,9) is in the list then (9,1) won't be
487548

@@ -492,9 +553,6 @@ def get_matching_pairs(pii_lines, candidate_links, split_factor, debug):
492553
del matches
493554
gc.collect()
494555

495-
# drop exploded address because it's not used past this point
496-
pii_lines.drop(columns=["exploded_address"], inplace=True)
497-
498556
if debug:
499557
print(f"[{datetime.now()}] Found {len(matching_pairs)} matching pairs")
500558

0 commit comments

Comments
 (0)